# SIM Metadata Parsing

In [1]:
import numpy as np
import pandas as pd



## Load Master Document
* url_main = "https://docs.google.com/spreadsheets/d/1m_QqU3iQqLD7W8VDzxER8WAEqKY86fJWyOMQ8akc7jM/edit#gid=41925053"

In [2]:
sheet_main_id = "1m_QqU3iQqLD7W8VDzxER8WAEqKY86fJWyOMQ8akc7jM"
sheet_main_name = "ALL"
url_main = f"https://docs.google.com/spreadsheets/d/{sheet_main_id}/gviz/tq?tqx=out:csv&sheet={sheet_main_name}"

In [3]:
sim_all = pd.read_csv(url_main)
sim_all.head()

Unnamed: 0,SIM PubID,Title,35mm,Years From,Years To,105mm,Years From.1,Years To.1,16mm,Years From.2,...,Pub Language,Subjects,PubCollectionID,PubIssueID,Notes,PubCollection Length,Active,Fulltext in DB,Permissioned,oclc
0,1,American Journal of Pharmacy and the Sciences ...,X,1952.0,1995.0,,,,X,1960.0,...,English,Health & Medical Sciences,pub_american-journal-of-pharmacy-and-the-sciences,sim_american-journal-of-pharmacy-and-the-sciences,,49.0,False,,,206933216;500028152;817882935;875845133;107037...
1,2,National Real Estate and Building Journal,X,1949.0,1956.0,,,,,1962.0,...,English,Building & Construction,pub_national-real-estate-and-building-journal,sim_national-real-estate-and-building-journal,,45.0,True,1989.0,,
2,3,The American Naturalist,X,1872.0,2015.0,,,,X,1970.0,...,English,Biology,pub_american-naturalist,sim_american-naturalist,,23.0,True,1867.0,,18300891;191716864;817602316;898814689;9097196...
3,4,Alcatel Telecommunications Review,X,1922.0,2002.0,,,,X,1970.0,...,English,Communication & Information Sciences,pub_alcatel-telecommunications-review,sim_alcatel-telecommunications-review,,37.0,True,,,192886804;194581676;210222394;210222469;312289...
4,5,The American Journal of Gastroenterology,X,1949.0,2011.0,,,,X,1966.0,...,English,Medical Sciences--Gastroenterology,pub_american-journal-of-gastroenterology,sim_american-journal-of-gastroenterology,,40.0,True,1900.0,,121060592;813634707;909921900;936514672;989314...


In [4]:
sim_all.shape

(15181, 35)

In [5]:
sim_all.columns

Index(['SIM PubID', 'Title', '35mm', 'Years From', 'Years To', '105mm',
       'Years From.1', 'Years To.1', '16mm', 'Years From.2', 'Years To.2',
       'Digital Rights or Public Domain', 'Publisher', 'Country', 'ISSN',
       'Impact Rank', 'Total Cities', 'Journal Impact Factor',
       'Eigenfact or Score', 'First Volume', 'Last Volume', 'NA Gaps',
       'Scholarly / Peer-\nReviewed', 'Peer-\nReviewed', 'Pub Type',
       'Pub Language', 'Subjects', 'PubCollectionID', 'PubIssueID', 'Notes',
       'PubCollection Length', 'Active', 'Fulltext in DB', 'Permissioned',
       'oclc'],
      dtype='object')

In [6]:
sim_all['Pub Language'].value_counts()

English                                  14778
German                                     108
French                                      69
Chinese                                     56
Spanish                                     43
Italian                                     36
English|French                              19
English|French|German                       10
English|German                               9
Dutch                                        6
Japanese|English                             5
Russian                                      4
English|Spanish; Castilian                   4
Portuguese                                   4
Arabic                                       3
English|Arabic                               3
Polish                                       2
German|English                               2
English|Portuguese|Spanish; Castilian        2
Japanese                                     2
English|Russian                              1
Dutch; Flemis

## Load additional Metadata
* url at https://docs.google.com/spreadsheets/d/1mCoDje_mOnQGWhVBEx9YKd_RdJuQAtRY59T0Co-MiK0/edit#gid=0

In [7]:
sheet_ulrich_id = "1mCoDje_mOnQGWhVBEx9YKd_RdJuQAtRY59T0Co-MiK0"
sheet_ulrich_name = "Publications"
url_ulrich = f"https://docs.google.com/spreadsheets/d/{sheet_ulrich_id}/gviz/tq?tqx=out:csv&sheet={sheet_ulrich_name}"

In [8]:
# sim_ulrich = pd.read_csv(url_ulrich)
# sim_ulrich.head()

In [9]:
# sim_ulrich.shape

In [10]:
# sim_ulrich.columns

## Get Most Relevant Info DataFrame

In [11]:
sim_info = sim_all[["Title", "First Volume", "Last Volume", "PubIssueID", "NA Gaps"]]
sim_info.head()

Unnamed: 0,Title,First Volume,Last Volume,PubIssueID,NA Gaps
0,American Journal of Pharmacy and the Sciences ...,1952.0,1995.0,sim_american-journal-of-pharmacy-and-the-sciences,
1,National Real Estate and Building Journal,1949.0,1956.0,sim_national-real-estate-and-building-journal,
2,The American Naturalist,1872.0,2015.0,sim_american-naturalist,
3,Alcatel Telecommunications Review,1922.0,2002.0,sim_alcatel-telecommunications-review,
4,The American Journal of Gastroenterology,1949.0,2011.0,sim_american-journal-of-gastroenterology,1982; 1983; 1984; 1985; 1986; 1987; 1988; 1989...


#### Data Cleaning for SIM

In [12]:
# For example, there are overlapping ids
sim_info[sim_info["PubIssueID"] == "sim_science"]

Unnamed: 0,Title,First Volume,Last Volume,PubIssueID,NA Gaps
1013,Science,1883.0,2016.0,sim_science,
9907,Science,1979.0,1986.0,sim_science,


In [13]:
# Concretely show these overlaps
print("SIM Id/title count: " + str(sim_info["PubIssueID"].shape[0]))
print("Unique SIM Id count: " + str(len(sim_info["PubIssueID"].unique())))
print("Unique title count: " + str(len(sim_info["Title"].unique())))

SIM Id/title count: 15181
Unique SIM Id count: 14972
Unique title count: 14855


In [14]:
# Only keep necessary columns
sim_info_concise = sim_info
sim_info_concise.head()

Unnamed: 0,Title,First Volume,Last Volume,PubIssueID,NA Gaps
0,American Journal of Pharmacy and the Sciences ...,1952.0,1995.0,sim_american-journal-of-pharmacy-and-the-sciences,
1,National Real Estate and Building Journal,1949.0,1956.0,sim_national-real-estate-and-building-journal,
2,The American Naturalist,1872.0,2015.0,sim_american-naturalist,
3,Alcatel Telecommunications Review,1922.0,2002.0,sim_alcatel-telecommunications-review,
4,The American Journal of Gastroenterology,1949.0,2011.0,sim_american-journal-of-gastroenterology,1982; 1983; 1984; 1985; 1986; 1987; 1988; 1989...


In [15]:
sim_info_concise['NA Gaps'] = sim_info_concise['NA Gaps'].fillna("")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
def find_mode(x):
    return x.value_counts().index[0]

In [17]:
# Aggregate Pub issue IDs such that the maximum range is included
sim_info_concise = sim_info_concise.groupby(["PubIssueID"]).agg({'Title': find_mode, 'NA Gaps': find_mode, 
                                                                 'First Volume': 'min', 'Last Volume': 'max'})
sim_info_concise.head()

Unnamed: 0_level_0,Title,NA Gaps,First Volume,Last Volume
PubIssueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0
sim_-,The - -,,1826.0,1826.0
sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0
sim_102-monitor,102 Monitor,,1975.0,1981.0
sim_20th-century-british-history,20th Century British History,,1990.0,1994.0


In [18]:
sim_info_concise = sim_info_concise.reset_index()
sim_info_concise.head()

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0
1,sim_-,The - -,,1826.0,1826.0
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0
3,sim_102-monitor,102 Monitor,,1975.0,1981.0
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0


### Sometimes "the" gets removed

In [19]:
sim_info_concise["sim id start with the"] = sim_info_concise["PubIssueID"].str[4:8] == "the-"
sim_info_concise[sim_info_concise["sim id start with the"]].shape

(54, 6)

In [20]:
sim_info_concise["title start with the"] = sim_info_concise["Title"].str[0:4] == "The "
sim_info_concise[sim_info_concise["title start with the"]].shape

(1112, 7)

In [21]:
sim_info_concise[(sim_info_concise["title start with the"] == True) & (sim_info_concise["sim id start with the"] == False)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the
1,sim_-,The - -,,1826.0,1826.0,False,True
36,sim_abraham-lincoln-quarterly,The Abraham Lincoln Quarterly,1951;,1948.0,1952.0,False,True
60,sim_academy-and-literature,The Academy and Literature,,1869.0,1916.0,False,True
64,sim_academy-of-management-perspectives,The Academy of Management Perspectives,,1987.0,2012.0,False,True
65,sim_academy-of-management-review,The Academy of Management Review,,1976.0,2015.0,False,True
...,...,...,...,...,...,...,...
14901,sim_yale-review,The Yale Review,1990; 1991;,1892.0,2015.0,False,True
14905,sim_yankee-and-boston-literary-gazette,The Yankee and Boston Literary Gazette,,1828.0,1829.0,False,True
14910,sim_yearbook-of-agriculture,The Yearbook of Agriculture,,1969.0,1982.0,False,True
14911,sim_yearbook-of-education-law,The Yearbook of Education Law,1972; 1973; 1974; 1975; 1976; 1977; 1978; 1979...,1971.0,2015.0,False,True


In [22]:
sim_info_concise[(sim_info_concise["title start with the"] == True) & (sim_info_concise["sim id start with the"] == True)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the
13788,sim_the-advocate-for-the-testimony-of-god,"The Advocate for the Testimony of God, as It I...",,1835.0,1839.0,True,True
13789,sim_the-american-medical-review-and-journal,The American Medical Review and Journal of Ori...,,1824.0,1826.0,True,True
13790,sim_the-american-review-of-history-and-politics,"The American Review of History and Politics, a...",,1811.0,1812.0,True,True
13791,sim_the-antijacobin-review-and-protestant-advo...,The Antijacobin Review and Protestant Advocate...,,1798.0,1821.0,True,True
13792,sim_the-architect,The Architect,,1971.0,1978.0,True,True
13793,sim_the-bibliotheca-sacra,The Bibliotheca Sacra,,1844.0,2014.0,True,True
13794,sim_the-bookmark,The Bookmark,,1940.0,1992.0,True,True
13795,sim_the-bottom-line,The Bottom Line,,1993.0,1996.0,True,True
13796,sim_the-cambrian-and-caledonian-quarterly-maga...,The Cambrian and Caledonian Quarterly Magazine...,,1829.0,1833.0,True,True
13797,sim_the-canadian-journal-of-research-in-semiotics,The Canadian Journal of Research in Semiotics....,,1973.0,1979.0,True,True


In [23]:
sim_info_concise[(sim_info_concise["title start with the"] == True) & (sim_info_concise["sim id start with the"] == True)].shape


(53, 7)

### Sometimes "title" contains brackets

In [26]:
sim_info_concise[sim_info_concise["PubIssueID"] == "sim_the-yale-journal-of-biology-and-medicine"].loc[13841, "Title"]

'The Yale Journal of Biology and Medicine. [Filmed from the Online Edition]'

In [27]:
def contains_front_bracket(journal):
    if "[" in journal:
        return True
    return False
    
def contains_back_bracket(journal):
    if "]" in journal:
        return True
    return False
    

In [28]:
sim_info_concise["has '['"] = sim_info_concise["Title"].apply(contains_front_bracket)
sim_info_concise["has ']'"] = sim_info_concise["Title"].apply(contains_back_bracket)
print(sim_info_concise[sim_info_concise["has '['"]].shape)
print(sim_info_concise[sim_info_concise["has ']'"]].shape)

(16, 9)
(16, 9)


In [29]:
sim_info_concise[sim_info_concise["has '['"]]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']'
171,sim_adelphi-series,Adelphi [Series],1992; 1999;,1963.0,2011.0,False,False,True,True
2236,sim_bulletin-of-the-atomic-scientists,Bulletin of the Atomic Scientists. [Filmed Fro...,,1945.0,2015.0,False,False,True,True
2507,sim_canadian-journal-of-veterinary-research,Canadian Journal of Veterinary Research. [Film...,,1984.0,2015.0,False,False,True,True
3210,sim_college-research-libraries,College & Research Libraries. [Filmed From the...,,1939.0,2016.0,False,False,True,True
3714,sim_crab,The Crab. [Filmed From the Online Edition],,1971.0,2014.0,False,True,True,True
4665,sim_engineering-design-graphics-journal,Engineering Design Graphics Journal. [Filmed F...,2012;,1974.0,2013.0,False,False,True,True
5571,sim_georgia-library-quarterly,Georgia Library Quarterly. [Filmed From the On...,,1964.0,2007.0,False,False,True,True
5693,sim_great-britain-public-record-1625-1649-dome...,Great Britain. Public Record Office. Calendar ...,,1625.0,1649.0,False,False,True,True
7336,sim_journal-of-biological-chemistry,Journal of Biological Chemistry. [Filmed From ...,,1905.0,2014.0,False,False,True,True
7987,sim_journal-of-medical-technology,Journal of Medical Technology : Official Publi...,,1984.0,1987.0,False,False,True,True


In [30]:
def contains_front_brace(journal):
    if "(" in journal:
        return True
    return False
    
def contains_back_brace(journal):
    if ")" in journal:
        return True
    return False

In [31]:
sim_info_concise["has '('"] = sim_info_concise["Title"].apply(contains_front_brace)
sim_info_concise["has ')'"] = sim_info_concise["Title"].apply(contains_back_brace)
print(sim_info_concise[sim_info_concise["has '('"]].shape)
print(sim_info_concise[sim_info_concise["has ')'"]].shape)

(60, 11)
(60, 11)


In [32]:
sim_info_concise[sim_info_concise["has '('"]].head(10)

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')'
51,sim_abtics-abstract-and-book-title-index-card-...,ABTICS (Abstract and Book Title Index Card Ser...,,1960.0,1971.0,False,False,False,False,True,True
126,sim_acta-meteorologica-sinica-chi-hsiang-hsueh...,Acta Meteorologica Sinica (Chi Hsiang Hsueh Pao),,1989.0,1993.0,False,False,False,False,True,True
131,sim_acta-oceanologica-sinica-hai-yang-hsueh-pao,Acta Oceanologica Sinica (Hai Yang Hsueh Pao),,1985.0,1993.0,False,False,False,False,True,True
223,sim_advances-in-tunnelling-technology-and-subs...,Advances in Tunnelling Technology and Subsurfa...,,1981.0,1984.0,False,False,False,False,True,True
637,sim_american-institute-of-mining-and-metallurg...,American Institute of Mining and Metallurgical...,1944;,1940.0,1947.0,False,False,False,False,True,True
770,sim_american-magazine-containing-a-miscellaneo...,"American Magazine, Containing a Miscellaneous ...",,1787.0,1788.0,False,False,False,False,True,True
1290,sim_archives-of-toxicology-archiv-fuer-toxikol...,Archives of Toxicology (Archiv Für Toxikologie),1995; 1998;,1930.0,2007.0,False,False,False,False,True,True
1405,sim_arzneimittel-forschung-drug-research,Arzneimittel Forschung (Drug Research),2001; 2002; 2003; 2004; 2005; 2006; 2007;,2000.0,2012.0,False,False,False,False,True,True
1495,sim_atb-metallurgie-acta-technica-belgica,ATB Metallurgie (Acta Technica Belgica),1974; 1975; 1979; 1980; 1981; 1982; 1983; 1984...,1973.0,1990.0,False,False,False,False,True,True
1531,sim_atr-australian-telecommunication-research,ATR (Australian Telecommunication Research),,1977.0,1977.0,False,False,False,False,True,True


In [83]:
import re
def generate_sim_id(journal):
    journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
    sim_id = ""
    if journal != "":
        sim_id = journal.lower()
        sim_id_lst = sim_id.split()
        sim_id = "-".join(sim_id_lst)
        sim_id = "sim_" + sim_id
    return sim_id

In [34]:
sim_info_concise["basic sim id"] = sim_info_concise["Title"].apply(generate_sim_id)
sim_info_concise.head()

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0,False,False,False,False,False,False,sim_anatomia-clinica
1,sim_-,The - -,,1826.0,1826.0,False,True,False,False,False,False,sim_the----
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0,False,False,False,False,False,False,sim_1001-home-ideas
3,sim_102-monitor,102 Monitor,,1975.0,1981.0,False,False,False,False,False,False,sim_102-monitor
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0,False,False,False,False,False,False,sim_20th-century-british-history


In [35]:
sim_id_non_basic = sim_info_concise[sim_info_concise["PubIssueID"] != sim_info_concise["basic sim id"]]

In [36]:
sim_id_non_basic

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0,False,False,False,False,False,False,sim_anatomia-clinica
1,sim_-,The - -,,1826.0,1826.0,False,True,False,False,False,False,sim_the----
9,sim_a-m-a-archives-of-industrial-health,AMA Archives of Industrial Health,,1950.0,1960.0,False,False,False,False,False,False,sim_ama-archives-of-industrial-health
10,sim_a-plus,A+,,1983.0,1989.0,False,False,False,False,False,False,sim_a
12,sim_a-rivista-anarchica,A/Rivista Anarchica,,1971.0,1973.0,False,False,False,False,False,False,sim_arivista-anarchica
...,...,...,...,...,...,...,...,...,...,...,...,...
14939,sim_zeitschrift-fuer-angewandte-mathematik-und...,Zeitschrift Für Angewandte Mathematik und Physik,,1990.0,1996.0,False,False,False,False,False,False,sim_zeitschrift-fr-angewandte-mathematik-und-p...
14943,sim_zeitschrift-fuer-lebensmitte-untersuchung-...,Zeitschrift Fuer Lebensmittel-Untersuchung und...,,1981.0,1983.0,False,False,False,False,False,False,sim_zeitschrift-fuer-lebensmittel-untersuchung...
14951,sim_zeitschrift-fuer-physikalische-chemie-abt-a,"Zeitschrift Fuer Physikalische Chemie, Abteilu...",,1887.0,1943.0,False,False,False,False,False,False,sim_zeitschrift-fuer-physikalische-chemie-abte...
14965,sim_zodiac-a-monthly-periodical-devoted-to-sci...,"Zodiac, a Monthly Periodical, Devoted to Scien...",,1835.0,1837.0,False,False,False,False,False,False,sim_zodiac-a-monthly-periodical-devoted-to-sci...


In [37]:
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14939, 'Title'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14939, 'PubIssueID'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14939, 'basic sim id'])

Zeitschrift Für Angewandte Mathematik und Physik
sim_zeitschrift-fuer-angewandte-mathematik-und-physik
sim_zeitschrift-fr-angewandte-mathematik-und-physik


In [38]:
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14943, 'Title'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14943, 'PubIssueID'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14943, 'basic sim id'])

Zeitschrift Fuer Lebensmittel-Untersuchung und -Forschung
sim_zeitschrift-fuer-lebensmitte-untersuchung-und-forschung
sim_zeitschrift-fuer-lebensmittel-untersuchung-und--forschung


In [39]:
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14951, 'Title'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14951, 'PubIssueID'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14951, 'basic sim id'])

Zeitschrift Fuer Physikalische Chemie, Abteilung A
sim_zeitschrift-fuer-physikalische-chemie-abt-a
sim_zeitschrift-fuer-physikalische-chemie-abteilung-a


In [40]:
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14965, 'Title'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14965, 'PubIssueID'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14965, 'basic sim id'])

Zodiac, a Monthly Periodical, Devoted to Science, Literature and the Arts
sim_zodiac-a-monthly-periodical-devoted-to-science
sim_zodiac-a-monthly-periodical-devoted-to-science-literature-and-the-arts


In [42]:
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14966, 'Title'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14966, 'PubIssueID'])
print(sim_id_non_basic[['Title','PubIssueID', 'basic sim id']].loc[14966, 'basic sim id'])

Zoist; A Journal of Cerebral Physiology and Mesmerism and Their Applications to Human Welfare
sim_zoist-a-journal-of-cerebral-physiology-and-mesmerism
sim_zoist-a-journal-of-cerebral-physiology-and-mesmerism-and-their-applications-to-human-welfare


### All Sorts of Non Alphanumeric Characters

#### "+" turn into "plus", but only 1 case so uhhhh

In [43]:
temp_df_plus_sign = sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "+" in x)]
temp_df_plus_sign[["PubIssueID", "Title", "basic sim id"]]

Unnamed: 0,PubIssueID,Title,basic sim id
10,sim_a-plus,A+,sim_a


#### "=" sometimes is directly removed, other times what comes after is ignored

In [84]:
temp_df_equal_sign = sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "=" in x)]
temp_df_equal_sign[["PubIssueID", "Title", "basic sim id"]]

Unnamed: 0,PubIssueID,Title,basic sim id
2427,sim_canadian-appraiser-levaluateur-canadien,The Canadian Appraiser = l'Evaluateur Canadien,sim_the-canadian-appraiser-levaluateur-canadien
2749,sim_chemical-engineering-and-processing,Chemical Engineering and Processing = Genie De...,sim_chemical-engineering-and-processing-genie-...
4880,sim_european-journal-of-population,European Journal of Population = Revue Europee...,sim_european-journal-of-population-revue-europ...
6837,sim_international-journal-of-refrigeration,International Journal of Refrigeration = Revue...,sim_international-journal-of-refrigeration-rev...
8116,sim_journal-of-otolaryngology-head-neck-surgery,Journal of Otolaryngology - Head & Neck Surger...,sim_journal-of-otolaryngology---head-neck-surg...
12038,sim_qtly-bulletin-intl-assoc-agricultural-info...,Quarterly Bulletin of the International Associ...,sim_quarterly-bulletin-of-the-international-as...
12472,sim_revista-interamericana-de-bibliografia,Revista Interamericana de Bibliografia = Inter...,sim_revista-interamericana-de-bibliografia-int...
12567,sim_romanian-journal-of-internal-medicine,Romanian Journal of Internal Medicine = Revue ...,sim_romanian-journal-of-internal-medicine-revu...
13203,sim_south-african-journal-of-library-and-infor...,South African Journal of Library and Informati...,sim_south-african-journal-of-library-and-infor...


In [85]:
def equal_sign_remove(title):
    if "=" in title:
        title = re.sub("=", "", title)
        return generate_sim_id(title)
    return ""

def equal_sign_ignore_after(title):
    if "=" in title:
        title_lst = title.split("=")
        return generate_sim_id(title_lst[0])
    return ""

In [86]:
temp_df_equal_sign["removed"] = temp_df_equal_sign["Title"].apply(equal_sign_remove)
temp_df_equal_sign["ignore_after"] = temp_df_equal_sign["Title"].apply(equal_sign_ignore_after)
temp_df_equal_sign

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id,removed,ignore_after
2427,sim_canadian-appraiser-levaluateur-canadien,The Canadian Appraiser = l'Evaluateur Canadien,2002; 2004;,1958.0,2005.0,False,True,False,False,False,False,sim_the-canadian-appraiser-levaluateur-canadien,sim_the-canadian-appraiser-levaluateur-canadien,sim_the-canadian-appraiser
2749,sim_chemical-engineering-and-processing,Chemical Engineering and Processing = Genie De...,,1988.0,1995.0,False,False,False,False,False,False,sim_chemical-engineering-and-processing-genie-...,sim_chemical-engineering-and-processing-genie-...,sim_chemical-engineering-and-processing
4880,sim_european-journal-of-population,European Journal of Population = Revue Europee...,,1990.0,1992.0,False,False,False,False,False,False,sim_european-journal-of-population-revue-europ...,sim_european-journal-of-population-revue-europ...,sim_european-journal-of-population
6837,sim_international-journal-of-refrigeration,International Journal of Refrigeration = Revue...,,1980.0,1996.0,False,False,False,False,False,False,sim_international-journal-of-refrigeration-rev...,sim_international-journal-of-refrigeration-rev...,sim_international-journal-of-refrigeration
8116,sim_journal-of-otolaryngology-head-neck-surgery,Journal of Otolaryngology - Head & Neck Surger...,,1972.0,2011.0,False,False,False,False,False,False,sim_journal-of-otolaryngology---head-neck-surg...,sim_journal-of-otolaryngology-head-neck-surger...,sim_journal-of-otolaryngology-head-neck-surgery
12038,sim_qtly-bulletin-intl-assoc-agricultural-info...,Quarterly Bulletin of the International Associ...,,1956.0,1996.0,False,False,False,False,False,False,sim_quarterly-bulletin-of-the-international-as...,sim_quarterly-bulletin-of-the-international-as...,sim_quarterly-bulletin-of-the-international-as...
12472,sim_revista-interamericana-de-bibliografia,Revista Interamericana de Bibliografia = Inter...,,1951.0,1999.0,False,False,False,False,False,False,sim_revista-interamericana-de-bibliografia-int...,sim_revista-interamericana-de-bibliografia-int...,sim_revista-interamericana-de-bibliografia
12567,sim_romanian-journal-of-internal-medicine,Romanian Journal of Internal Medicine = Revue ...,1974;,1973.0,1980.0,False,False,False,False,False,False,sim_romanian-journal-of-internal-medicine-revu...,sim_romanian-journal-of-internal-medicine-revu...,sim_romanian-journal-of-internal-medicine
13203,sim_south-african-journal-of-library-and-infor...,South African Journal of Library and Informati...,,1933.0,1993.0,False,False,False,False,False,False,sim_south-african-journal-of-library-and-infor...,sim_south-african-journal-of-library-and-infor...,sim_south-african-journal-of-library-and-infor...


In [88]:
temp_df_equal_sign_remove = temp_df_equal_sign[temp_df_equal_sign["removed"] == temp_df_equal_sign["PubIssueID"]]
temp_df_equal_sign_remove[["PubIssueID", "Title", "removed"]]

Unnamed: 0,PubIssueID,Title,removed


In [89]:
temp_df_equal_sign_ignore = temp_df_equal_sign[temp_df_equal_sign["ignore_after"] == temp_df_equal_sign["PubIssueID"]]
temp_df_equal_sign_ignore[["PubIssueID", "Title", "basic sim id"]]

Unnamed: 0,PubIssueID,Title,basic sim id
2749,sim_chemical-engineering-and-processing,Chemical Engineering and Processing = Genie De...,sim_chemical-engineering-and-processing-genie-...
4880,sim_european-journal-of-population,European Journal of Population = Revue Europee...,sim_european-journal-of-population-revue-europ...
6837,sim_international-journal-of-refrigeration,International Journal of Refrigeration = Revue...,sim_international-journal-of-refrigeration-rev...
8116,sim_journal-of-otolaryngology-head-neck-surgery,Journal of Otolaryngology - Head & Neck Surger...,sim_journal-of-otolaryngology---head-neck-surg...
12472,sim_revista-interamericana-de-bibliografia,Revista Interamericana de Bibliografia = Inter...,sim_revista-interamericana-de-bibliografia-int...
12567,sim_romanian-journal-of-internal-medicine,Romanian Journal of Internal Medicine = Revue ...,sim_romanian-journal-of-internal-medicine-revu...
13203,sim_south-african-journal-of-library-and-infor...,South African Journal of Library and Informati...,sim_south-african-journal-of-library-and-infor...


### "/" turn into "-" in most cases

In [92]:
temp_df_slash_sign = sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "/" in x)]
print(temp_df_slash_sign.shape[0])
temp_df_slash_sign[["PubIssueID", "Title", "basic sim id"]]

29


Unnamed: 0,PubIssueID,Title,basic sim id
12,sim_a-rivista-anarchica,A/Rivista Anarchica,sim_arivista-anarchica
389,sim_air-water-pollution-report,Air/Water Pollution Report,sim_airwater-pollution-report
1330,sim_arlis-na-newsletter,ARLIS/NA Newsletter,sim_arlisna-newsletter
1958,sim_black-male-female-relationships,Black Male/Female Relationships,sim_black-malefemale-relationships
3008,sim_cim-technology-casa-smes-magazine,CIM Technology: CASA/SME's Magazine of Compute...,sim_cim-technology-casasmes-magazine-of-comput...
3577,sim_contemporary-ob-gyn,Contemporary OB/GYN,sim_contemporary-obgyn
4495,sim_efta-news-efta-bulletin,EFTA News/EFTA Bulletin,sim_efta-newsefta-bulletin
4795,sim_esa-engage-social-action,ESA: Engage/Social Action,sim_esa-engagesocial-action
5084,sim_federal-register-find,The Federal Register / FIND,sim_the-federal-register-find
5852,sim_health-pac-bulletin,Health/PAC Bulletin,sim_healthpac-bulletin


In [93]:
def slash_sign_to_hyphen(title):
    if "/" in title:
        title = re.sub("/", " ", title)
        return generate_sim_id(title)
    return title

In [98]:
temp_df_slash_sign["slash"] = temp_df_slash_sign["Title"].apply(slash_sign_to_hyphen)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [100]:
temp_df_slash_sign_good = temp_df_slash_sign[temp_df_slash_sign["slash"] == temp_df_slash_sign["PubIssueID"]]
print(temp_df_slash_sign_good.shape[0])
temp_df_slash_sign_good.head()

23


Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id,slash
12,sim_a-rivista-anarchica,A/Rivista Anarchica,,1971.0,1973.0,False,False,False,False,False,False,sim_arivista-anarchica,sim_a-rivista-anarchica
389,sim_air-water-pollution-report,Air/Water Pollution Report,,1963.0,1995.0,False,False,False,False,False,False,sim_airwater-pollution-report,sim_air-water-pollution-report
1330,sim_arlis-na-newsletter,ARLIS/NA Newsletter,,1972.0,1981.0,False,False,False,False,False,False,sim_arlisna-newsletter,sim_arlis-na-newsletter
1958,sim_black-male-female-relationships,Black Male/Female Relationships,,1979.0,1982.0,False,False,False,False,False,False,sim_black-malefemale-relationships,sim_black-male-female-relationships
3577,sim_contemporary-ob-gyn,Contemporary OB/GYN,,1973.0,2008.0,False,False,False,False,False,False,sim_contemporary-obgyn,sim_contemporary-ob-gyn


In [101]:
temp_df_slash_sign_bad = temp_df_slash_sign[temp_df_slash_sign["slash"] != temp_df_slash_sign["PubIssueID"]]
print(temp_df_slash_sign_bad.shape[0])
temp_df_slash_sign_bad.head()

6


Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id,slash
3008,sim_cim-technology-casa-smes-magazine,CIM Technology: CASA/SME's Magazine of Compute...,,1982.0,1986.0,False,False,False,False,False,False,sim_cim-technology-casasmes-magazine-of-comput...,sim_cim-technology-casa-smes-magazine-of-compu...
5084,sim_federal-register-find,The Federal Register / FIND,,1936.0,2016.0,False,True,False,False,False,False,sim_the-federal-register-find,sim_the-federal-register-find
9317,sim_marketing-communications-1888,Marketing/Communications,,1888.0,1972.0,False,False,False,False,False,False,sim_marketingcommunications,sim_marketing-communications
11770,sim_proceedings-engineering-sciences-indian-ac...,Proceedings. Engineering Sciences/ Indian Acad...,,1954.0,1995.0,False,False,False,False,False,False,sim_proceedings-engineering-sciences-indian-ac...,sim_proceedings-engineering-sciences-indian-ac...
13947,sim_todays-education-nea-career-education-edition,Today's Education: The Journal of the National...,,1981.0,1982.0,False,False,False,False,False,False,sim_todays-education-the-journal-of-the-nation...,sim_todays-education-the-journal-of-the-nation...


#### "?", "@", "dollar sign", "%", "^", "*", "tick", "~", "<", ">" doesn't usually exist in title

In [103]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "?" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [105]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "@" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [107]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "$" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [108]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "%" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [109]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "^" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [110]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "*" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [111]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "~" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [112]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "`" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [113]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "<" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


In [114]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: ">" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id


#### "#" ignored

In [106]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "#" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id
7055,sim_j-school-psych-6,J/School Psych #6,,,,False,False,False,False,False,False,sim_jschool-psych-6


#### "!" ignored

In [104]:
sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "!" in x)]

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id
2708,sim_challenge-gov,Challenge!,,1969.0,1981.0,False,False,False,False,False,False,sim_challenge


#### "&"
- some ignored

In [116]:
temp_id_and = sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: "&" in x)]
temp_id_and[["PubIssueID", "Title", "basic sim id"]]

Unnamed: 0,PubIssueID,Title,basic sim id
556,sim_american-city-county,The American City & County,sim_the-american-city-county
788,sim_american-medical-assoc-neurology-psychiatry,American Medical Association. A.M.A. Archives ...,sim_american-medical-association-ama-archives-...
1043,sim_annals-of-otology-rhinology-laryngology,"The Annals of Otology, Rhinology & Laryngology",sim_the-annals-of-otology-rhinology-laryngology
1243,sim_architect-building-news,The Architect & Building News,sim_the-architect-building-news
1345,sim_army-al-t,Army AL&T,sim_army-alt
...,...,...,...
13832,sim_the-select-circulating-library,The Select Circulating Library. Containing the...,sim_the-select-circulating-library-containing-...
13973,sim_topics-in-learning-learning-disabilities-t...,Topics in Learning & Learning Disabilities: TL&LD,sim_topics-in-learning-learning-disabilities-tlld
14057,sim_transportation-research-part-e-logistics-t...,"Transportation Research. Part E, Logistics & T...",sim_transportation-research-part-e-logistics-t...
14351,sim_urban-social-change-review,The Urban & Social Change Review,sim_the-urban-social-change-review


#### ":" is dealt with very messily
- some are ignored completely 
- for some, what comes before becomes abbreviated
- some are just shortened

In [51]:
temp_df_colon = sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: ":" in x)]
temp_df_colon[["PubIssueID", "Title", "basic sim id"]]

Unnamed: 0,PubIssueID,Title,basic sim id
16,sim_aacp-teachers-seminar-proceedings,American Association of Colleges of Pharmacy T...,sim_american-association-of-colleges-of-pharma...
23,sim_aatcc-review,American Association of Textile Chemists and C...,sim_american-association-of-textile-chemists-a...
33,sim_abolitionist-or-record-of-thengland-anti-s...,Abolitionist: Or Record of the New England Ant...,sim_abolitionist-or-record-of-the-new-england-...
92,sim_acha-action,American College Health Association: ACHA Action,sim_american-college-health-association-acha-a...
98,sim_acm-computing-surveys,Association for Computing Machinery: Computing...,sim_association-for-computing-machinery-comput...
...,...,...,...
14590,sim_weekly-recorder-a-news-paper,Weekly Recorder: A News Paper Conveying Import...,sim_weekly-recorder-a-news-paper-conveying-imp...
14610,sim_wells-fargo-economic-monitor-california,The Wells Fargo Economic Monitor: California,sim_the-wells-fargo-economic-monitor-california
14636,sim_western-farmer-and-gardener-horticulture-r...,The Western Farmer and Gardener : Devoted to A...,sim_the-western-farmer-and-gardener-devoted-to...
14713,sim_windsor-magazine-an-illustraed-monthly-for...,Windsor Magazine: An Illustrated Monthly for M...,sim_windsor-magazine-an-illustrated-monthly-fo...


#### ";" is 

In [50]:
temp_df_semi_colon = sim_id_non_basic[sim_id_non_basic["Title"].apply(lambda x: ";" in x)]
temp_df_semi_colon[["PubIssueID", "Title", "basic sim id"]]

Unnamed: 0,PubIssueID,Title,basic sim id
148,sim_acta-sanctae-sedis-ephemerides-romanae,Acta Sanctae Sedis; Ephemerides Romanae a SSMO...,sim_acta-sanctae-sedis-ephemerides-romanae-a-s...
472,sim_amateur-work-a-monthly-magazine,Amateur Work; A Monthly Magazine of the Useful...,sim_amateur-work-a-monthly-magazine-of-the-use...
592,sim_american-farmer-devoted-to-agriculture-hor...,"American Farmer; Devoted to Agriculture, Horti...",sim_american-farmer-devoted-to-agriculture-hor...
776,sim_american-magazine-political-state-british-...,American Magazine; Or a Monthly View of the Po...,sim_american-magazine-or-a-monthly-view-of-the...
779,sim_american-masonic-register-and-literary-com...,American Masonic Register and Literary Compani...,sim_american-masonic-register-and-literary-com...
...,...,...,...
14642,sim_western-journal-and-civilian,The Western Journal and Civilian; Devoted to A...,sim_the-western-journal-and-civilian-devoted-t...
14656,sim_western-messenger-devoted-to-religion-life...,"Western Messenger; Devoted to Religion, Life, ...",sim_western-messenger-devoted-to-religion-life...
14657,sim_western-minerva-or-american-annals-of-know...,"Western Minerva; Or, American Annals of Knowle...",sim_western-minerva-or-american-annals-of-know...
14761,sim_wonderful-magazine-and-marvelous-chronicle,"Wonderful Magazine, and Marvelous Chronicle; O...",sim_wonderful-magazine-and-marvelous-chronicle...


In [117]:
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[148, 'Title'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[148, 'PubIssueID'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[148, 'basic sim id'])

Acta Sanctae Sedis; Ephemerides Romanae a SSMO D. N. Pio PP. X Authenticae et Officales Apostolicae Sedis Actis Publice Evulgandis Declaratae
sim_acta-sanctae-sedis-ephemerides-romanae
sim_acta-sanctae-sedis-ephemerides-romanae-a-ssmo-d-n-pio-pp-x-authenticae-et-officales-apostolicae-sedis-actis-publice-evulgandis-declaratae


In [118]:
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[472, 'Title'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[472, 'PubIssueID'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[472, 'basic sim id'])

Amateur Work; A Monthly Magazine of the Useful Arts and Sciences
sim_amateur-work-a-monthly-magazine
sim_amateur-work-a-monthly-magazine-of-the-useful-arts-and-sciences


In [119]:
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[14657, 'Title'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[14657, 'PubIssueID'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[14657, 'basic sim id'])

Western Minerva; Or, American Annals of Knowledge and Literature
sim_western-minerva-or-american-annals-of-knowledge
sim_western-minerva-or-american-annals-of-knowledge-and-literature


In [121]:
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[14966, 'Title'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[14966, 'PubIssueID'])
print(temp_df_semi_colon[['Title','PubIssueID', 'basic sim id']].loc[14966, 'basic sim id'])

Zoist; A Journal of Cerebral Physiology and Mesmerism and Their Applications to Human Welfare
sim_zoist-a-journal-of-cerebral-physiology-and-mesmerism
sim_zoist-a-journal-of-cerebral-physiology-and-mesmerism-and-their-applications-to-human-welfare


### Sim ID generation - Intermediate

In [141]:
def generate_sim_ids_intermediate(journal):
    # special characters
    if "/" in journal: # turn / into -
        journal = re.sub("/", " ", journal)
    if "-" in journal: # drop - first so that we can join with - later
        journal = re.sub("-", " ", journal)
    if "[" in journal: # ignore what's in between '[' and ']'
        index_front = journal.find("[")
        index_back = journal.find("]")
        if index_back > index_front:
            journal = journal[:index_front] + journal[index_back:]
        else:
            journal = journal[:index_front]
    if "=" in journal: # ignore what comes after =
        journal_lst = journal.split("=")
        journal = journal_lst[0]
    
    journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
    sim_ids = []
    if journal != "":
        sim_id = journal.lower()
        sim_id_lst = sim_id.split()
        
        if sim_id_lst[0] == "the":
            sim_id_without_the = "-".join(sim_id_lst[1:])
            sim_id_without_the = "sim_" + sim_id_without_the
            sim_ids.append(sim_id_without_the)
            
        sim_id = "-".join(sim_id_lst)
        sim_id = "sim_" + sim_id
        
        sim_ids.append(sim_id)
    return sim_ids

In [145]:
sim_info_concise["intermediate sim id"] = sim_info_concise["Title"].apply(generate_sim_ids_intermediate)
sim_info_concise.head()

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id,intermediate sim id,intermediate works
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0,False,False,False,False,False,False,sim_anatomia-clinica,[sim_anatomia-clinica],False
1,sim_-,The - -,,1826.0,1826.0,False,True,False,False,False,False,sim_the----,"[sim_, sim_the]",False
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0,False,False,False,False,False,False,sim_1001-home-ideas,[sim_1001-home-ideas],True
3,sim_102-monitor,102 Monitor,,1975.0,1981.0,False,False,False,False,False,False,sim_102-monitor,[sim_102-monitor],True
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0,False,False,False,False,False,False,sim_20th-century-british-history,[sim_20th-century-british-history],True


In [144]:
sim_info_concise["intermediate works"] = sim_info_concise.apply(lambda x: x["PubIssueID"] in x["intermediate sim id"], 
                                                                axis = 1)
print(sim_info_concise[sim_info_concise["intermediate works"]].shape[0])
sim_info_concise[sim_info_concise["intermediate works"]]

13631


Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id,intermediate sim id,intermediate works
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0,False,False,False,False,False,False,sim_1001-home-ideas,[sim_1001-home-ideas],True
3,sim_102-monitor,102 Monitor,,1975.0,1981.0,False,False,False,False,False,False,sim_102-monitor,[sim_102-monitor],True
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0,False,False,False,False,False,False,sim_20th-century-british-history,[sim_20th-century-british-history],True
5,sim_50-plus,50 Plus,,1978.0,1988.0,False,False,False,False,False,False,sim_50-plus,[sim_50-plus],True
6,sim_73-amateur-radio-today,73 Amateur Radio Today,,1960.0,2003.0,False,False,False,False,False,False,sim_73-amateur-radio-today,[sim_73-amateur-radio-today],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14967,sim_zoo-biology,Zoo Biology,,1999.0,2000.0,False,False,False,False,False,False,sim_zoo-biology,[sim_zoo-biology],True
14968,sim_zoologica-scripta,Zoologica Scripta,,1998.0,2001.0,False,False,False,False,False,False,sim_zoologica-scripta,[sim_zoologica-scripta],True
14969,sim_zoomorphology,Zoomorphology,1983; 1985; 1986; 1987; 1988;,1981.0,1996.0,False,False,False,False,False,False,sim_zoomorphology,[sim_zoomorphology],True
14970,sim_zvezda,Zvezda,,1961.0,1974.0,False,False,False,False,False,False,sim_zvezda,[sim_zvezda],True


In [146]:
### compare with basic 
print(sim_info_concise[sim_info_concise["basic sim id"] == sim_info_concise["PubIssueID"]].shape[0])
sim_info_concise[sim_info_concise["basic sim id"] == sim_info_concise["PubIssueID"]]

12534


Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id,intermediate sim id,intermediate works
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0,False,False,False,False,False,False,sim_1001-home-ideas,[sim_1001-home-ideas],True
3,sim_102-monitor,102 Monitor,,1975.0,1981.0,False,False,False,False,False,False,sim_102-monitor,[sim_102-monitor],True
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0,False,False,False,False,False,False,sim_20th-century-british-history,[sim_20th-century-british-history],True
5,sim_50-plus,50 Plus,,1978.0,1988.0,False,False,False,False,False,False,sim_50-plus,[sim_50-plus],True
6,sim_73-amateur-radio-today,73 Amateur Radio Today,,1960.0,2003.0,False,False,False,False,False,False,sim_73-amateur-radio-today,[sim_73-amateur-radio-today],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14967,sim_zoo-biology,Zoo Biology,,1999.0,2000.0,False,False,False,False,False,False,sim_zoo-biology,[sim_zoo-biology],True
14968,sim_zoologica-scripta,Zoologica Scripta,,1998.0,2001.0,False,False,False,False,False,False,sim_zoologica-scripta,[sim_zoologica-scripta],True
14969,sim_zoomorphology,Zoomorphology,1983; 1985; 1986; 1987; 1988;,1981.0,1996.0,False,False,False,False,False,False,sim_zoomorphology,[sim_zoomorphology],True
14970,sim_zvezda,Zvezda,,1961.0,1974.0,False,False,False,False,False,False,sim_zvezda,[sim_zvezda],True


### What can be improved about the intermediate sim ids?
- might be weird format 
- might be just shortened 

In [150]:
sim_info_concise[sim_info_concise["intermediate works"] == False].head(10)

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume,sim id start with the,title start with the,has '[',has ']',has '(',has ')',basic sim id,intermediate sim id,intermediate works
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0,False,False,False,False,False,False,sim_anatomia-clinica,[sim_anatomia-clinica],False
1,sim_-,The - -,,1826.0,1826.0,False,True,False,False,False,False,sim_the----,"[sim_, sim_the]",False
9,sim_a-m-a-archives-of-industrial-health,AMA Archives of Industrial Health,,1950.0,1960.0,False,False,False,False,False,False,sim_ama-archives-of-industrial-health,[sim_ama-archives-of-industrial-health],False
10,sim_a-plus,A+,,1983.0,1989.0,False,False,False,False,False,False,sim_a,[sim_a],False
13,sim_aI-expert,AI Expert,,1986.0,1995.0,False,False,False,False,False,False,sim_ai-expert,[sim_ai-expert],False
16,sim_aacp-teachers-seminar-proceedings,American Association of Colleges of Pharmacy T...,,1950.0,1963.0,False,False,False,False,False,False,sim_american-association-of-colleges-of-pharma...,[sim_american-association-of-colleges-of-pharm...,False
17,sim_aal-bilad-al-sa-udiyyah,Aal-Bilad al Sa'udiyyah,,,,False,False,False,False,False,False,sim_aal-bilad-al-saudiyyah,[sim_aal-bilad-al-saudiyyah],False
23,sim_aatcc-review,American Association of Textile Chemists and C...,,2001.0,2012.0,False,False,False,False,False,False,sim_american-association-of-textile-chemists-a...,[sim_american-association-of-textile-chemists-...,False
33,sim_abolitionist-or-record-of-thengland-anti-s...,Abolitionist: Or Record of the New England Ant...,,1833.0,1833.0,False,False,False,False,False,False,sim_abolitionist-or-record-of-the-new-england-...,[sim_abolitionist-or-record-of-the-new-england...,False
46,sim_abstracts-of-papers-american-mathematical-...,Abstracts of Papers Presented to the American ...,,1980.0,2004.0,False,False,False,False,False,False,sim_abstracts-of-papers-presented-to-the-ameri...,[sim_abstracts-of-papers-presented-to-the-amer...,False


In [149]:
print(sim_info_concise[sim_info_concise["intermediate works"] == False].loc[14965, "Title"])
print(sim_info_concise[sim_info_concise["intermediate works"] == False].loc[14965, "PubIssueID"])
print(sim_info_concise[sim_info_concise["intermediate works"] == False].loc[14965, "intermediate sim id"])

Zodiac, a Monthly Periodical, Devoted to Science, Literature and the Arts
sim_zodiac-a-monthly-periodical-devoted-to-science
['sim_zodiac-a-monthly-periodical-devoted-to-science-literature-and-the-arts']


## Export

In [151]:
df_to_write = sim_info_concise[["PubIssueID", "Title", "NA Gaps", "First Volume", "Last Volume"]]
df_to_write.head()

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0
1,sim_-,The - -,,1826.0,1826.0
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0
3,sim_102-monitor,102 Monitor,,1975.0,1981.0
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0


In [152]:
df_to_write.to_csv("SIM_info.csv", index=False)