# Wikipedia Journal Dump Parsing

In [1]:
import numpy as np 
import pandas as pd
import re
import requests
import datetime



## Load Wikipedia Dump Data

In [2]:
journal_dump = pd.read_json("en.wikipedia.org.journal.20210606.json", lines = True)

In [3]:
journal_dump.head()

Unnamed: 0,a,c
0,0,{{cite journal |first=R. W. |last=Bemer |title...
1,020413 DOJ White Paper,{{cite document|ssrn=1332096 |title=Combatants...
2,"10,000 Bullets","{{cite journal | author=Nowakowski, Kasper | a..."
3,"10,000 Bullets",{{cite journal | author=''Official UK PlayStat...
4,1000 Plant Genomes Project,"{{cite journal | vauthors = Matasci N, Hung LH..."


In [5]:
# total data entries 
journal_dump.shape

(2594772, 2)

## Load SIM information

In [36]:
sim_info = pd.read_csv("SIM_info.csv")
sim_info.head()

Unnamed: 0,Title,First Volume,Last Volume,Subjects,PubIssueID
0,American Journal of Pharmacy and the Sciences ...,1952.0,1995.0,Health & Medical Sciences,sim_american-journal-of-pharmacy-and-the-sciences
1,National Real Estate and Building Journal,1949.0,1956.0,Building & Construction,sim_national-real-estate-and-building-journal
2,The American Naturalist,1872.0,2015.0,Biology,sim_american-naturalist
3,Alcatel Telecommunications Review,1922.0,2002.0,Communication & Information Sciences,sim_alcatel-telecommunications-review
4,The American Journal of Gastroenterology,1949.0,2011.0,Medical Sciences--Gastroenterology,sim_american-journal-of-gastroenterology


## Clean data 

In [7]:
# store different fields as array
journal_data = journal_dump['c'].apply(lambda x: x.split("|"))
journal_data

0          [{{cite journal , first=R. W. , last=Bemer , t...
1          [{{cite document, ssrn=1332096 , title=Combata...
2          [{{cite journal ,  author=Nowakowski, Kasper ,...
3          [{{cite journal ,  author=''Official UK PlaySt...
4          [{{cite journal ,  vauthors = Matasci N, Hung ...
                                 ...                        
2594767    [{{cite journal , vauthors=Ueda H, Ueda M , ti...
2594768    [{{cite journal ,  last = Blok ,  date = 2017 ...
2594769    [{{cite journal ,  vauthors = Hughes CG, McGra...
2594770    [{{cite journal ,  vauthors = Passchier J, Gou...
2594771    [{{cite journal ,  vauthors = Zhilenko VN, Kho...
Name: c, Length: 2594772, dtype: object

In [8]:
journal_df = pd.DataFrame(journal_data)
journal_df = journal_df.rename(columns = {'c': 'citation_list'})
journal_df.head()

Unnamed: 0,citation_list
0,"[{{cite journal , first=R. W. , last=Bemer , t..."
1,"[{{cite document, ssrn=1332096 , title=Combata..."
2,"[{{cite journal , author=Nowakowski, Kasper ,..."
3,"[{{cite journal , author=''Official UK PlaySt..."
4,"[{{cite journal , vauthors = Matasci N, Hung ..."


In [9]:
# get length of the list items 
journal_df["list_length"] = journal_df["citation_list"].apply(lambda x: len(x))
journal_df.head()

Unnamed: 0,citation_list,list_length
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11
1,"[{{cite document, ssrn=1332096 , title=Combata...",5
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9
3,"[{{cite journal , author=''Official UK PlaySt...",8
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12
...,...,...
2594767,"[{{cite journal , vauthors=Ueda H, Ueda M , ti...",9
2594768,"[{{cite journal , last = Blok , date = 2017 ...",7
2594769,"[{{cite journal , vauthors = Hughes CG, McGra...",11
2594770,"[{{cite journal , vauthors = Passchier J, Gou...",11


In [11]:
journal_df['list_length'].min()

2

In [12]:
# Get the first element, the source of the citation
journal_df['type'] = journal_df["citation_list"].apply(lambda x: x[0][7:])
journal_df.head(10)

Unnamed: 0,citation_list,list_length,type
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal
1,"[{{cite document, ssrn=1332096 , title=Combata...",5,document
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal
5,"[{{cite journal , author = One Thousand Plant...",11,journal
6,"[{{Cite journal, last1=Wong, first1=Gane Ka-Sh...",26,journal
7,"[{{cite journal , vauthors = Hayden EC , tit...",11,journal
8,"[{{Cite journal, last1=Cheng, first1=Shifeng, ...",34,journal
9,"[{{Cite journal, last1=Lewin, first1=Harris A....",35,journal


In [13]:
# filter out other values and only keep 'cite journal'
journal_df['is_cite_journal'] = journal_df['type'].apply(lambda x: 'journal' in x)
journal_df.head(10)

Unnamed: 0,citation_list,list_length,type,is_cite_journal
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal,True
1,"[{{cite document, ssrn=1332096 , title=Combata...",5,document,False
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal,True
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal,True
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,True
5,"[{{cite journal , author = One Thousand Plant...",11,journal,True
6,"[{{Cite journal, last1=Wong, first1=Gane Ka-Sh...",26,journal,True
7,"[{{cite journal , vauthors = Hayden EC , tit...",11,journal,True
8,"[{{Cite journal, last1=Cheng, first1=Shifeng, ...",34,journal,True
9,"[{{Cite journal, last1=Lewin, first1=Harris A....",35,journal,True


In [15]:
journals_only = journal_df[journal_df['is_cite_journal'] == True]
journals_only = journals_only.drop(columns = "is_cite_journal")
print("Original Dataframe Size: " + str(journal_df.shape))
print("Size after filtering out non journals: " + str(journals_only.shape))
journals_only.head()

Original Dataframe Size: (2594772, 4)
Size after filtering out non journals: (2580625, 3)


Unnamed: 0,citation_list,list_length,type
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal
5,"[{{cite journal , author = One Thousand Plant...",11,journal


In [16]:
journal_df.loc[0, 'citation_list']

['{{cite journal ',
 'first=R. W. ',
 'last=Bemer ',
 'title=Towards standards for handwritten zero and oh: much ado about nothing (and a letter), or a partial dossier on distinguishing between handwritten zero and oh ',
 'journal=Communications of the ACM ',
 'volume=10 ',
 'issue=8 ',
 'year=1967 ',
 'pages=513–518 ',
 'doi=10.1145/363534.363563',
 's2cid=294510 }}']

## Test URL for one instance 

- Franz Boaz Example https://en.wikipedia.org/wiki/Franz_Boas 
- reference 110 

In [17]:
def find_desired_info(arr):
    journal = None
    date = None
    volume = None
    issue = None
    for field in arr:
        if "=" in field:
            if "journal" in field:
                field_arr = field.split("=")
#                 print(field)
                journal = field_arr[1].strip()
                journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
            if "volume" in field:
                volume_arr = field.split("=")
#                 print(field)
                volume = volume_arr[1].strip()
            if "issue" in field:
                issue_arr = field.split("=")
#                 print(field)
                issue = issue_arr[1].strip()
            if "date" in field:
                date_arr = field.split("=")
#                 print(field)
                date = date_arr[1].strip()
    
    return journal, date, volume, issue

In [18]:
test = "{{Cite journal | last1 = Lewis | first1 = Herbert | date = June 2001 | title = The Passion of Franz Boas | journal = [[American Anthropologist]] | volume = 103 | issue = 2| pages = 447–467 | url = https://archive.org/details/sim_american-anthropologist_2001-06_103_2/page/447/mode/2up | doi=10.1525/aa.2001.103.2.447}}"

test_arr = test.split("|")

In [19]:
test_values = find_desired_info(test_arr)

In [20]:
def process_journal(journal):
    journal = journal.lower()
    journal_array = journal.split()
    journal_result = "-".join(journal_array)
    return journal_result

def process_date(date):
    
    # add code to check date format
    if " " in date and re.match("[A-Za-z0-9 ]+", date):
        date_array = date.split()
        month = date_array[0]
        year = date_array[1]
        if re.match("[A-Za-z0-9]+", month):
            if re.match("[A-Za-z]+", month) and "-" not in month:
                try: 
                    month = datetime.datetime.strptime(month, "%B").month
                except:
                    return year
                if month < 10:
                    month = "0" + str(month)
                else:
                    month = str(month)
                date_result = year + "-" + month
            else:
#                 print("month not a-zA-Z: " + month)
                return year
        else:
#             print("month not a-zA-Z0-9: " + month)
            return year
    else: 
        date_result = date
    return date_result

In [21]:
def get_identifier(values):
    sim = "sim"
    journal = process_journal(values[0])
    date = process_date(values[1])
    volume = values[2]
    issue = values[3]
    identifier = sim + "_" + journal + "_" + date + "_" + volume + "_" + issue
    return identifier

In [22]:
def get_url_to_archive_org(values):
    archive_header = "https://archive.org/details/"
    return archive_header + get_identifier(values)

def test_url_exist(url):
    request = requests.get(url)
    if request.status_code == 200:
#         print('Web site exists')
        return True
    else:
#         print('Web site does not exist') 
        return False

In [23]:
test_url = get_url_to_archive_org(test_values)
test_url

'https://archive.org/details/sim_american-anthropologist_2001-06_103_2'

In [24]:
test_url_exist(test_url)

True

## Cross Reference Journals Citation and SIM Info

In [27]:
def find_journal_name(arr):
    journal = None
    for field in arr:
        if "=" in field:
            if "journal" in field:
                field_arr = field.split("=")
                journal = field_arr[1].strip()
                journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
    
    return journal

In [28]:
journals_only["journal_name"] = journals_only["citation_list"].apply(find_journal_name)
journals_only.head()

Unnamed: 0,citation_list,list_length,type,journal_name
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal,Communications of the ACM
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal,LeveL
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal,Official UK PlayStation 2 Magazine
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,GigaScience
5,"[{{cite journal , author = One Thousand Plant...",11,journal,Nature


In [31]:
def name_to_pub_issue_id(name):
    if name != None:
        sim = "sim_"
        journal_processed = process_journal(name)
        return sim + journal_processed

In [32]:
journals_only["synthetic_pub_issue_id"] = journals_only["journal_name"].apply(name_to_pub_issue_id)
journals_only.head()

Unnamed: 0,citation_list,list_length,type,journal_name,synthetic_pub_issue_id
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal,Communications of the ACM,sim_communications-of-the-acm
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal,LeveL,sim_level
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal,Official UK PlayStation 2 Magazine,sim_official-uk-playstation-2-magazine
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,GigaScience,sim_gigascience
5,"[{{cite journal , author = One Thousand Plant...",11,journal,Nature,sim_nature


In [37]:
sim_info.head()

Unnamed: 0,Title,First Volume,Last Volume,Subjects,PubIssueID
0,American Journal of Pharmacy and the Sciences ...,1952.0,1995.0,Health & Medical Sciences,sim_american-journal-of-pharmacy-and-the-sciences
1,National Real Estate and Building Journal,1949.0,1956.0,Building & Construction,sim_national-real-estate-and-building-journal
2,The American Naturalist,1872.0,2015.0,Biology,sim_american-naturalist
3,Alcatel Telecommunications Review,1922.0,2002.0,Communication & Information Sciences,sim_alcatel-telecommunications-review
4,The American Journal of Gastroenterology,1949.0,2011.0,Medical Sciences--Gastroenterology,sim_american-journal-of-gastroenterology


In [38]:
merged = pd.merge(left=journals_only, right=sim_info, how="inner", 
                  left_on="synthetic_pub_issue_id", right_on="PubIssueID")
merged.head()

Unnamed: 0,citation_list,list_length,type,journal_name,synthetic_pub_issue_id,Title,First Volume,Last Volume,Subjects,PubIssueID
0,"[{{Cite journal, last1=Wong, first1=Gane Ka-Sh...",26,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology
1,"[{{cite journal , last=Takayama , first=Seiji ...",12,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology
2,"[{{cite journal , last=Li , first=Wei , las...",23,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology
3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology
4,"[{{cite journal , vauthors = Popper ZA, Miche...",13,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology


In [41]:
total_journal_count = journals_only.shape[0]
journal_name_in_sim_count = merged.shape[0]
prop_journals_in_sim = journal_name_in_sim_count/total_journal_count
print("Total number of journals in this dump: " + str(total_journal_count))
print("Total number of journals in this dump whose name is in SIM :" + str(journal_name_in_sim_count))
print("Percentage in SIM: " + str(prop_journals_in_sim))

Total number of journals in this dump: 2580625
Total number of journals in this dump whose name is in SIM :924912
Percentage in SIM: 0.3584062000484379


In [47]:
journal_names_in_sim_count = str(len(merged["journal_name"].unique())) 
print("Unique journal names in SIM: " + journal_names_in_sim_count)

Unique journal names in SIM: 8789


In [168]:
def find_year(arr):
    year = ""
    for field in arr:
#         field = field.strip()
        if re.match("year(\s{0,})=", field):
#         if "=" in field:
#             if "year" in field:
            field_arr = field.split("=")
            year = field_arr[1].strip()
#             year = re.sub('[^0-9]+', '', year)
#             if year != '':
#                 year = int(year)
#                 if year < 1800 and year > 2030:
#                     year = None
#             else:
#                 year = None
                
    return year

In [169]:
merged["year"] = merged["citation_list"].apply(find_year)
merged.head()

Unnamed: 0,citation_list,list_length,type,journal_name,synthetic_pub_issue_id,Title,First Volume,Last Volume,Subjects,PubIssueID,year,date,volume,issue
0,"[{{Cite journal, last1=Wong, first1=Gane Ka-Sh...",26,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2016-05-04,71,
1,"[{{cite journal , last=Takayama , first=Seiji ...",12,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2005.0,,56,
2,"[{{cite journal , last=Li , first=Wei , las...",23,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2020-04-29,71,1.0
3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2013.0,,64,1.0
4,"[{{cite journal , vauthors = Popper ZA, Miche...",13,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2011,62,


In [152]:
# Example to test
merged.loc[2, 'citation_list']

['{{cite journal ',
 ' last=Li ',
 ' first=Wei ',
 ' last2=Deng ',
 ' first2=Yiwen ',
 ' last3=Ning ',
 ' first3=Yuese ',
 ' last4=He ',
 ' first4=Zuhua ',
 ' last5=Wang ',
 ' first5=Guo-Liang ',
 ' title=Exploiting Broad-Spectrum Disease Resistance in Crops: From Molecular Dissection to Breeding ',
 ' journal=[[Annual Review of Plant Biology]] ',
 ' publisher=[[Annual Reviews (publisher)',
 'Annual Reviews]] ',
 ' volume=71 ',
 ' issue=1 ',
 ' date=2020-04-29 ',
 ' issn=1543-5008 ',
 ' doi=10.1146/annurev-arplant-010720-022215 ',
 ' pages=575–603 ',
 " quote=p.{{nbs}}587, {{apostrophe}}'''{{red",
 "4.5. Altered Expression of Defense-Signaling and Pathogenesis-Related Genes}}''' Engineering BSR is possible using both defense signaling and ''PR'' genes because they usually function downstream of the immune receptors. ... Defense signaling and PR genes are conserved in different plant species, allowing BSR to be achieved in many crops by expressing the ''Arabidopsis'' defense master regu

In [176]:
def find_date(arr):
    date = ""
#     year = None
#     month = None
    for field in arr:
        field = field.strip()
        if re.match("date(\s{0,})=", field):
            field_arr = field.split("=")
            date = field_arr[1].strip()
                
    return date

In [177]:
# test
print(find_date(["date=2020-04-29 "]))

2020-04-29


In [178]:
merged["date"] = merged["citation_list"].apply(find_date)
merged.head()

Unnamed: 0,citation_list,list_length,type,journal_name,synthetic_pub_issue_id,Title,First Volume,Last Volume,Subjects,PubIssueID,year,date,volume,issue
0,"[{{Cite journal, last1=Wong, first1=Gane Ka-Sh...",26,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2016-05-04,71,
1,"[{{cite journal , last=Takayama , first=Seiji ...",12,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2005.0,,56,
2,"[{{cite journal , last=Li , first=Wei , las...",23,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2020-04-29,71,1.0
3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2013.0,,64,1.0
4,"[{{cite journal , vauthors = Popper ZA, Miche...",13,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2011,62,


In [179]:
def find_volume(arr):
    volume = ""
    for field in arr:
        field = field.strip()
        if re.match("volume(\s{0,})=", field):
            volume_arr = field.split("=")
            volume = volume_arr[1].strip()
    return volume 

def find_issue(arr):
    issue = ""
    for field in arr:
        field = field.strip()
        if re.match("issue(\s{0,})=", field):
            issue_arr = field.split("=")
            issue = issue_arr[1].strip()
    return issue

In [180]:
merged["volume"] = merged["citation_list"].apply(find_volume)
merged["issue"] = merged["citation_list"].apply(find_issue)
merged.head()

Unnamed: 0,citation_list,list_length,type,journal_name,synthetic_pub_issue_id,Title,First Volume,Last Volume,Subjects,PubIssueID,year,date,volume,issue
0,"[{{Cite journal, last1=Wong, first1=Gane Ka-Sh...",26,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2016-05-04,71,
1,"[{{cite journal , last=Takayama , first=Seiji ...",12,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2005.0,,56,
2,"[{{cite journal , last=Li , first=Wei , las...",23,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2020-04-29,71,1.0
3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2013.0,,64,1.0
4,"[{{cite journal , vauthors = Popper ZA, Miche...",13,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,,2011,62,


## Find Method Viability 

- journal name not in sim titles
- journal name in sim titles
    - journal citation does not have time fields
    - journal citation has some form of time 
        - journal citation template has year 
        - journal citation template has date
    

### Only work with those with year

In [181]:
journal_sim_with_year = merged[merged["year"] != ""]
journal_sim_with_year.head()

Unnamed: 0,citation_list,list_length,type,journal_name,synthetic_pub_issue_id,Title,First Volume,Last Volume,Subjects,PubIssueID,year,date,volume,issue
1,"[{{cite journal , last=Takayama , first=Seiji ...",12,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2005,,56,
3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2013,,64,1.0
8,"[{{Cite journal, title=Photorespiration and th...",16,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2012,,63,1.0
13,"[{{cite journal , doi=10.1146/annurev-arplant-...",10,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2011,,62,
20,"[{{cite journal , doi = 10.1146/annurev-arplan...",18,journal,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,Annual Review of Plant Biology,1950.0,2016.0,Agriculture|Biology--Botany|Gardening And Hort...,sim_annual-review-of-plant-biology,2013,,64,


In [175]:
total_journal_sim_count = merged.shape[0]
journal_sim_with_year_count = journal_sim_with_year.shape[0]
prop_journal_sim_with_year = journal_sim_with_year_count/total_journal_sim_count
print("Number of journals in SIM: " + str(total_journal_sim_count))
print("Number of journals in SIM with year :" + str(journal_sim_with_year_count))
print("Percentage with year: " + str(prop_journal_sim_with_year))

Number of journals in SIM: 924912
Number of journals in SIM with year :207955
Percentage with year: 0.22483760617226287


#### Filter out Unnecessary Columns 

In [221]:
journal_sim_yr = journal_sim_with_year[["citation_list", "list_length", "journal_name", 
                                        "PubIssueID", "year", "volume"]]
journal_sim_yr.head()

Unnamed: 0,citation_list,list_length,journal_name,PubIssueID,year,volume
1,"[{{cite journal , last=Takayama , first=Seiji ...",12,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2005,56
3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2013,64
8,"[{{Cite journal, title=Photorespiration and th...",16,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2012,63
13,"[{{cite journal , doi=10.1146/annurev-arplant-...",10,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2011,62
20,"[{{cite journal , doi = 10.1146/annurev-arplan...",18,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2013,64


In [223]:
def has_journal_year_volume(row):
    return row["PubIssueID"] != "" and row["year"] != "" and row["volume"] != ""

In [225]:
journal_sim_yr["has_3_fields"] = journal_sim_yr.apply(has_journal_year_volume, axis = 1)
journal_sim_yr.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,citation_list,list_length,journal_name,PubIssueID,year,volume,has_3_fields
1,"[{{cite journal , last=Takayama , first=Seiji ...",12,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2005,56,True
3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2013,64,True
8,"[{{Cite journal, title=Photorespiration and th...",16,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2012,63,True
13,"[{{cite journal , doi=10.1146/annurev-arplant-...",10,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2011,62,True
20,"[{{cite journal , doi = 10.1146/annurev-arplan...",18,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2013,64,True


In [226]:
journal_sim_yr_has_3 = journal_sim_yr[journal_sim_yr["has_3_fields"] == True]
print("Number of journals in SIM with year: " + str(journal_sim_yr.shape[0]))
print("Number of journals in SIM with journal, year, volume: " + str(journal_sim_yr_has_3.shape[0]))
print("Percentage with 3 fields: " + str(journal_sim_yr_has_3.shape[0]/journal_sim_yr.shape[0]))

Number of journals in SIM with year: 207955
Number of journals in SIM with journal, year, volume: 183254
Percentage with 3 fields: 0.8812194946021976


In [233]:
def generate_archive_url_with_journal_year_volume(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["PubIssueID"])
    year = "_" + str(row["year"])
    volume = "_" + str(row["volume"])
#     issue = ""
#     if row["issue"] != "":
#         issue = "_" + row["issue"]
    identifier = journal + year + volume
    return archive_header + identifier
#     return ""

In [257]:
# test method with false result
print("data: ")
print(journal_sim_with_year.iloc[96, :])
url_example = generate_archive_url_with_journal_year_volume(journal_sim_yr_has_3.iloc[96, :])
print("url: " + url_example)
print("url exist: " + str(test_url_exist(url_example)))

data: 
citation_list             [{{cite journal, last1=Szent-Györgyi, first1=A...
list_length                                                              12
type                                                                journal
journal_name                                                        Science
synthetic_pub_issue_id                                          sim_science
Title                                                               Science
First Volume                                                         1979.0
Last Volume                                                          1986.0
Subjects                                   Mathematical & Physical Sciences
PubIssueID                                                      sim_science
year                                                                   1972
date                                                                       
volume                                                                  176
issue

In [258]:
# test method with true result
print("data: ")
print(journal_sim_with_year.iloc[2, :])
url_example = generate_archive_url_with_journal_year_volume(journal_sim_yr_has_3.iloc[2, :])
print("url: " + url_example)
print("url exist: " + str(test_url_exist(url_example)))

data: 
citation_list             [{{Cite journal, title=Photorespiration and th...
list_length                                                              16
type                                                                journal
journal_name                                 Annual Review of Plant Biology
synthetic_pub_issue_id                   sim_annual-review-of-plant-biology
Title                                        Annual Review of Plant Biology
First Volume                                                         1950.0
Last Volume                                                          2016.0
Subjects                  Agriculture|Biology--Botany|Gardening And Hort...
PubIssueID                               sim_annual-review-of-plant-biology
year                                                                   2012
date                                                                       
volume                                                                   63
issue

In [262]:
journal_sim_yr_has_3["url"] = journal_sim_yr_has_3.apply(generate_archive_url_with_journal_year_volume, axis = 1)
journal_sim_yr_has_3 = journal_sim_yr_has_3.reset_index()
journal_sim_yr_has_3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,index,citation_list,list_length,journal_name,PubIssueID,year,volume,has_3_fields,url,url_exists
0,1,"[{{cite journal , last=Takayama , first=Seiji ...",12,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2005,56,True,https://archive.org/details/sim_annual-review-...,True
1,3,"[{{cite journal, last1=Geldner, first1=N., tit...",11,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2013,64,True,https://archive.org/details/sim_annual-review-...,True
2,8,"[{{Cite journal, title=Photorespiration and th...",16,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2012,63,True,https://archive.org/details/sim_annual-review-...,True
3,13,"[{{cite journal , doi=10.1146/annurev-arplant-...",10,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2011,62,True,https://archive.org/details/sim_annual-review-...,True
4,20,"[{{cite journal , doi = 10.1146/annurev-arplan...",18,Annual Review of Plant Biology,sim_annual-review-of-plant-biology,2013,64,True,https://archive.org/details/sim_annual-review-...,True


In [265]:
url_exist_0_100 = journal_sim_yr_has_3.loc[0:100, "url"].apply(test_url_exist)
url_exist_0_100

0       True
1       True
2       True
3       True
4       True
       ...  
96     False
97     False
98     False
99     False
100    False
Name: url, Length: 101, dtype: bool

In [266]:
url_exist_0_100.value_counts()

False    62
True     39
Name: url, dtype: int64

##### Sampling

In [267]:
url_exist_random_100 = journal_sim_yr_has_3.sample(n = 100)["url"].apply(test_url_exist)
url_exist_random_100

156895    False
115703    False
25598     False
12128     False
137537    False
          ...  
69848     False
37354     False
73298     False
23867     False
29073     False
Name: url, Length: 100, dtype: bool

In [268]:
url_exist_random_100.value_counts()

False    99
True      1
Name: url, dtype: int64

In [270]:
# url_exist_sample_1_percent = journal_sim_yr_has_3.sample(frac = 0.01)["url"].apply(test_url_exist)
# url_exist_sample_1_percent

In [271]:
# url_exist_sample_1_percent.value_counts()

##### Full Data

In [273]:
# url_exist_0_1000 = journal_sim_yr_has_3.loc[0:1000, "url"].apply(test_url_exist)
# url_exist_0_1000

In [None]:
journal_sim_yr_has_3["url_exists"] = full_url_exist_lst
journal_sim_yr_has_3

In [None]:
journal_sim_yr_has_3_good_link = journal_sim_yr_has_3[journal_sim_yr_has_3["url_exists"] == True]

In [None]:
print("Number of journals in SIM with journal, year, column: " + str(journal_sim_yr_has_3.shape[0]))
print("Number of journals in SIM with journal, year, volume and has good link: " 
      + str(journal_sim_yr_has_3_good_link.shape[0]))
print("Percentage with 3 fields: " + str(journal_sim_yr_has_3_good_link.shape[0]/journal_sim_yr_has_3.shape[0]))

#### Section Conclusion

In this specific data dump, there are 2580625 citation entries, 35% of them have matching titles in the Serials in Microfilms collection. I've noticed that some uses a citation template with "date" field and some with "year" field. I focused on those using the "year" template. Around 88% of these with "year" template also has title, year, and volume. And of these with 3 fields (title, year, and volume), _% of them generated valid links! A table is generated below.

In [None]:
# Generate a csv of links
journal_sim_yr_has_3_good_link.to_csv("Journal_SIM_good_links_with_yr_title_volume.csv")

## Find Method Viability (attempt #1 - directly from single test case) 

In [152]:
journals_only.head()

Unnamed: 0,citation_list,list_length,type
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal
5,"[{{cite journal , author = One Thousand Plant...",11,journal


In [153]:
journals_only['journal_info'] = journals_only['citation_list'].apply(find_desired_info)
journals_only.head()

Unnamed: 0,citation_list,list_length,type,journal_info
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal,"(Communications of the ACM, None, 10, 8)"
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal,"(LeveL, August 2009, None, 41}})"
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal,"(Official UK PlayStation 2 Magazine, August 20..."
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,"(GigaScience, 2014, 3, 17)"
5,"[{{cite journal , author = One Thousand Plant...",11,journal,"(Nature, October 2019, 574, 7780)"


In [154]:
def has_4_fields(arr):
    for value in arr:
        if value == None:
            return False
    return True

In [155]:
has_4_fields(test_values)

True

In [156]:
journals_only["has_all_4_fields"] = journals_only["journal_info"].apply(has_4_fields)
journals_only.head()

Unnamed: 0,citation_list,list_length,type,journal_info,has_all_4_fields
0,"[{{cite journal , first=R. W. , last=Bemer , t...",11,journal,"(Communications of the ACM, None, 10, 8)",False
2,"[{{cite journal , author=Nowakowski, Kasper ,...",9,journal,"(LeveL, August 2009, None, 41}})",False
3,"[{{cite journal , author=''Official UK PlaySt...",8,journal,"(Official UK PlayStation 2 Magazine, August 20...",False
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,"(GigaScience, 2014, 3, 17)",True
5,"[{{cite journal , author = One Thousand Plant...",11,journal,"(Nature, October 2019, 574, 7780)",True


#### Stats for proportion with desired info

In [162]:
total_count = journals_only.shape[0]
# full_values_count = journals_only[journals_only["has_all_4_fields"] == True].shape[0]
# partial_values_count = journals_only[journals_only["has_all_4_fields"] == False].shape[0]

In [167]:
total_count = journals_only.shape[0]
print("Cite Journal Count: "+ str(total_count))
full_partial_value_counts = journals_only["has_all_4_fields"].value_counts()
print("Citations with 4 desired fields: " + str(full_partial_value_counts[True]))
print("Citations without 4 desired fields: " + str(full_partial_value_counts[False]))

Cite Journal Count: 2580625
Citations with 4 desired fields: 1083810
Citations without 4 desired fields: 1496815


In [169]:
prop_with = full_partial_value_counts[True] / total_count
print("Proportion of citations that can be parsed as such: " + str(prop_with))

Proportion of citations that can be parsed as such: 0.4199796560910632


## Process Citations With Desired Fields (Journal, Date, Volume, Issue)

In [158]:
journals_only_all_info = journals_only[journals_only["has_all_4_fields"] == True]
journals_only_all_info = journals_only_all_info.drop(columns = "has_all_4_fields")
journals_only_all_info.head()

Unnamed: 0,citation_list,list_length,type,journal_info
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,"(GigaScience, 2014, 3, 17)"
5,"[{{cite journal , author = One Thousand Plant...",11,journal,"(Nature, October 2019, 574, 7780)"
7,"[{{cite journal , vauthors = Hayden EC , tit...",11,journal,"(Nature, January 2008, 451, 7177)"
8,"[{{Cite journal, last1=Cheng, first1=Shifeng, ...",34,journal,"(GigaScience, 2018-03-01, 7, 3)"
9,"[{{Cite journal, last1=Lewin, first1=Harris A....",35,journal,(Proceedings of the National Academy of Scienc...


In [192]:
journals_only_all_info["journal_name"] = journals_only_all_info["journal_info"].apply(lambda x: process_journal(x[0]))
journals_only_all_info.head()

Unnamed: 0,citation_list,list_length,type,journal_info,journal_name
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,"(GigaScience, 2014, 3, 17)",gigascience
5,"[{{cite journal , author = One Thousand Plant...",11,journal,"(Nature, October 2019, 574, 7780)",nature
7,"[{{cite journal , vauthors = Hayden EC , tit...",11,journal,"(Nature, January 2008, 451, 7177)",nature
8,"[{{Cite journal, last1=Cheng, first1=Shifeng, ...",34,journal,"(GigaScience, 2018-03-01, 7, 3)",gigascience
9,"[{{Cite journal, last1=Lewin, first1=Harris A....",35,journal,(Proceedings of the National Academy of Scienc...,proceedings-of-the-national-academy-of-sciences


In [199]:
journals_only_all_info["journal_date"] = journals_only_all_info["journal_info"].apply(lambda x: process_date(x[1]))
journals_only_all_info.head()

Unnamed: 0,citation_list,list_length,type,journal_info,journal_name,journal_date
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,"(GigaScience, 2014, 3, 17)",gigascience,2014
5,"[{{cite journal , author = One Thousand Plant...",11,journal,"(Nature, October 2019, 574, 7780)",nature,2019-10
7,"[{{cite journal , vauthors = Hayden EC , tit...",11,journal,"(Nature, January 2008, 451, 7177)",nature,2008-01
8,"[{{Cite journal, last1=Cheng, first1=Shifeng, ...",34,journal,"(GigaScience, 2018-03-01, 7, 3)",gigascience,2018-03-01
9,"[{{Cite journal, last1=Lewin, first1=Harris A....",35,journal,(Proceedings of the National Academy of Scienc...,proceedings-of-the-national-academy-of-sciences,2018-04-24


In [200]:
journals_only_all_info["generated_url"] = journals_only_all_info["journal_info"].apply(get_url_to_archive_org)
journals_only_all_info.head()

Unnamed: 0,citation_list,list_length,type,journal_info,journal_name,journal_date,generated_url
4,"[{{cite journal , vauthors = Matasci N, Hung ...",12,journal,"(GigaScience, 2014, 3, 17)",gigascience,2014,https://archive.org/details/sim_gigascience_20...
5,"[{{cite journal , author = One Thousand Plant...",11,journal,"(Nature, October 2019, 574, 7780)",nature,2019-10,https://archive.org/details/sim_nature_2019-10...
7,"[{{cite journal , vauthors = Hayden EC , tit...",11,journal,"(Nature, January 2008, 451, 7177)",nature,2008-01,https://archive.org/details/sim_nature_2008-01...
8,"[{{Cite journal, last1=Cheng, first1=Shifeng, ...",34,journal,"(GigaScience, 2018-03-01, 7, 3)",gigascience,2018-03-01,https://archive.org/details/sim_gigascience_20...
9,"[{{Cite journal, last1=Lewin, first1=Harris A....",35,journal,(Proceedings of the National Academy of Scienc...,proceedings-of-the-national-academy-of-sciences,2018-04-24,https://archive.org/details/sim_proceedings-of...


In [204]:
journals_only_all_info["url_exists"] = journals_only_all_info["generated_url"].apply(test_url_exist)
journals_only_all_info.head()

KeyboardInterrupt: 

In [89]:
url_temp = "https://archive.org/advancedsearch.php?q=sim_annual-of-plant-biology&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&callback=callback&save=yes&output=tables"
test_url_exist(url_temp)

True

In [93]:
url_temp2 = "https://archive.org/advancedsearch.php?q=sim_annual-review-of-plant-biology&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes"
req_temp2 = requests.get(url_temp2)
req_temp2

<Response [200]>

In [101]:
req_temp2.text

'callback({"responseHeader":{"status":0,"QTime":112,"params":{"query":"(( ( (title:\\"sim_annual-review-of-plant-biology\\"^100 OR salients:\\"sim_annual-review-of-plant-biology\\"^50 OR subject:\\"sim_annual-review-of-plant-biology\\"^25 OR description:\\"sim_annual-review-of-plant-biology\\"^15 OR collection:\\"sim_annual-review-of-plant-biology\\"^10 OR language:\\"sim_annual-review-of-plant-biology\\"^10 OR text:\\"sim_annual-review-of-plant-biology\\"^1) ) AND !collection:(podcasts OR radio OR uspto))^2 OR ( ( (title:\\"sim_annual-review-of-plant-biology\\"^100 OR salients:\\"sim_annual-review-of-plant-biology\\"^50 OR subject:\\"sim_annual-review-of-plant-biology\\"^25 OR description:\\"sim_annual-review-of-plant-biology\\"^15 OR collection:\\"sim_annual-review-of-plant-biology\\"^10 OR language:\\"sim_annual-review-of-plant-biology\\"^10 OR text:\\"sim_annual-review-of-plant-biology\\"^1) ) AND collection:(podcasts OR radio OR uspto))^0.5)","qin":"sim_annual-review-of-plant-biol