# URL Generation With Advanced Search

In [1]:
import numpy as np 
import pandas as pd
import re
import requests
import datetime
import dateparser
import json 

from bs4 import BeautifulSoup

import gspread
from gspread_dataframe import set_with_dataframe



## Load Data

In [2]:
journal_dump = pd.read_json("tr.wikipedia.org.journal.20210614.json.gz", lines = True)
journal_dump.head()

Unnamed: 0,a,c
0,1 + 2 + 3 + 4 + · · ·,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...
1,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...
2,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...
3,141 Likya depremi,{{Akademik dergi kaynağı | url=http://blackmed...
4,141 Likya depremi,{{Akademik dergi kaynağı | url=http://www.nat-...


In [3]:
journal_dump.shape

(14624, 2)

In [4]:
def citation_string_into_list(citation):
    citation = re.sub('[{}]', '', citation)
    citation_list = citation.split("|")
    return citation_list

In [5]:
# Test function
citations = journal_dump['c'].apply(citation_string_into_list)
citations[0:5][1]

['Akademik dergi kaynağı\n ',
 ' ad1    = B.\n ',
 ' soyadı1     = Carry\n ',
 'başlık= Density of asteroids\n ',
 'iş= Planetary and Space Science\n ',
 'cilt= 73\n ',
 'sayfalar=98-118',
 'tarih= December 2012\n ',
 'doi= 10.1016/j.pss.2012.03.009\n ',
 'bibcode= 2012P&SS...73...98C\n ',
 'postscript= .\n',
 'arxiv = 1203.4336\n ']

In [6]:
### change this part for different alias for different languages 
journal_aliases = ['journal', 'newspaper', 'magazine', 'work','website',  'periodical', 
                       'encyclopedia', 'encyclopaedia', 'dictionary', 'mailinglist','dergi', 'gazete', 
                       'eser', 'çalışma', 'iş', 'websitesi', 'süreliyayın', 'ansiklopedi', 'sözlük', 'program']
    
date_aliases = ['date', 'air-date', 'airdate', 'tarih']
    
year_aliases = ['year', 'yıl', 'sene']
    
volume_aliases = ['volume', 'cilt']
    
issue_aliases = ['issue', 'number', 'sayı', 'numara']

page_aliases = ['p', 'page', 's', 'sayfa']
pages_aliases = ['pp', 'pages', 'ss', 'sayfalar']

url_aliases = ['url', 'URL', 'katkı-url', 'chapter-url', 'contribution-url', 'entry-url', 
               'article-url', 'section-url']

title_aliases= ['title', 'başlık']

doi_aliases = ['doi', 'DOI', 'pmid', 'PMID', 'jstor']

In [7]:
# Parsing a wikipedia citation data
def parse_citation_data(citation):

    citation = re.sub('[{}]', '', citation)
    citation_list = citation.split("|")
    
    journal = ""
    sim_id = ""
    volume = ""
    issue = ""
    
    title = ""
    page = ""
    
    url = ""
    doi = ""
    
    date = ""
    month_str = ""
    year = 0
    
    for field in citation_list:
        field = field.strip()
        
        # find journal title
        for j_a in journal_aliases:
            journal_regex = j_a + "(\s{0,})="
            if re.match(re.compile(journal_regex), field):
                journal = field.split("=")[1].strip()
                journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
                if journal != "":
                    sim_id = journal.lower()
                    sim_id_lst = sim_id.split()
                    sim_id = "-".join(sim_id_lst)
                    sim_id = "sim_" + sim_id
                break
   
        # find journal volume 
        for v_a in volume_aliases:
            volume_regex = v_a + "(\s{0,})="
            if re.match(re.compile(volume_regex), field):
                volume = field.split("=")[1].strip()
                volume = re.sub('[^0-9]+', '', volume)
                break
            
        # find journal issue
        for i_a in issue_aliases:
            issue_regex = i_a + "(\s{0,})="
            if re.match(issue_regex, field):
                issue = field.split("=")[1].strip()
                break
        
        # find journal year
        for y_a in year_aliases:
            year_regex = y_a + "(\s{0,})="
            if re.match(year_regex, field):
                year = field.split("=")[1].strip()
                date = re.sub('[^0-9]+', '', year)
                try:
                    year = int(date)
                except:
                    year = 0
                break
            
        # find journal date
        for d_a in date_aliases:
            date_regex = d_a + "(\s{0,})="
            if re.match(date_regex, field):
                date = field.split("=")[1].strip()
                
#                 print("this should be a date " + date)

                try:
                    year = int(date)
                    date = str(year)
                except:
                    # use the python library for parsing
                    parsed_date = dateparser.parse(date)
                    if parsed_date != None:
                        if parsed_date.year < 2021 and parsed_date.year > 1800:
                            year = parsed_date.year
                            date = str(year)

                        if parsed_date.month < 10:
                            month = parsed_date.month
                            month_str = "0" + str(month)
                        else:
                            month = parsed_date.month
                            month_str = str(month)
                            
#                         print(month_str)

                        if month_str != "":
                            date = date + "-" + month_str 
                break
        
         # find existing url
        for u_a in url_aliases:
            url_regex = u_a + "(\s{0,})="
            if re.match(url_regex, field):
                url = field.split("=")[1].strip()
                break
            
        # find page field 
        for p_a in page_aliases:
            page_regex = p_a + "(\s{0,})="
            if re.match(page_regex, field):
                page = field.split("=")[1].strip()
                if "[" in page:
                    page = ""
                break
                
        # find pages field
        for ps_a in pages_aliases:
            pages_regex = ps_a + "(\s{0,})="
            if re.match(pages_regex, field):
                pages = field.split("=")[1].strip()
                if "[" not in pages:
                    if "-" in pages:
                        page = pages.split("-")[0].strip()
                    elif "–" in pages:
                        page = pages.split("–")[0].strip()
                    else:
                        page = ""
                        
                break
                
        # find page field 
        for t_a in title_aliases:
            title_regex = t_a + "(\s{0,})="
            if re.match(title_regex, field):
                title = field.split("=")[1].strip()
                if "[" in title:
                    title = ""
                break
                
        # find field 
        for doi_a in doi_aliases:
            doi_regex = doi_a + "(\s{0,})="
            if re.match(doi_regex, field):
                doi = field.split("=")[1].strip()
                break
            
    return {'journal': journal, 'sim_id': sim_id, 'date': date, 'year': year, 
            'volume': volume, 'issue': issue, 'title': title, 'page': page, 
            'url': url, 'doi': doi}
        

In [8]:
# parse citation data test 
cite_test = "{{Cite journal | last1 = Lewis | first1 = Herbert | date = June 2001 | title = The Passion of Franz Boas | journal = [[American Anthropologist]] | volume = 103 | issue = 2| pages = 447–467 | url = https://archive.org/details/sim_american-anthropologist_2001-06_103_2/page/447/mode/2up | doi=10.1525/aa.2001.103.2.447}}"
parse_citation_data(cite_test)


{'journal': 'American Anthropologist',
 'sim_id': 'sim_american-anthropologist',
 'date': '2001-06',
 'year': 2001,
 'volume': '103',
 'issue': '2',
 'title': 'The Passion of Franz Boas',
 'page': '447',
 'url': 'https://archive.org/details/sim_american-anthropologist_2001-06_103_2/page/447/mode/2up',
 'doi': '10.1525/aa.2001.103.2.447'}

In [9]:
cite_test2 = journal_dump.loc[2, 'c']
print(cite_test2)
parse_citation_data(cite_test2)

{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-Guang|soyadı2=Chen|ad2=I-Ming|soyadı3=Yen|ad3=Jui-Hung|soyadı4=Wang|ad4=Yei-Shung|tarih=Aralık 1999|başlık=Partial solubility parameters of chlorobenzene and chlorophenol compounds at equilibrium distribution in two immiscible phases|dergi=Chemosphere|cilt=39|sayı=15|sayfalar=2607-2620|doi=10.1016/s0045-6535(99)00173-3|issn=0045-6535}}


{'journal': 'Chemosphere',
 'sim_id': 'sim_chemosphere',
 'date': '1999-12',
 'year': 1999,
 'volume': '39',
 'issue': '15',
 'title': 'Partial solubility parameters of chlorobenzene and chlorophenol compounds at equilibrium distribution in two immiscible phases',
 'page': '2607',
 'url': '',
 'doi': '10.1016/s0045-6535(99)00173-3'}

In [10]:
parsed_citations = journal_dump['c'].apply(parse_citation_data)
parsed_citations_df = pd.DataFrame(parsed_citations.tolist())

In [11]:
parsed_citations_df.head()

Unnamed: 0,journal,sim_id,date,year,volume,issue,title,page,url,doi
0,Gncel Matematik,sim_gncel-matematik,1999,1999,248,,Nokta operatör cebirleri ve zeta fonksiyonu,327.0,http://arxiv.org/abs/math/9909178,
1,Planetary and Space Science,sim_planetary-and-space-science,2012-12,2012,73,,Density of asteroids,98.0,,10.1016/j.pss.2012.03.009
2,Chemosphere,sim_chemosphere,1999-12,1999,39,15.0,Partial solubility parameters of chlorobenzene...,2607.0,,10.1016/s0045-6535(99)00173-3
3,Journal of Black SeaMediterranean Environment,sim_journal-of-black-seamediterranean-environment,2007,2007,13,,Traces of Historical earthquakes in the ancien...,241.0,http://blackmeditjournal.org/pdf/4-%20Traces%2...,
4,Natural Hazards and Earth System Sciences,sim_natural-hazards-and-earth-system-sciences,2011,2011,11,,Revision of the tsunami catalogue affecting Tu...,,http://www.nat-hazards-earth-syst-sci.net/11/2...,10.5194/nhess-11-273-2011


In [12]:
parsed_citations_df = pd.concat([journal_dump, parsed_citations_df], axis=1)
parsed_citations_df.head()

Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi
0,1 + 2 + 3 + 4 + · · ·,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...,Gncel Matematik,sim_gncel-matematik,1999,1999,248,,Nokta operatör cebirleri ve zeta fonksiyonu,327.0,http://arxiv.org/abs/math/9909178,
1,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...,Planetary and Space Science,sim_planetary-and-space-science,2012-12,2012,73,,Density of asteroids,98.0,,10.1016/j.pss.2012.03.009
2,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,39,15.0,Partial solubility parameters of chlorobenzene...,2607.0,,10.1016/s0045-6535(99)00173-3
3,141 Likya depremi,{{Akademik dergi kaynağı | url=http://blackmed...,Journal of Black SeaMediterranean Environment,sim_journal-of-black-seamediterranean-environment,2007,2007,13,,Traces of Historical earthquakes in the ancien...,241.0,http://blackmeditjournal.org/pdf/4-%20Traces%2...,
4,141 Likya depremi,{{Akademik dergi kaynağı | url=http://www.nat-...,Natural Hazards and Earth System Sciences,sim_natural-hazards-and-earth-system-sciences,2011,2011,11,,Revision of the tsunami catalogue affecting Tu...,,http://www.nat-hazards-earth-syst-sci.net/11/2...,10.5194/nhess-11-273-2011


## Extract Desired Dataframe

### Citations without Existing Urls 

In [13]:
df_wo_url = parsed_citations_df[parsed_citations_df['url'] == ""]
print("There are " + str(df_wo_url.shape[0]) + " without existing urls out of " + str(parsed_citations_df.shape[0]))
df_wo_url.head()

There are 8153 without existing urls out of 14624


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi
1,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...,Planetary and Space Science,sim_planetary-and-space-science,2012-12,2012,73,,Density of asteroids,98,,10.1016/j.pss.2012.03.009
2,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,39,15.0,Partial solubility parameters of chlorobenzene...,2607,,10.1016/s0045-6535(99)00173-3
5,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=Earthquake sou...,Tectonophysics,sim_tectonophysics,2012-04,2012,536537,,Earthquake source parameters along the Helleni...,61,,10.1016/j.tecto.2012.02.019
6,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=Historical and...,Marine Geology,sim_marine-geology,2014-08,2014,354,,Historical and pre-historical tsunamis in the ...,81,,10.1016/j.margeo.2014.04.014
8,1481 Rodos depremi,{{Akademik dergi kaynağı|başlık=New approaches...,Nat Hazards,sim_nat-hazards,2010-12,2010,63,1.0,New approaches in assessment of tsunami deposi...,181,,10.1007/s11069-010-9692-5


In [14]:
df_wo_url_wo_doi = df_wo_url[df_wo_url['doi'] == ""]
print("There are " + str(df_wo_url_wo_doi.shape[0]) + " without existing urls out of " + str(df_wo_url.shape[0]))
df_wo_url_wo_doi.head()

There are 2512 without existing urls out of 8153


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi
21,3-Oktanon,{{Akademik dergi kaynağı |başlık= Freshly Dist...,Z Naturforsch C,sim_z-naturforsch-c,1978,1978,33,1-2,Freshly Distilled Oil of the Leaves of Rasmari...,,,
30,5-HT-Reseptörü,{{Akademik dergi kaynağı|başlık=International ...,Pharmacol Rev,sim_pharmacol-rev,1994,1994,46,2,International Union of Pharmacology classifica...,157.0,,
31,5-HT-Reseptörü,{{Akademik dergi kaynağı|url=|başlık=The molec...,Biol Psychiatry,sim_biol-psychiatry,1998,1998,44,11,The molecular biology of serotonin receptors: ...,1128.0,,
32,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,23,8,5-HT1F receptor agonists in acute migraine tre...,776.0,,
36,5-HT-Reseptörü,{{Akademik dergi kaynağı|url=|başlık=Appetite ...,Trends Pharmacol Sci,sim_trends-pharmacol-sci,1997,1997,18,1,Appetite suppression by commonly used drugs de...,21.0,,


### Citations with Existing Urls
- don't need to work with them

In [15]:
# df_w_url = parsed_citations_df[parsed_citations_df['url'] != ""]
# print("There are " + str(df_w_url.shape[0]) + " without existing urls out of " + str(parsed_citations_df.shape[0]))
# df_w_url.head()

### Filter Citations Not in the SIM collections

#### Load info about SIM collections

In [16]:
# Load SIM information processed by SIM Metadata Parsing.ipynb
sim_info = pd.read_csv("SIM_info.csv")
sim_info.head()

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0
1,sim_-,The - -,,1826.0,1826.0
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0
3,sim_102-monitor,102 Monitor,,1975.0,1981.0
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0


In [17]:
 # Limit to citations with the exact sim_id match
df_wo_url_in_sim = pd.merge(left=df_wo_url, right=sim_info, how="inner", 
                  left_on="sim_id", right_on="PubIssueID")
print("Number of Citations without URL in SIM : " + str(df_wo_url_in_sim.shape[0]))
df_wo_url_in_sim.head()

Number of Citations without URL in SIM : 2634


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...,Planetary and Space Science,sim_planetary-and-space-science,2012-12,2012,73,,Density of asteroids,98,,10.1016/j.pss.2012.03.009,sim_planetary-and-space-science,Planetary and Space Science,,1959.0,2002.0
1,Uzay Araştırma ve Teknolojisi Enstitüsü,{{Akademik dergi kaynağı|başlık=Results of TV ...,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39,1–2,Results of TV imaging of phobos (experiment VS...,281,,10.1016/0032-0633(91)90150-9,sim_planetary-and-space-science,Planetary and Space Science,,1959.0,2002.0
2,Azas Platosu,{{Akademik dergi kaynağı|soyadı1=Komatsu|ad1=G...,Planetary and Space Science,sim_planetary-and-space-science,2004-01,2004,52,1-3,"Interior layered deposits of Valles Marineris,...",167,,10.1016/j.pss.2003.08.003,sim_planetary-and-space-science,Planetary and Space Science,,1959.0,2002.0
3,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,39,15,Partial solubility parameters of chlorobenzene...,2607,,10.1016/s0045-6535(99)00173-3,sim_chemosphere,Chemosphere,,1972.0,2003.0
4,Orbitrap,{{Akademik dergi kaynağı|başlık=Identification...,Chemosphere,sim_chemosphere,2014-07,2014,107,,Identification of phase II pharmaceutical meta...,65,,10.1016/j.chemosphere.2014.03.021,sim_chemosphere,Chemosphere,,1972.0,2003.0


In [18]:
 # Limit to citations with the exact sim_id match
df_wo_url_wo_doi_in_sim = pd.merge(left=df_wo_url_wo_doi, right=sim_info, how="inner", 
                  left_on="sim_id", right_on="PubIssueID")
print("Number of Citations without URL in SIM : " + str(df_wo_url_wo_doi_in_sim.shape[0]))
df_wo_url_wo_doi_in_sim.head()

Number of Citations without URL in SIM : 642


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,23.0,8.0,5-HT1F receptor agonists in acute migraine tre...,776.0,,,sim_cephalalgia,Cephalalgia,,1989.0,2004.0
1,AIDS Farkındalık Haftası,{{Akademik dergi kaynağı|başlık=Update / Mise ...,Canadian Journal of Public Health,sim_canadian-journal-of-public-health,2002-01,2002,93.0,5.0,Update / Mise à jour,,,,sim_canadian-journal-of-public-health,Canadian Journal of Public Health,,1913.0,2014.0
2,Akran grubu,{{Akademik dergi kaynağı|başlık=Early adolesce...,Adolescence,sim_adolescence,1976,1976,11.0,,Early adolescence and its conflict: Group iden...,261.0,,,sim_adolescence,Adolescence,,1966.0,2009.0
3,Balilla-sınıfı denizaltı,{{Akademik dergi kaynağı|başlık=Question 12/88...,Warship International,sim_warship-international,1989,1989,,1.0,Question 12/88,95.0,,,sim_warship-international,Warship International,,1976.0,1996.0
4,Pikrik asit,{{Akademik dergi kaynağı|soyadı1=Brown|ad1=Dav...,Warship International,sim_warship-international,2001,2001,,1.0,Ammunition Explosions in World War I,58.0,,,sim_warship-international,Warship International,,1976.0,1996.0


In [16]:
#  # Limit to citations with the exact sim_id match
# df_w_url_in_sim = pd.merge(left=df_w_url, right=sim_info, how="inner", 
#                   left_on="sim_id", right_on="PubIssueID")
# print("Number of Citations with URL in SIM : " + str(df_w_url_in_sim.shape[0]))
# df_w_url_in_sim.head()

In [17]:
# # Limit to citations with the exact sim_id match
# df_in_sim = pd.merge(left = parsed_citations_df, right=sim_info, how="inner", 
#                   left_on="sim_id", right_on="PubIssueID")
# print("Number of Citations in SIM : " + str(df_in_sim.shape[0]))
# df_in_sim.head()

#### Filter out years that are not within the range of collection

In [19]:
# Filter out the journals of years that are not within the range of collection
def filter_year_range(row):
    year = row['year']
    
    first = row['First Volume']
    last = row['Last Volume']
    gaps = row['NA Gaps']
    if first != np.nan and last != np.nan:
        if year > first and year < last:
            if gaps != np.nan and gaps != "":
                gaps = str(gaps)
                gaps_list = gaps.split(";")
                for gap in gaps_list:
                    if gap.strip() == str(year):
                        return False
                return True
            return True
    return False
#     first = row['First Volume']
#     last = row['Last Volume']
#     if first != np.nan and last != np.nan:
#         if year > first and year < last:
#             return True
#     return False

In [20]:
df_wo_url_in_sim["within_yr_range"] = df_wo_url_in_sim.apply(filter_year_range, axis = 1)
df_wo_url_in_sim_yr_range = df_wo_url_in_sim[df_wo_url_in_sim["within_yr_range"]]
df_wo_url_in_sim_yr_range = df_wo_url_in_sim_yr_range.drop(columns = ['within_yr_range'])
df_wo_url_in_sim_yr_range = df_wo_url_in_sim_yr_range.reset_index(drop = True)
df_wo_url_in_sim_yr_range.head()

Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,Uzay Araştırma ve Teknolojisi Enstitüsü,{{Akademik dergi kaynağı|başlık=Results of TV ...,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39,1–2,Results of TV imaging of phobos (experiment VS...,281,,10.1016/0032-0633(91)90150-9,sim_planetary-and-space-science,Planetary and Space Science,,1959.0,2002.0
1,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,39,15,Partial solubility parameters of chlorobenzene...,2607,,10.1016/s0045-6535(99)00173-3,sim_chemosphere,Chemosphere,,1972.0,2003.0
2,İz fosili,{{akademik dergi kaynağı\n | yazar = Seilacher...,Marine Geology,sim_marine-geology,1967,1967,5,5–6,Bathymetry of trace fossils,413,,10.1016/0025-3227(67)90051-5,sim_marine-geology,Marine Geology,,1964.0,2000.0
3,1900’de Arjantin,{{Akademik dergi kaynağı |başlık=ARGENTINA. Re...,Public Health Reports,sim_public-health-reports,1900-06,1900,15,24,ARGENTINA. Republic declared free from plague,1504,,41455481,sim_public-health-reports,Public Health Reports,,1878.0,2015.0
4,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,23,8,5-HT1F receptor agonists in acute migraine tre...,776,,,sim_cephalalgia,Cephalalgia,,1989.0,2004.0


In [21]:
print("Number of Citations without URL in SIM with range of collection years: " + 
      str(df_wo_url_in_sim_yr_range.shape[0]))

Number of Citations without URL in SIM with range of collection years: 1554


In [22]:
df_wo_url_wo_doi_in_sim["within_yr_range"] = df_wo_url_wo_doi_in_sim.apply(filter_year_range, axis = 1)
df_wo_url_wo_doi_in_sim_yr_range = df_wo_url_wo_doi_in_sim[df_wo_url_in_sim["within_yr_range"]]
df_wo_url_wo_doi_in_sim_yr_range = df_wo_url_wo_doi_in_sim_yr_range.drop(columns = ['within_yr_range'])
df_wo_url_wo_doi_in_sim_yr_range = df_wo_url_wo_doi_in_sim_yr_range.reset_index(drop = True)
df_wo_url_wo_doi_in_sim_yr_range.head()

  


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,AIDS Farkındalık Haftası,{{Akademik dergi kaynağı|başlık=Update / Mise ...,Canadian Journal of Public Health,sim_canadian-journal-of-public-health,2002-01,2002,93.0,5,Update / Mise à jour,,,,sim_canadian-journal-of-public-health,Canadian Journal of Public Health,,1913.0,2014.0
1,Balilla-sınıfı denizaltı,{{Akademik dergi kaynağı|başlık=Question 12/88...,Warship International,sim_warship-international,1989,1989,,1,Question 12/88,95.0,,,sim_warship-international,Warship International,,1976.0,1996.0
2,Asar-ı Tevfik,{{Akademik dergi kaynağı\n |soyadı1=Caruana\n...,Warship International,sim_warship-international,2007,2007,,4,Question 38/43: Loss of Ottoman Gunboat ''Inti...,326.0,,,sim_warship-international,Warship International,,1976.0,1996.0
3,Asar-ı Tevfik,{{Akademik dergi kaynağı\n |soyadı=Sturton\n ...,Warship International,sim_warship-international,,0,57.0,2,"Through British Eyes: Constantinople Dockyard,...",,,,sim_warship-international,Warship International,,1976.0,1996.0
4,Bismarck'ın son muharebesi,{{Akademik dergi kaynağı|soyadı1=Garzke|ad1= W...,Warship International,sim_warship-international,1994,1994,,2,The ''Bismarck'''s Final Battle,158.0,,,sim_warship-international,Warship International,,1976.0,1996.0


In [23]:
print("Number of Citations without URL without doi in SIM with range of collection years: " + 
      str(df_wo_url_wo_doi_in_sim_yr_range.shape[0]))

Number of Citations without URL without doi in SIM with range of collection years: 458


In [24]:
df_possible_cite = df_wo_url_in_sim_yr_range[(df_wo_url_in_sim_yr_range["sim_id"] != "") &
                                            (df_wo_url_in_sim_yr_range["year"] != "") & 
                                            (df_wo_url_in_sim_yr_range["volume"] != "") & 
                                            (df_wo_url_in_sim_yr_range["page"] != "")]
print("Number of Citations without URL in SIM with range of collection years, has year, volume, page: " + 
      str(df_possible_cite.shape[0]))
df_possible_cite.head()

Number of Citations without URL in SIM with range of collection years, has year, volume, page: 1433


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,Uzay Araştırma ve Teknolojisi Enstitüsü,{{Akademik dergi kaynağı|başlık=Results of TV ...,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39,1–2,Results of TV imaging of phobos (experiment VS...,281,,10.1016/0032-0633(91)90150-9,sim_planetary-and-space-science,Planetary and Space Science,,1959.0,2002.0
1,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,39,15,Partial solubility parameters of chlorobenzene...,2607,,10.1016/s0045-6535(99)00173-3,sim_chemosphere,Chemosphere,,1972.0,2003.0
2,İz fosili,{{akademik dergi kaynağı\n | yazar = Seilacher...,Marine Geology,sim_marine-geology,1967,1967,5,5–6,Bathymetry of trace fossils,413,,10.1016/0025-3227(67)90051-5,sim_marine-geology,Marine Geology,,1964.0,2000.0
3,1900’de Arjantin,{{Akademik dergi kaynağı |başlık=ARGENTINA. Re...,Public Health Reports,sim_public-health-reports,1900-06,1900,15,24,ARGENTINA. Republic declared free from plague,1504,,41455481,sim_public-health-reports,Public Health Reports,,1878.0,2015.0
4,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,23,8,5-HT1F receptor agonists in acute migraine tre...,776,,,sim_cephalalgia,Cephalalgia,,1989.0,2004.0


In [25]:
list(df_possible_cite['c'])[0:5]

['{{Akademik dergi kaynağı|başlık=Results of TV imaging of phobos (experiment VSK-FREGAT)|yazarlar=Avanesov|sayı=1–2|sayfalar=281-295|çalışma=Planetary and Space Science|yayıncı=Elsevier|yıl=1991|cilt=39|pmid=11538495|doi=10.1016/0032-0633(91)90150-9}}',
 '{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-Guang|soyadı2=Chen|ad2=I-Ming|soyadı3=Yen|ad3=Jui-Hung|soyadı4=Wang|ad4=Yei-Shung|tarih=Aralık 1999|başlık=Partial solubility parameters of chlorobenzene and chlorophenol compounds at equilibrium distribution in two immiscible phases|dergi=Chemosphere|cilt=39|sayı=15|sayfalar=2607-2620|doi=10.1016/s0045-6535(99)00173-3|issn=0045-6535}}',
 '{{akademik dergi kaynağı\n | yazar = Seilacher, D.\n |yıl= 1967\n | başlık = Bathymetry of trace fossils\n | dergi = Marine Geology\n | cilt = 5\n | sayı = 5–6\n |doi= 10.1016/0025-3227(67)90051-5\n | sayfalar =413-428|bibcode= 1967MGeol...5..413S\n }}',
 '{{Akademik dergi kaynağı |başlık=ARGENTINA. Republic declared free from plague |author=John Hay |de

In [26]:
df_possible_cite_wo_doi = df_wo_url_wo_doi_in_sim_yr_range[(df_wo_url_wo_doi_in_sim_yr_range["sim_id"] != "") &
                                            (df_wo_url_wo_doi_in_sim_yr_range["year"] != "") & 
                                            (df_wo_url_wo_doi_in_sim_yr_range["volume"] != "") & 
                                            (df_wo_url_wo_doi_in_sim_yr_range["page"] != "")]
print("Number of Citations without URL in SIM with range of collection years, has year, volume, page: " + 
      str(df_possible_cite_wo_doi.shape[0]))
df_possible_cite_wo_doi.head()

Number of Citations without URL in SIM with range of collection years, has year, volume, page: 369


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
5,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2011-06,2011,22,8,"Focus in Honor of David Muddiman, Recipient of...",1299,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0
6,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2013-09,2013,24,11,"Focus in Honor of Josh Coon, Recipient of the ...",1621,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0
7,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus Honoring...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2017-07,2017,28,9,"Focus Honoring Dr. Kristina ""Kicki"" Håkansson,...",1739,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0
8,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus on Appli...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2018-07,2018,29,9,Focus on Application of Photons and Radicals f...,1757,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0
9,Béla Paizs,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2012,2012,23,4,"Focus in Honor of Béla Paizs, Recipient of the...",573,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0


In [23]:
# df_in_sim["within_yr_range"] = df_in_sim.apply(filter_year_range, axis = 1)
# df_in_sim_yr_range = df_in_sim[df_in_sim["within_yr_range"]]
# df_in_sim_yr_range = df_in_sim_yr_range.drop(columns = ['within_yr_range'])
# df_in_sim_yr_range = df_in_sim_yr_range.reset_index(drop = True)
# df_in_sim_yr_range.head()

In [24]:
# print("Number of Citations in SIM with range of collection years: " + 
#       str(df_in_sim_yr_range.shape[0]))

### Out of curiosity

In [28]:
name_not_match = df_possible_cite_wo_doi[df_possible_cite_wo_doi['journal'] != df_possible_cite_wo_doi['Title']]
print(name_not_match.shape)
name_not_match.head()

(48, 17)


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
29,Harden M. McConnell,{{Akademik dergi kaynağı|başlık=PEOPLE|sayı=38...,Chemical Engineering News,sim_chemical-engineering-news,1968,1968,46,38,PEOPLE,36.0,,,sim_chemical-engineering-news,Chemical & Engineering News,,1923.0,2014.0
70,Gece yeme sendromu,{{Akademik dergi kaynağı|url=|başlık=Neuroendo...,Journal of Clinical Endocrinology and Metabolism,sim_journal-of-clinical-endocrinology-and-meta...,2005,2005,9,11,Neuroendocrine profiles associated with energy...,6214.0,,,sim_journal-of-clinical-endocrinology-and-meta...,The Journal of Clinical Endocrinology and Meta...,,1941.0,1977.0
89,Güneş Sistemi'ndeki kütleçekimsel yuvarlak nes...,{{Akademik dergi kaynağı|başlık=The Origin and...,Astronomy Geophysics,sim_astronomy-geophysics,2000,2000,41,1,The Origin and Evolution of the Solar System,1.12,,,sim_astronomy-geophysics,Astronomy & Geophysics,,1980.0,1996.0
90,Güneş Sistemi'ndeki kütleçekimsel yuvarlak nes...,"{{Akademik dergi kaynağı|başlık=The size, dens...",Astronomical Journal,sim_astronomical-journal,2009,2009,139,6,"The size, density, and formation of the Orcus-...",2700.0,,,sim_astronomical-journal,The Astronomical Journal,,1849.0,1994.0
133,Kemerli burun,{{Akademik dergi kaynağı|başlık=The Common Sen...,Law Social Inquiry,sim_law-social-inquiry,2006,2006,31,2,The Common Sense of Anti-Indian Racism: Reacti...,313.0,,,sim_law-social-inquiry,Law & Social Inquiry,,1982.0,2006.0


## Work with Advanced Search

#### Concept (one case)

In [29]:
test_url_head = "https://archive.org/advancedsearch.php?q="
test_identifier = "sim_canadian-journal-of-public-health_2002"
test_url_tail = "&fl%5B%5D=identifier&callback=callback&save=yes&output=json"

In [30]:
test_url = test_url_head + test_identifier + test_url_tail
request = requests.get(test_url)
request.text

'callback({"responseHeader":{"status":0,"QTime":146,"params":{"query":"(( ( (title:\\"sim_canadian-journal-of-public-health_2002\\"^100 OR salients:\\"sim_canadian-journal-of-public-health_2002\\"^50 OR subject:\\"sim_canadian-journal-of-public-health_2002\\"^25 OR description:\\"sim_canadian-journal-of-public-health_2002\\"^15 OR collection:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR language:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR text:\\"sim_canadian-journal-of-public-health_2002\\"^1) ) AND !collection:(podcasts OR radio OR uspto))^2 OR ( ( (title:\\"sim_canadian-journal-of-public-health_2002\\"^100 OR salients:\\"sim_canadian-journal-of-public-health_2002\\"^50 OR subject:\\"sim_canadian-journal-of-public-health_2002\\"^25 OR description:\\"sim_canadian-journal-of-public-health_2002\\"^15 OR collection:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR language:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR text:\\"sim_canadian-journal-of-publ

In [31]:
info = request.text[9:-1]
info

'{"responseHeader":{"status":0,"QTime":146,"params":{"query":"(( ( (title:\\"sim_canadian-journal-of-public-health_2002\\"^100 OR salients:\\"sim_canadian-journal-of-public-health_2002\\"^50 OR subject:\\"sim_canadian-journal-of-public-health_2002\\"^25 OR description:\\"sim_canadian-journal-of-public-health_2002\\"^15 OR collection:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR language:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR text:\\"sim_canadian-journal-of-public-health_2002\\"^1) ) AND !collection:(podcasts OR radio OR uspto))^2 OR ( ( (title:\\"sim_canadian-journal-of-public-health_2002\\"^100 OR salients:\\"sim_canadian-journal-of-public-health_2002\\"^50 OR subject:\\"sim_canadian-journal-of-public-health_2002\\"^25 OR description:\\"sim_canadian-journal-of-public-health_2002\\"^15 OR collection:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR language:\\"sim_canadian-journal-of-public-health_2002\\"^10 OR text:\\"sim_canadian-journal-of-public-health

In [32]:
import json
response = json.loads(info)
response

{'responseHeader': {'status': 0,
  'QTime': 146,
  'params': {'query': '(( ( (title:"sim_canadian-journal-of-public-health_2002"^100 OR salients:"sim_canadian-journal-of-public-health_2002"^50 OR subject:"sim_canadian-journal-of-public-health_2002"^25 OR description:"sim_canadian-journal-of-public-health_2002"^15 OR collection:"sim_canadian-journal-of-public-health_2002"^10 OR language:"sim_canadian-journal-of-public-health_2002"^10 OR text:"sim_canadian-journal-of-public-health_2002"^1) ) AND !collection:(podcasts OR radio OR uspto))^2 OR ( ( (title:"sim_canadian-journal-of-public-health_2002"^100 OR salients:"sim_canadian-journal-of-public-health_2002"^50 OR subject:"sim_canadian-journal-of-public-health_2002"^25 OR description:"sim_canadian-journal-of-public-health_2002"^15 OR collection:"sim_canadian-journal-of-public-health_2002"^10 OR language:"sim_canadian-journal-of-public-health_2002"^10 OR text:"sim_canadian-journal-of-public-health_2002"^1) ) AND collection:(podcasts OR radi

In [33]:
response['response']

{'numFound': 1,
 'start': 0,
 'docs': [{'_score': 65.774864,
   'identifier': 'sim_canadian-journal-of-public-health_2002_93_index'}]}

In [34]:
response['response']['docs']

[{'_score': 65.774864,
  'identifier': 'sim_canadian-journal-of-public-health_2002_93_index'}]

### Generalize

In [56]:
def initialize_archive_session():
    with open('cookie.txt') as f:
        login_data_raw = f.read()
    
    login_data = json.loads(login_data_raw)
    
    session = requests.Session()
    
#     # Mount it for both http and https usage
#     adapter = TimeoutHTTPAdapter(timeout=10)
#     session.mount("https://", adapter)
#     session.mount("http://", adapter)
    
    # get login values through scraping
    login_url = "https://archive.org/account/login"
    res = session.get(login_url, timeout = 10)
    soup = BeautifulSoup(res.content, "html.parser")
    login_data['login'] = soup.find('input', attrs = {'name':'login'})['value']    
    
    # post cookie values to login 
    res = session.post(login_url, data = login_data, timeout = 10)
    
    return session, login_data


In [57]:
def perform_advanced_search(session, login_data, identifier):
    url_head = "https://archive.org/advancedsearch.php?q="
    url_tail = "&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=5000&page=1&output=json&callback=callback&save=yes"
    
    url = url_head + identifier + url_tail
    
    try:
        request = session.get(url, data = login_data, timeout = 10)
        request.raise_for_status()
        
        response = request.text[9:-1]
        response_json = json.loads(response)['response']

        nums = response_json['numFound']

        # if nothing is found, return empty string
        if nums == 0: 
            return ""

        # if more than one thing is found, return 
        result = dict()
        response_list = response_json['docs']
        for item in response_list:
            temp_id = item['identifier']
            temp_score = item['_score']
            result[temp_id] = temp_score

        return result
    except requests.exceptions.HTTPError as errh:
        print ("Http Error:",errh)
    except requests.exceptions.ConnectionError as errc:
        print ("Error Connecting:",errc)
    except requests.exceptions.Timeout as errt:
        print ("Timeout Error:",errt)
    except requests.exceptions.RequestException as err:
        print ("OOps: Something Else",err)
    
    return ""

In [58]:
# For example, there can be no response
session, login_data = initialize_archive_session()
perform_advanced_search(session, login_data, "sim_marine-geology")

''

In [39]:
# For example, there can be multiple response
perform_advanced_search(session, login_data, "sim_canadian-journal-of-public-health_1994_85")

{'sim_canadian-journal-of-public-health_november-december-1993_84_6': 1275.6439,
 'sim_canadian-journal-of-public-health_january-february-1994_85_1': 1268.1522,
 'sim_canadian-journal-of-public-health_1994_85_index': 79.35465}

Directly Using URL generation, something like https://archive.org/details/sim_bioscience_1996_46_index would be deemed bad when in fact, it is just in a different string format

In [40]:
# For example, there can be a lot of responses
ex_multiple = perform_advanced_search(session, login_data, "sim_bioscience_1996_46")
ex_multiple

{'sim_bioscience_1996-05_46_5': 1442.0604,
 'sim_bioscience_1996-02_46_2': 1438.8639,
 'sim_bioscience_1996-01_46_1': 1432.8804,
 'sim_bioscience_1996-11_46_10': 1429.0781,
 'sim_bioscience_1996-09_46_8': 1423.1581,
 'sim_bioscience_1996-10_46_9': 1420.214,
 'sim_bioscience_1996-04_46_4': 1418.4635,
 'sim_bioscience_july-august-1996_46_7': 1414.2814,
 'sim_bioscience_1996-06_46_6': 1412.6345,
 'sim_bioscience_1996-03_46_3': 1408.5902,
 'sim_bioscience_january-december-1996_46_index': 1359.7362,
 'sim_bioscience_1996-12_46_11': 1358.7174,
 'sim_bioscience_1995-12_45_11': 1255.2961,
 'sim_bioscience_january-december-1997_47_index': 1246.396}

In [41]:
def find_best_identifier_of_multiple(d):
    keys = list(d.keys())
    vals = list(d.values())
    index_of_max = vals.index(max(vals))
    return keys[index_of_max]

In [42]:
find_best_identifier_of_multiple(ex_multiple)

'sim_bioscience_1996-05_46_5'

### Generate Identifiers

In [43]:
### Different functions for generating ids
def generate_id_journal_year(cite_info):
    identifier = cite_info['sim_id'] 
    if cite_info['year'] != 0:
        identifier = identifier + "_" + str(cite_info['year'])
    return identifier

def generate_id_journal_date(cite_info):
    identifier = cite_info['sim_id'] 
    if cite_info['date'] != "":
        identifier = identifier + "_" + str(cite_info['date'])
    return identifier

    
def generate_id_journal_year_volume(cite_info):
    identifier = generate_id_journal_year(cite_info)
    if cite_info['volume'] != "":
        identifier = identifier + "_" + cite_info['volume']
    return identifier

def generate_id_journal_date_volume(cite_info):
    identifier = generate_id_journal_date(cite_info)
    if cite_info['volume'] != "":
        identifier = identifier + "_" + cite_info['volume']
    return identifier


def generate_id_journal_year_volume_issue(cite_info):
    identifier = generate_id_journal_year_volume(cite_info)
    if cite_info['issue'] != "":
        identifier = identifier + "_" + cite_info['issue']
    return identifier

def generate_id_journal_date_volume_issue(cite_info):
    identifier = generate_id_journal_date_volume(cite_info)
    if cite_info['issue'] != "":
        identifier = identifier + "_" + cite_info['issue']
    return identifier

In [44]:
df_possible_cite.head()

Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,Uzay Araştırma ve Teknolojisi Enstitüsü,{{Akademik dergi kaynağı|başlık=Results of TV ...,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39,1–2,Results of TV imaging of phobos (experiment VS...,281,,10.1016/0032-0633(91)90150-9,sim_planetary-and-space-science,Planetary and Space Science,,1959.0,2002.0
1,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,39,15,Partial solubility parameters of chlorobenzene...,2607,,10.1016/s0045-6535(99)00173-3,sim_chemosphere,Chemosphere,,1972.0,2003.0
2,İz fosili,{{akademik dergi kaynağı\n | yazar = Seilacher...,Marine Geology,sim_marine-geology,1967,1967,5,5–6,Bathymetry of trace fossils,413,,10.1016/0025-3227(67)90051-5,sim_marine-geology,Marine Geology,,1964.0,2000.0
3,1900’de Arjantin,{{Akademik dergi kaynağı |başlık=ARGENTINA. Re...,Public Health Reports,sim_public-health-reports,1900-06,1900,15,24,ARGENTINA. Republic declared free from plague,1504,,41455481,sim_public-health-reports,Public Health Reports,,1878.0,2015.0
4,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,23,8,5-HT1F receptor agonists in acute migraine tre...,776,,,sim_cephalalgia,Cephalalgia,,1989.0,2004.0


In [45]:
def generate_ids(cite_info):
    
    # check have minimum set of info
    has_2_yr = False
    has_3_yr = False
    has_4_yr = False
    has_2_date = False
    has_3_date = False
    has_4_date = False

    if cite_info["sim_id"] != "" and cite_info["volume"] != "":

        if cite_info['year'] != "":
            has_3_yr = True
            has_2_yr = True
            if cite_info["issue"]:
                has_4_yr = True
        if cite_info['date'] != "":
            has_3_date = True
            has_2_date = True
            if cite_info["issue"]:
                has_4_date = True
    
    gen_ids = []
    if has_2_date:
        gen_ids.append(generate_id_journal_date(cite_info))

    if has_2_yr:
        gen_ids.append(generate_id_journal_year(cite_info))

    if has_3_date: 
        gen_ids.append(generate_id_journal_date_volume(cite_info))

    if has_3_yr:
        gen_ids.append(generate_id_journal_year_volume(cite_info))

    if has_4_date:
        gen_ids.append(generate_id_journal_date_volume_issue(cite_info))

    if has_4_yr:
        gen_ids.append(generate_id_journal_year_volume_issue(cite_info))
        
    return gen_ids


In [46]:
# df_possible_cite['generated_ids'] = df_possible_cite.apply(generate_ids, axis = 1)
# df_possible_cite.head()

In [47]:
# df_possible_cite.loc[0, 'generated_ids']

### Different Methods for finding identifier from generated identifiers 

In [44]:
# def find_highest_score_from_generated_id(identifier):
#     result = perform_advanced_search(identifier)
#     if type(result) is str:
#         return result
#     else:
#         return find_best_identifier_of_multiple(result)

In [48]:
def find_close_match_from_cite_info(cite_info, search_result, verbose = False):
    id_list = list(search_result.keys())
#     identifier_list = identifier.split("_")

#     print(cite_info)
    
    close_matches = dict()
    for possible_id in id_list:
        possible_id_list = possible_id.split("_")
        
        
        if verbose: print(possible_id_list)
        
        if len(possible_id_list) < 4:
            continue;
        
        if cite_info["issue"] != "" and len(possible_id_list) >= 5:
            
            if verbose: print("longer than 5")
            
            # Check that it is sim
            if possible_id_list[0] == "sim":
                
                # Check that journal name matches
                if possible_id_list[1] == cite_info['sim_id'][4:]:
                    
                    # Check that year/date is within other case
                    if str(cite_info["year"]) in possible_id_list[2]:
                        
                        # Check that journal volume matches
                        if possible_id_list[3] == cite_info["volume"]:
                            
                            # Check that journal issue matches
                            if possible_id_list[4] == cite_info["issue"]:
                                close_matches[possible_id] = search_result[possible_id]
                                continue;
                                
                            if verbose: print("Not the right issue")
                            continue;
                            
                        if verbose: print("Not the right volume")
                        continue;
                        
                    if verbose: print("Not the right year")
                    continue;
                    
                if verbose: print("Possible id journal name not exact match")
                continue;
                
            if verbose: print("Possible id is not in sim")
            continue;
            
        if len(possible_id_list) == 4:
            
            if verbose: print("equal to 4")
            
            # Check that it is sim
            if possible_id_list[0] == "sim":
                
                # Check that journal name matches
                if possible_id_list[1] == cite_info['sim_id'][4:]:
                    
                    # Check that year/date is within other case
                    if str(cite_info["year"]) in possible_id_list[2]:
                        
                        # Check that journal volume matches
                        if possible_id_list[3] == cite_info["volume"]:
                            
                            close_matches[possible_id] = search_result[possible_id]
                            continue;
                            
                        if verbose: print("Not the right volume")
                        continue;
                        
                    if verbose: print("Not the right year")
                    continue;
                    
                if verbose: print("Possible id journal name not exact match")
                continue;
                            
            if verbose: print("Possible id is not in sim")
            continue;
            
    if verbose: 
        print('close matches: ')
        print(close_matches)
                    
    if not close_matches:
        return ""
    
    close_matches_keys = list(close_matches.keys())
    close_matches_values = list(close_matches.values())
    
    # if 1 close match, return it
    if len(close_matches_keys) == 1:
        return close_matches_keys[0]
    
    
    # if multiple, return the one with the highest score
    index_of_max = close_matches_values.index(max(close_matches_values))
    return close_matches_keys[index_of_max]
    

In [52]:
def generate_url_actual(identifier):
    return "https://archive.org/details/" + identifier

In [50]:
def find_actual_identifier(cite_info):
    # generate a id
    gen_id = generate_id_journal_year(cite_info)
            
    # initialize session
    session, login_data = initialize_archive_session()
            
    # find all entries with this journal name
    search_result = perform_advanced_search(session, login_data, gen_id)
            
    if search_result != "":

        # find close match on generated id
        real_id = find_close_match_from_cite_info(cite_info, search_result)
                
        if real_id != "":

            # new citation
            url = generate_url_actual(real_id)
            print("url: " + url)
            return url

        print("No close match exist for generated ids. The id is: ")
        return ""

    print("There's no search result for the id: " + gen_id)
    return ""

In [59]:
possible_url_wo_doi_lst = df_possible_cite_wo_doi.apply(find_actual_identifier, axis = 1)
# possible_url_wo_doi_lst

There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2011
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2013
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2017
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2018
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2012
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2004
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2006
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2013
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2013
There's no search result for the id: sim_journal-of-the-american-society-for-mass-spectrometry_2013


url: https://archive.org/details/sim_nature-biotechnology_2001-03_19_3
There's no search result for the id: sim_nature-biotechnology_2018
There's no search result for the id: sim_journal-of-personality-and-social-psychology_1993
There's no search result for the id: sim_journal-of-personality-and-social-psychology_2003
There's no search result for the id: sim_journal-of-personality-and-social-psychology_2001
There's no search result for the id: sim_journal-of-personality-and-social-psychology_1996
There's no search result for the id: sim_journal-of-personality-and-social-psychology_1987
There's no search result for the id: sim_journal-of-personality-and-social-psychology_1995
There's no search result for the id: sim_personality-and-individual-differences_2012
There's no search result for the id: sim_personality-and-individual-differences_2009
There's no search result for the id: sim_evolution-and-human-behavior_2002
There's no search result for the id: sim_human-reproduction_2010
No clo

There's no search result for the id: sim_medical-care_2008
There's no search result for the id: sim_journal-of-physical-chemistry_1993
url: https://archive.org/details/sim_biophysical-journal_1993-03_64_3
There's no search result for the id: sim_vision-research_2020
There's no search result for the id: sim_western-journal-of-nursing-research_2017
url: https://archive.org/details/sim_neuropsychology-review_1992-06_3_2
url: https://archive.org/details/sim_current-directions-in-psychological-science_2006-02_15_1
url: https://archive.org/details/sim_canadian-journal-of-microbiology_2009-05_55_5
url: https://archive.org/details/sim_antimicrobial-agents-and-chemotherapy_2005-06_49_6
There's no search result for the id: sim_chemistry-letters_2005
url: https://archive.org/details/sim_virginia-magazine-of-history-and-biography_1993-04_101_2
There's no search result for the id: sim_texas-studies-in-literature-and-language_2014
url: https://archive.org/details/sim_behavioral-and-brain-sciences_20

There's no search result for the id: sim_journal-of-pediatric-and-adolescent-gynecology_2015
There's no search result for the id: sim_journal-of-personality_2017
There's no search result for the id: sim_journal-of-health-and-social-behavior_1998
There's no search result for the id: sim_review-of-research-in-education_2016
There's no search result for the id: sim_journal-of-paleontology_1977
There's no search result for the id: sim_mankind-quarterly
url: https://archive.org/details/sim_australian-journal-of-international-affairs_2008-09_62_3
No close match exist for ids.


In [60]:
possible_url_wo_doi_lst

5                                                       
6                                                       
7                                                       
8                                                       
9                                                       
                             ...                        
452                                                     
453                                                     
454                                                     
456    https://archive.org/details/sim_australian-jou...
457                                                     
Length: 369, dtype: object

In [61]:
df_possible_cite_wo_doi["generated_url"] = possible_url_wo_doi_lst
df_possible_cite_wo_doi.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_url
5,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2011-06,2011,22,8,"Focus in Honor of David Muddiman, Recipient of...",1299,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0,
6,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2013-09,2013,24,11,"Focus in Honor of Josh Coon, Recipient of the ...",1621,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0,
7,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus Honoring...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2017-07,2017,28,9,"Focus Honoring Dr. Kristina ""Kicki"" Håkansson,...",1739,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0,
8,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus on Appli...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2018-07,2018,29,9,Focus on Application of Photons and Radicals f...,1757,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0,
9,Béla Paizs,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,sim_journal-of-the-american-society-for-mass-s...,2012,2012,23,4,"Focus in Honor of Béla Paizs, Recipient of the...",573,,,sim_journal-of-the-american-society-for-mass-s...,Journal of the American Society for Mass Spect...,,1990.0,1993.0,


In [63]:
df_possible_cite_wo_doi_good = df_possible_cite_wo_doi[df_possible_cite_wo_doi["generated_url"] != ""]
print("There are " + str(df_possible_cite_wo_doi_good.shape[0]) + " good urls out of " + str(df_possible_cite_wo_doi.shape[0]))
df_possible_cite_wo_doi_good.head()

There are 92 good urls out of 369


Unnamed: 0,a,c,journal,sim_id,date,year,volume,issue,title,page,url,doi,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_url
40,Flaş bellek (psikoloji),{{Akademik dergi kaynağı|başlık=Autobiographic...,Applied Cognitive Psychology,sim_applied-cognitive-psychology,2010-02,2010,24,2,Autobiographical and event memories for surpri...,177,,,sim_applied-cognitive-psychology,Applied Cognitive Psychology,,1987.0,2010.0,https://archive.org/details/sim_applied-cognit...
42,Flaş bellek (psikoloji),{{Akademik dergi kaynağı|başlık=The role of im...,Applied Cognitive Psychology,sim_applied-cognitive-psychology,2009-02,2009,23,2,The role of importance/consequentiality apprai...,236,,,sim_applied-cognitive-psychology,Applied Cognitive Psychology,,1987.0,2010.0,https://archive.org/details/sim_applied-cognit...
43,Flaş bellek (psikoloji),{{Akademik dergi kaynağı|başlık=The role of im...,Applied Cognitive Psychology,sim_applied-cognitive-psychology,2009-02,2009,23,2,The role of importance/consequentiality apprai...,236,,,sim_applied-cognitive-psychology,Applied Cognitive Psychology,,1987.0,2010.0,https://archive.org/details/sim_applied-cognit...
44,Flaş bellek (psikoloji),{{Akademik dergi kaynağı|başlık=The effects of...,Applied Cognitive Psychology,sim_applied-cognitive-psychology,2007-12,2007,21,8,The effects of affect and input source on flas...,1023,,,sim_applied-cognitive-psychology,Applied Cognitive Psychology,,1987.0,2010.0,https://archive.org/details/sim_applied-cognit...
45,Flaş bellek (psikoloji),{{Akademik dergi kaynağı|başlık=FLashbulb memo...,Applied Cognitive Psychology,sim_applied-cognitive-psychology,2009,2009,23,5,FLashbulb memory for 11 September 2001,605,,,sim_applied-cognitive-psychology,Applied Cognitive Psychology,,1987.0,2010.0,https://archive.org/details/sim_applied-cognit...


In [66]:
df_possible_cite_wo_doi_bad = df_possible_cite_wo_doi[df_possible_cite_wo_doi["generated_url"] == ""]
print("There are " + str(df_possible_cite_wo_doi_bad.shape[0]) + " good urls out of " + str(df_possible_cite_wo_doi.shape[0]))
df_possible_cite_wo_doi_bad['sim_id'].value_counts()

There are 277 good urls out of 369


sim_journal-of-the-american-society-for-mass-spectrometry    17
sim_psychological-review                                      8
sim_psychological-science                                     6
sim_cell                                                      6
sim_memory-cognition                                          6
                                                             ..
sim_strategies                                                1
sim_trends-in-genetics                                        1
sim_nature-biotechnology                                      1
sim_american-journal-of-physical-anthropology                 1
sim_journal-of-semitic-studies                                1
Name: sim_id, Length: 159, dtype: int64

In [68]:
df_cite_wo_doi = df_possible_cite_wo_doi[['a', 'c', 'journal', 'year', 'volume', 'issue', 'title', 'page', 'generated_url']]
df_cite_wo_doi.head()

Unnamed: 0,a,c,journal,year,volume,issue,title,page,generated_url
5,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,2011,22,8,"Focus in Honor of David Muddiman, Recipient of...",1299,
6,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,2013,24,11,"Focus in Honor of Josh Coon, Recipient of the ...",1621,
7,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus Honoring...,Journal of the American Society for Mass Spect...,2017,28,9,"Focus Honoring Dr. Kristina ""Kicki"" Håkansson,...",1739,
8,Biemann Madalyası,{{Akademik dergi kaynağı|başlık=Focus on Appli...,Journal of the American Society for Mass Spect...,2018,29,9,Focus on Application of Photons and Radicals f...,1757,
9,Béla Paizs,{{Akademik dergi kaynağı|başlık=Focus in Honor...,Journal of the American Society for Mass Spect...,2012,23,4,"Focus in Honor of Béla Paizs, Recipient of the...",573,


In [46]:
# ### dumb attempt
# def find_actual_identifier(cite_info):
   
    
# #     print(login_data)
# #     count_global += 1
#     gen_ids = cite_info['generated_ids']

#     # we want to check the 4 field (more exact) possibilities first
#     for gen_id in reversed(gen_ids):

#         # perform advanced search on generated identifier
#         session, login_data = initialize_archive_session()
#         search_result = perform_advanced_search(session, login_data, gen_id)

# #       print("Search Result: ")
# #       print(search_result)

#         if search_result != "":


#             # find close match on generated id
#             real_id = find_close_match_from_generated_id(cite_info, search_result)

# #           print("Actual id: " + real_id)

#             if real_id != "":

#                 # actual id
#                 return real_id
            
# #     if count_global % 10 == 0:
# #         print(count_global)
            
#     return ""

In [49]:
# %%time
# cite_info_test = df_possible_cite.loc[2]
# print(cite_info_test)
# find_actual_identifier(cite_info_test)

## Sample 
- since full dataframe takes a while

In [54]:
sample = df_possible_cite.sample(n = 50)
print(sample.shape)
sample.head()

(50, 18)


Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids
960,Şarabın sağlığa etkileri,{{Akademik dergi kaynağı|başlık=Alcohol and ca...,Mayo Clinic Proceedings,sim_mayo-clinic-proceedings,2014-03,2014,3,89,3,Alcohol and cardiovascular health: the dose ma...,382,,sim_mayo-clinic-proceedings,Mayo Clinic Proceedings,,1964.0,2015.0,"[sim_mayo-clinic-proceedings_2014-03, sim_mayo..."
1072,Üre nefes testi,{{Akademik dergi kaynağı|başlık=Current concep...,Gut,sim_gut,2007,2007,0,56,6,Current concepts in the management of Helicoba...,772,,sim_gut,Gut,,1960.0,2008.0,"[sim_gut_2007, sim_gut_2007, sim_gut_2007_56, ..."
1474,Paul Milgrom,{{Akademik dergi kaynağı|başlık=A Convergence ...,Econometrica,sim_econometrica,1979,1979,0,47,3,A Convergence Theorem for Competitive Bidding ...,679,,sim_econometrica,Econometrica,,1933.0,2015.0,"[sim_econometrica_1979, sim_econometrica_1979,..."
849,Bedenlenmiş Biliş,{{Akademik dergi kaynağı|başlık=From Firm Musc...,Journal of Consumer Research,sim_journal-of-consumer-research,2011,2011,0,37,6,From Firm Muscles to Firm Willpower: Understan...,1046,,sim_journal-of-consumer-research,Journal of Consumer Research,,1974.0,2015.0,"[sim_journal-of-consumer-research_2011, sim_jo..."
572,Otar Lortkipanidze,{{Akademik dergi kaynağı|başlık=Otar Lortkipan...,American Journal of Archaeology,sim_american-journal-of-archaeology,2003-01,2003,1,107,1,"Otar Lortkipanidze, 1930–2002",105,,sim_american-journal-of-archaeology,American Journal of Archaeology,,1885.0,2015.0,"[sim_american-journal-of-archaeology_2003-01, ..."


In [27]:
sample["identifier_actual"] = sample.apply(find_actual_identifier, axis = 1)
sample.head()

In [75]:
sample.head()

Unnamed: 0,a,c,journal_name,sim_id,date,year,month,volume,issue,article,page,url,PubIssueID,Title,First Volume,Last Volume,identifier_in,identifier_exact
401,2. Dünya Savaşı Sovyetler Birliği silahları li...,{{Akademik dergi kaynağı|başlık=When Titans Cl...,Foreign Affairs,sim_foreign-affairs,1995,1995,0,75.0,3.0,When Titans Clashed: How the Red Army Stopped ...,,,sim_foreign-affairs,Foreign Affairs,1922.0,2016.0,sim_foreign-affairs_1995_75_3,
1635,Nebivolol,"{{Akademik dergi kaynağı|başlık=Nebivolol, a v...",Journal of the American College of Cardiology,sim_journal-of-the-american-college-of-cardiology,2009-04,2009,4,53.0,17.0,"Nebivolol, a vasodilating selective beta(1)-bl...",1532.0,,sim_journal-of-the-american-college-of-cardiology,Journal of the American College of Cardiology,1983.0,2014.0,sim_journal-of-the-american-college-of-cardiol...,
1577,Embraer,{{Akademik dergi kaynağı|başlık=Embraer in Chi...,Aviation Week Space Technology,sim_aviation-week-space-technology,2013-10,2013,10,,,Embraer in China,60.0,,sim_aviation-week-space-technology,Aviation Week & Space Technology,1916.0,2016.0,sim_aviation-week-space-technology_2013,
1351,Valz Ödülü,{{Akademik dergi kaynağı|url=https://books.goo...,Popular Astronomy,sim_popular-astronomy,1913,1913,0,21.0,,The Valz Prize,,https://books.google.com/books?id,sim_popular-astronomy,Popular Astronomy,1893.0,1951.0,sim_popular-astronomy_1913_21,
2174,Rüzgâr türbini tasarımı,{{Akademik dergi kaynağı|başlık=Alternative Co...,Journal of Solar Energy Engineering,sim_journal-of-solar-energy-engineering,2003,2003,0,125.0,4.0,Alternative Composite Materials for Megawatt-S...,515.0,,sim_journal-of-solar-energy-engineering,Journal of Solar Energy Engineering,1980.0,2011.0,sim_journal-of-solar-energy-engineering_2003_1...,


In [55]:
# actual_ids = sample['identifier_in'].apply(find_real_identifier_from_generated)
# sample['identifier_out'] = actual_ids
# sample.head()

In [76]:
sample_wo_id = sample[sample['identifier_actual'] == ""]
print(sample_wo_id.shape)
sample_wo_id.head()

(100, 18)


Unnamed: 0,a,c,journal_name,sim_id,date,year,month,volume,issue,article,page,url,PubIssueID,Title,First Volume,Last Volume,identifier_in,identifier_exact
401,2. Dünya Savaşı Sovyetler Birliği silahları li...,{{Akademik dergi kaynağı|başlık=When Titans Cl...,Foreign Affairs,sim_foreign-affairs,1995,1995,0,75.0,3.0,When Titans Clashed: How the Red Army Stopped ...,,,sim_foreign-affairs,Foreign Affairs,1922.0,2016.0,sim_foreign-affairs_1995_75_3,
1635,Nebivolol,"{{Akademik dergi kaynağı|başlık=Nebivolol, a v...",Journal of the American College of Cardiology,sim_journal-of-the-american-college-of-cardiology,2009-04,2009,4,53.0,17.0,"Nebivolol, a vasodilating selective beta(1)-bl...",1532.0,,sim_journal-of-the-american-college-of-cardiology,Journal of the American College of Cardiology,1983.0,2014.0,sim_journal-of-the-american-college-of-cardiol...,
1577,Embraer,{{Akademik dergi kaynağı|başlık=Embraer in Chi...,Aviation Week Space Technology,sim_aviation-week-space-technology,2013-10,2013,10,,,Embraer in China,60.0,,sim_aviation-week-space-technology,Aviation Week & Space Technology,1916.0,2016.0,sim_aviation-week-space-technology_2013,
1351,Valz Ödülü,{{Akademik dergi kaynağı|url=https://books.goo...,Popular Astronomy,sim_popular-astronomy,1913,1913,0,21.0,,The Valz Prize,,https://books.google.com/books?id,sim_popular-astronomy,Popular Astronomy,1893.0,1951.0,sim_popular-astronomy_1913_21,
2174,Rüzgâr türbini tasarımı,{{Akademik dergi kaynağı|başlık=Alternative Co...,Journal of Solar Energy Engineering,sim_journal-of-solar-energy-engineering,2003,2003,0,125.0,4.0,Alternative Composite Materials for Megawatt-S...,515.0,,sim_journal-of-solar-energy-engineering,Journal of Solar Energy Engineering,1980.0,2011.0,sim_journal-of-solar-energy-engineering_2003_1...,


In [77]:
sample_w_id = sample[sample['identifier_actual'] != ""]
print(sample_w_id.shape)
sample_w_id.head()

(100, 18)


Unnamed: 0,a,c,journal_name,sim_id,date,year,month,volume,issue,article,page,url,PubIssueID,Title,First Volume,Last Volume,identifier_in,identifier_exact


### Find urls from actual identifiers 

In [129]:
def generate_url_actual(identifier):
    return "https://archive.org/details/" + identifier

In [130]:
sample_w_id["generated_url"] = sample_w_id["identifier_actual"].apply(generate_url_actual)
sample_w_id.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,journal_name,sim_id,date,year,month,volume,issue,page,url,PubIssueID,Title,First Volume,Last Volume,identifier_in,identifier_out
662,Billboard,sim_billboard,1990-12,1990,12,,,Y-50,https://archive.org/details/VOA_Africa_2018010...,sim_billboard,Billboard,1894.0,2016.0,sim_billboard_1990,VOA_Africa_20180105_150000
1072,Journal of Deaf Studies and Deaf Education,sim_journal-of-deaf-studies-and-deaf-education,2004,2004,0,9.0,2.0,239,https://archive.org/details/sim_journal-of-dea...,sim_journal-of-deaf-studies-and-deaf-education,Journal of Deaf Studies and Deaf Education,1999.0,2007.0,sim_journal-of-deaf-studies-and-deaf-education...,sim_journal-of-deaf-studies-and-deaf-education...
180,Science,sim_science,1934,1934,0,80.0,2084.0,512,https://archive.org/details/sim_science_1934-1...,sim_science,Science,1883.0,2016.0,sim_science_1934_80,sim_science_1934-11-16_80_2081
1060,Neurology,sim_neurology,2008-04,2008,4,70.0,18.0,1630,https://archive.org/details/sim_jama-psychiatr...,sim_neurology,Neurology,1951.0,2012.0,sim_neurology_2008_70,sim_jama-psychiatry_2008-06_65_6


#### No need to do this since advance search generated results must be good

In [131]:

# def test_url_exist(url):
#     request = requests.get(url)
#     if request.status_code == 200:
#         return True
#     else:
#         return False

In [133]:
# sample_w_id["url_exists"] = sample_w_id["url"].apply(test_url_exist)
# sample_w_id.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,journal_name,sim_id,date,year,month,volume,issue,page,url,PubIssueID,Title,First Volume,Last Volume,identifier_in,identifier_out,url_exists
662,Billboard,sim_billboard,1990-12,1990,12,,,Y-50,https://archive.org/details/VOA_Africa_2018010...,sim_billboard,Billboard,1894.0,2016.0,sim_billboard_1990,VOA_Africa_20180105_150000,True
1072,Journal of Deaf Studies and Deaf Education,sim_journal-of-deaf-studies-and-deaf-education,2004,2004,0,9.0,2.0,239,https://archive.org/details/sim_journal-of-dea...,sim_journal-of-deaf-studies-and-deaf-education,Journal of Deaf Studies and Deaf Education,1999.0,2007.0,sim_journal-of-deaf-studies-and-deaf-education...,sim_journal-of-deaf-studies-and-deaf-education...,True
180,Science,sim_science,1934,1934,0,80.0,2084.0,512,https://archive.org/details/sim_science_1934-1...,sim_science,Science,1883.0,2016.0,sim_science_1934_80,sim_science_1934-11-16_80_2081,True
1060,Neurology,sim_neurology,2008-04,2008,4,70.0,18.0,1630,https://archive.org/details/sim_jama-psychiatr...,sim_neurology,Neurology,1951.0,2012.0,sim_neurology_2008_70,sim_jama-psychiatry_2008-06_65_6,True


## Add URL to citation

In [81]:
# perform_advanced_search("sim_review_2004_86")

In [83]:
def generate_new_citation(citation, url):
    url_field = "| url=" + url
    return citation[:-2] + url_field + "}}"

In [88]:
test = perform_advanced_search("sim_applied-cognitive-psychology_2007_21")
test

{'sim_applied-cognitive-psychology_2007_21_index-contents': 75.87896}

In [89]:
test2 = perform_advanced_search("sim_american-journal-of-archaeology_2000-10_104_4")
test2

{'sim_american-journal-of-archaeology_2001_105_contents': 1486.7208}

### Write Data to Google Sheet

In [69]:
import gspread
from gspread_dataframe import set_with_dataframe

In [70]:
gc = gspread.service_account("service_account.json")

sh = gc.open("Citations SIM Test")

print(sh.sheet1.get('A1'))

[['a']]


In [72]:
# ACCESS GOOGLE SHEET
gc = gspread.service_account(filename='service_account.json')
sh = gc.open_by_key('1ih5bIk5_d5WLEtArRzEFPzPlwZVA_BI-O8kSxWH1rNU')
worksheet = sh.get_worksheet(2) #-> 0 - first sheet, 1 - second sheet etc. 

# APPEND DATA TO SHEET
df_to_write = df_cite_wo_doi
set_with_dataframe(worksheet, df_to_write) #-> THIS EXPORTS YOUR DATAFRAME TO THE GOOGLE SHEET