# URL generation with Archive.org Patterns

In [1]:
import numpy as np 
import pandas as pd
import re
import requests
import datetime
import dateparser



### Load SIM Information

In [3]:
# Load SIM information processed by SIM Metadata Parsing.ipynb
sim_info = pd.read_csv("SIM_info.csv")
sim_info.head()

Unnamed: 0,Title,First Volume,Last Volume,Subjects,PubIssueID
0,American Journal of Pharmacy and the Sciences ...,1952.0,1995.0,Health & Medical Sciences,sim_american-journal-of-pharmacy-and-the-sciences
1,National Real Estate and Building Journal,1949.0,1956.0,Building & Construction,sim_national-real-estate-and-building-journal
2,The American Naturalist,1872.0,2015.0,Biology,sim_american-naturalist
3,Alcatel Telecommunications Review,1922.0,2002.0,Communication & Information Sciences,sim_alcatel-telecommunications-review
4,The American Journal of Gastroenterology,1949.0,2011.0,Medical Sciences--Gastroenterology,sim_american-journal-of-gastroenterology


#### Data Cleaning for SIM

In [4]:
# For example, there are overlapping ids
sim_info[sim_info["PubIssueID"] == "sim_science"]

Unnamed: 0,Title,First Volume,Last Volume,Subjects,PubIssueID
1007,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
9855,Science,1979.0,1986.0,Mathematical & Physical Sciences,sim_science


In [5]:
# Concretely show these overlaps
print("SIM Id/title count: " + str(sim_info["PubIssueID"].shape[0]))
print("Unique SIM Id count: " + str(len(sim_info["PubIssueID"].unique())))
print("Unique title count: " + str(len(sim_info["Title"].unique())))

SIM Id/title count: 15119
Unique SIM Id count: 14913
Unique title count: 14797


In [6]:
# Only keep necessary columns
sim_info_concise = sim_info.drop(columns = ["Title", "Subjects"])
sim_info_concise.head()

Unnamed: 0,First Volume,Last Volume,PubIssueID
0,1952.0,1995.0,sim_american-journal-of-pharmacy-and-the-sciences
1,1949.0,1956.0,sim_national-real-estate-and-building-journal
2,1872.0,2015.0,sim_american-naturalist
3,1922.0,2002.0,sim_alcatel-telecommunications-review
4,1949.0,2011.0,sim_american-journal-of-gastroenterology


In [7]:
# Aggregate Pub issue IDs such that the maximum range is included
sim_info.groupby(["PubIssueID"]).agg({'First Volume': 'min', 'Last Volume': 'max',})

Unnamed: 0_level_0,First Volume,Last Volume
PubIssueID,Unnamed: 1_level_1,Unnamed: 2_level_1
sim _annales-academiae-scientiarum-fennicae,,
sim-anatomia-clinica,1978.0,1981.0
sim_-,1826.0,1826.0
sim_1001-home-ideas,1986.0,1991.0
sim_102-monitor,1975.0,1981.0
...,...,...
sim_zoo-biology,1999.0,2000.0
sim_zoologica-scripta,1998.0,2001.0
sim_zoomorphology,1981.0,1996.0
sim_zvezda,1961.0,1974.0


### Load Wiki Dump Data

### English Journal

In [8]:
# #
# # load wikipedia citation data from xcite dump 
# journal_dump = pd.read_json("en.wikipedia.org.journal.20210606.json", lines = True)
# journal_dump.head()

Unnamed: 0,a,c
0,0,{{cite journal |first=R. W. |last=Bemer |title...
1,020413 DOJ White Paper,{{cite document|ssrn=1332096 |title=Combatants...
2,"10,000 Bullets","{{cite journal | author=Nowakowski, Kasper | a..."
3,"10,000 Bullets",{{cite journal | author=''Official UK PlayStat...
4,1000 Plant Genomes Project,"{{cite journal | vauthors = Matasci N, Hung LH..."


### Turkish Journal

In [10]:
journal_dump = pd.read_json("tr.wikipedia.org.journal.20210614.json.gz", lines = True)
journal_dump.head()

Unnamed: 0,a,c
0,1 + 2 + 3 + 4 + · · ·,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...
1,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...
2,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...
3,141 Likya depremi,{{Akademik dergi kaynağı | url=http://blackmed...
4,141 Likya depremi,{{Akademik dergi kaynağı | url=http://www.nat-...


In [12]:
# For example, 'Akademik dergi kaynağı' is turkish for citation
def find_cite_journal_and_alias(citation):
    alias = ["Cite Journal", "Akademik dergi kaynağı", "Cite document", "Cite science", "Cite journal"]
    for name in alias:
        if name in citation:
            return True
    return False

In [13]:
# filter columns such that we are only working with journals 
journal_dump["is_journal"] = journal_dump["c"].apply(find_cite_journal_and_alias)
print(journal_dump.shape)
journal_only = journal_dump[journal_dump["is_journal"]]
journal_only = journal_only.drop(columns = {"is_journal"})
print(journal_only.shape)
journal_only.head()

(14624, 3)
(14310, 2)


Unnamed: 0,a,c
0,1 + 2 + 3 + 4 + · · ·,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...
1,12 Victoria,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...
2,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...
3,141 Likya depremi,{{Akademik dergi kaynağı | url=http://blackmed...
4,141 Likya depremi,{{Akademik dergi kaynağı | url=http://www.nat-...


## Sampling Data


### EN wiki dump 
*since it takes too long to run it against 2 million

In [94]:
# # can change up n to get different results
# sample_df = journal_only.sample(n = 50000)
# sample_df.head()

Unnamed: 0,a,c
672231,List of Sufi saints,{{cite journal|author=Tasadduq Husain|date=Jul...
2558012,Thrombophilia,"{{cite journal |vauthors=Crowther MA, Kelton J..."
1867923,Evolutionary physiology,{{cite journal|last=Chown|first=S. L.|author2=...
1624028,Promised Land (2012 film),{{cite journal | last=McNary | first=Dave | ur...
422209,Weld tests for friction welding,{{cite journal|last1=Lacki|first1=P.|last2=Kuc...


### Turkish wiki dump
* only has 14000 citations so very short

In [39]:
sample_df = journal_only

## Parsing Citation 
Combined and cleaned version of testing in the Wikipedia Journal Citation Dump Parsing Test notebook

In [40]:
# Parsing a wikipedia citation data
def parse_citation_data(citation):
    ### change this part for different alias for different languages 
    journal_aliases = ['journal', 'newspaper', 'magazine', 'work','website',  'periodical', 
                       'encyclopedia', 'encyclopaedia', 'dictionary', 'mailinglist','dergi', 'gazete', 
                       'eser', 'çalışma', 'iş', 'websitesi', 'süreliyayın', 'ansiklopedi', 'sözlük', 'program']
    
    date_aliases = ['date', 'air-date', 'airdate', 'tarih']
    
    year_aliases = ['year', 'yıl', 'sene']
    
    volume_aliases = ['volume', 'cilt']
    
    issue_aliases = ['issue', 'number', 'sayı', 'numara']
    
    
    ### leave this part be
    
    citation_list = citation.split("|")
    
    journal = ""
    sim_id = ""
    volume = ""
    issue = ""
    
    date = ""
    year = 0
    month = ""
    
    for field in citation_list:
        field = field.strip()
        
        # find journal title
        for j_a in journal_aliases:
            journal_regex = j_a + "(\s{0,})="
            if re.match(re.compile(journal_regex), field):
                journal = field.split("=")[1].strip()
                journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
                if journal != "":
                    sim_id = journal.lower()
                    sim_id_lst = sim_id.split()
                    sim_id = "-".join(sim_id_lst)
                    sim_id = "sim_" + sim_id
                break
   
        # find journal volume 
        for v_a in volume_aliases:
            volume_regex = v_a + "(\s{0,})="
            if re.match(re.compile(volume_regex), field):
                volume = field.split("=")[1].strip()
                volume = re.sub('[^0-9]+', '', volume)
                break
            
        # find journal issue
        for i_a in issue_aliases:
            issue_regex = i_a + "(\s{0,})="
            if re.match(issue_regex, field):
                issue = field.split("=")[1].strip()
                issue = re.sub('[^0-9]+', '', issue)
                break
            
        
        # find journal year
        for y_a in year_aliases:
            year_regex = y_a + "(\s{0,})="
            if re.match(year_regex, field):
                year = field.split("=")[1].strip()
                date = re.sub('[^0-9]+', '', year)
                try:
                    year = int(date)
                except:
                    year = 0
                break
            
        # find journal date
        for d_a in date_aliases:
            date_regex = d_a + "(\s{0,})="
            if re.match(date_regex, field):
                date = field.split("=")[1].strip()
                date = re.sub('[^a-zA-Z0-9-]+', '', date)

                try:
                    year = int(date)
                    date = str(year)
                except:
                    # use the python library for parsing
                    parsed_date = dateparser.parse(date)
                    if parsed_date != None:
                        if parsed_date.year < 2021 and parsed_date.year > 1800:
                            year = parsed_date.year
                            date = str(year)

                        if parsed_date.month < 10:
                            month = "0" + str(parsed_date.month)
                        else:
                            month = str(parsed_date.month)

                        if month != "":
                            date = date + "-" + month
                
                break

            
    return [journal, sim_id, date, year, volume, issue]
        

In [41]:
# parse citation data test 
cite_test = "{{Cite journal | last1 = Lewis | first1 = Herbert | date = June 2001 | title = The Passion of Franz Boas | journal = [[American Anthropologist]] | volume = 103 | issue = 2| pages = 447–467 | url = https://archive.org/details/sim_american-anthropologist_2001-06_103_2/page/447/mode/2up | doi=10.1525/aa.2001.103.2.447}}"
parse_citation_data(cite_test)


['American Anthropologist',
 'sim_american-anthropologist',
 '2001-06',
 2001,
 '103',
 '2']

In [43]:
# filter out desired info for each citation
parsed_citations = sample_df['c'].apply(parse_citation_data)
parsed_citations_dict = parsed_citations.to_dict()

In [46]:
list(parsed_citations_dict.items())[0:5]

[(0, ['Gncel Matematik', 'sim_gncel-matematik', '1999', 1999, '248', '']),
 (1,
  ['Planetary and Space Science',
   'sim_planetary-and-space-science',
   '2012-12',
   2012,
   '73',
   '']),
 (2, ['Chemosphere', 'sim_chemosphere', 'Aralk1999', 0, '39', '15']),
 (3,
  ['Journal of Black SeaMediterranean Environment',
   'sim_journal-of-black-seamediterranean-environment',
   '2007',
   2007,
   '13',
   '']),
 (4,
  ['Natural Hazards and Earth System Sciences',
   'sim_natural-hazards-and-earth-system-sciences',
   '2011',
   2011,
   '11',
   ''])]

In [47]:
# Get into a dataframe format
parsed_citations_df = pd.DataFrame.from_dict(parsed_citations_dict, 
                                             orient = 'index',
                                   columns = ['journal_name', 'sim_id', 'date', 'year', 'volume', 'issue'])
parsed_citations_df.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue
0,Gncel Matematik,sim_gncel-matematik,1999,1999,248,
1,Planetary and Space Science,sim_planetary-and-space-science,2012-12,2012,73,
2,Chemosphere,sim_chemosphere,Aralk1999,0,39,15.0
3,Journal of Black SeaMediterranean Environment,sim_journal-of-black-seamediterranean-environment,2007,2007,13,
4,Natural Hazards and Earth System Sciences,sim_natural-hazards-and-earth-system-sciences,2011,2011,11,


## Filter Citations by SIM Collections

In [48]:
# Limit to citations with the exact sim_id match
merged = pd.merge(left=parsed_citations_df, right=sim_info, how="inner", 
                  left_on="sim_id", right_on="PubIssueID")
merged.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
0,Planetary and Space Science,sim_planetary-and-space-science,2012-12,2012,73,,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
1,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39,12.0,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
2,Planetary and Space Science,sim_planetary-and-space-science,2004-01,2004,52,13.0,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
3,Chemosphere,sim_chemosphere,Aralk1999,0,39,15.0,Chemosphere,1972.0,2003.0,Environmental Sciences,sim_chemosphere
4,Chemosphere,sim_chemosphere,6Haziran2021-06,0,276,,Chemosphere,1972.0,2003.0,Environmental Sciences,sim_chemosphere


In [49]:
print("Number of Citations in SIM : " + str(merged.shape[0]))

Number of Citations in SIM : 4595


In [50]:
# Filter out the journals of years that are not within the range of collection
def filter_year_range(row):
    year = row['year']
    first = row['First Volume']
    last = row['Last Volume']
    if first != np.nan and last != np.nan:
        if year > first and year < last:
            return True
    
    return False

In [51]:
merged["within_yr_range"] = merged.apply(filter_year_range, axis = 1)
journal_year_within_range = merged[merged["within_yr_range"]]
journal_year_within_range = journal_year_within_range.drop(columns = ['within_yr_range'])
journal_year_within_range.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
1,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39.0,12.0,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
13,Journal of the Geological Society,sim_journal-of-the-geological-society,2007,2007,164.0,6.0,Journal of the Geological Society,1980.0,2013.0,Earth Sciences--Geology,sim_journal-of-the-geological-society
16,Public Health Reports,sim_public-health-reports,1900-06,1900,15.0,24.0,Public Health Reports,1878.0,2015.0,Medical Sciences|Public Health And Safety,sim_public-health-reports
17,Time,sim_time,1985-04,1985,,,Time,1923.0,2016.0,General Interest Periodicals--United States|Po...,sim_time
18,Time,sim_time,1985-01,1985,,,Time,1923.0,2016.0,General Interest Periodicals--United States|Po...,sim_time


In [52]:
print("Number of Citations in SIM with range of collection years: " + str(journal_year_within_range.shape[0]))

Number of Citations in SIM with range of collection years: 2147


In [53]:
journal_year_not_within_range = merged[merged["within_yr_range"] == False]
journal_year_not_within_range = journal_year_not_within_range.drop(columns = ['within_yr_range'])
journal_year_not_within_range.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
0,Planetary and Space Science,sim_planetary-and-space-science,2012-12,2012,73,,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
2,Planetary and Space Science,sim_planetary-and-space-science,2004-01,2004,52,13.0,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
3,Chemosphere,sim_chemosphere,Aralk1999,0,39,15.0,Chemosphere,1972.0,2003.0,Environmental Sciences,sim_chemosphere
4,Chemosphere,sim_chemosphere,6Haziran2021-06,0,276,,Chemosphere,1972.0,2003.0,Environmental Sciences,sim_chemosphere
5,Chemosphere,sim_chemosphere,2014-07,2014,107,,Chemosphere,1972.0,2003.0,Environmental Sciences,sim_chemosphere


In [54]:
print("Number of Citations in SIM not in range of collection years: " + str(journal_year_not_within_range.shape[0]))

Number of Citations in SIM not in range of collection years: 2448


## Extract Different Dataframes 

In [55]:
total_count = journal_year_within_range.shape[0]
total_count

2147

In [56]:
# Journals that have sim_id, date, volume
journal_sim_has_journal_date_volume = journal_year_within_range[(journal_year_within_range["date"] != "") &
                                                               (journal_year_within_range["volume"] != "")]
print("Journal, date, volume count: " + str(journal_sim_has_journal_date_volume.shape[0]))
journal_sim_has_journal_date_volume.head()

Journal, date, volume count: 2064


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
1,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39,12,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
13,Journal of the Geological Society,sim_journal-of-the-geological-society,2007,2007,164,6,Journal of the Geological Society,1980.0,2013.0,Earth Sciences--Geology,sim_journal-of-the-geological-society
16,Public Health Reports,sim_public-health-reports,1900-06,1900,15,24,Public Health Reports,1878.0,2015.0,Medical Sciences|Public Health And Safety,sim_public-health-reports
57,Science,sim_science,2001-04,2001,292,5517,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
61,Science,sim_science,2010,2010,328,5975,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science


In [57]:
# Journals that have sim_id, date, no volume
journal_sim_has_journal_date_no_volume = journal_year_within_range[(journal_year_within_range["date"] != "") &
                                                               (journal_year_within_range["volume"] == "")]
print("Journal date, no volume count: " + str(journal_sim_has_journal_date_no_volume.shape[0]))
journal_sim_has_journal_date_no_volume.head()

Journal date, no volume count: 83


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
17,Time,sim_time,1985-04,1985,,,Time,1923.0,2016.0,General Interest Periodicals--United States|Po...,sim_time
18,Time,sim_time,1985-01,1985,,,Time,1923.0,2016.0,General Interest Periodicals--United States|Po...,sim_time
19,Time,sim_time,1985-03,1985,,,Time,1923.0,2016.0,General Interest Periodicals--United States|Po...,sim_time
20,Time,sim_time,1987-06,1987,,,Time,1923.0,2016.0,General Interest Periodicals--United States|Po...,sim_time
21,Time,sim_time,2005-04,2005,,,Time,1923.0,2016.0,General Interest Periodicals--United States|Po...,sim_time


In [58]:
# Journals with sim_id, date, volume, issue
journal_sim_has_journal_date_volume_issue = journal_sim_has_journal_date_volume[journal_sim_has_journal_date_volume["issue"] != ""]
print("Journal, date, volume, issue count: " + str(journal_sim_has_journal_date_volume_issue.shape[0]))
journal_sim_has_journal_date_volume_issue.head()

Journal, date, volume, issue count: 1907


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
1,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,39,12,Planetary and Space Science,1959.0,2002.0,Mathematical & Physical Sciences,sim_planetary-and-space-science
13,Journal of the Geological Society,sim_journal-of-the-geological-society,2007,2007,164,6,Journal of the Geological Society,1980.0,2013.0,Earth Sciences--Geology,sim_journal-of-the-geological-society
16,Public Health Reports,sim_public-health-reports,1900-06,1900,15,24,Public Health Reports,1878.0,2015.0,Medical Sciences|Public Health And Safety,sim_public-health-reports
57,Science,sim_science,2001-04,2001,292,5517,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
61,Science,sim_science,2010,2010,328,5975,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science


In [59]:
# Journals with sim_id, date, volume, issue
journal_sim_has_journal_date_volume_no_issue = journal_sim_has_journal_date_volume[journal_sim_has_journal_date_volume["issue"] == ""]
print("Journal, date, volume, no issue count: " + str(journal_sim_has_journal_date_volume_no_issue.shape[0]))
journal_sim_has_journal_date_volume_no_issue.head()

Journal, date, volume, no issue count: 157


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
65,Science,sim_science,2008,2008,322,,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
381,Science,sim_science,1919,1919,49,,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
507,Developmental Psychology,sim_developmental-psychology,1992,1992,28,,Developmental Psychology,1969.0,2014.0,Psychology,sim_developmental-psychology
523,Journal of Personality and Social Psychology,sim_journal-of-personality-and-social-psychology,2003,2003,85,,Journal of Personality and Social Psychology,1965.0,2015.0,Psychology,sim_journal-of-personality-and-social-psychology
543,Adolescence,sim_adolescence,1976,1976,11,,Adolescence,1966.0,2009.0,Children And Youth - About|Education|Medical S...,sim_adolescence


## Generate URLs

- citation should already have name, date, year 
    - citation has volume
        - citation has issue (url with 4 fields)
            - with date
            - with year
        - citation has no issue (url with 3 fields)
            - with date 
            - with year
    - citation has no volume
        - advanced search?

### 3 Field URLs

In [60]:
def generate_url_with_journal_date_volume(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    date = "_" + str(row["date"])
    volume = "_" + str(row["volume"])
    identifier = journal + date + volume
    return archive_header + identifier

In [61]:
url_journal_date_volume = journal_sim_has_journal_date_volume.apply(
                                        generate_url_with_journal_date_volume, axis = 1)
url_journal_date_volume_list = url_journal_date_volume.tolist()
url_journal_date_volume_list[0:5]

['https://archive.org/details/sim_planetary-and-space-science_1991_39',
 'https://archive.org/details/sim_journal-of-the-geological-society_2007_164',
 'https://archive.org/details/sim_public-health-reports_1900-06_15',
 'https://archive.org/details/sim_science_2001-04_292',
 'https://archive.org/details/sim_science_2010_328']

In [62]:
def generate_url_with_journal_year_volume(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    year = "_" + str(row["year"])
    volume = "_" + str(row["volume"])
    identifier = journal + year + volume
    return archive_header + identifier

In [63]:
url_journal_year_volume = journal_sim_has_journal_date_volume.apply(
                                        generate_url_with_journal_year_volume, axis = 1)
url_journal_year_volume_list = url_journal_year_volume.tolist()
url_journal_year_volume_list[0:5]

['https://archive.org/details/sim_planetary-and-space-science_1991_39',
 'https://archive.org/details/sim_journal-of-the-geological-society_2007_164',
 'https://archive.org/details/sim_public-health-reports_1900_15',
 'https://archive.org/details/sim_science_2001_292',
 'https://archive.org/details/sim_science_2010_328']

### 4 Field URLs

In [64]:
def generate_url_with_journal_date_volume_issue(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    date = "_" + str(row["date"])
    volume = "_" + str(row["volume"])
    issue = "_" + str(row["issue"])
    identifier = journal + date + volume + issue
    return archive_header + identifier

In [65]:
url_journal_date_volume_issue = journal_sim_has_journal_date_volume_issue.apply(
                                        generate_url_with_journal_date_volume_issue, axis = 1)
url_journal_date_volume_issue_list = url_journal_date_volume_issue.tolist()
url_journal_date_volume_issue_list[0:5]

['https://archive.org/details/sim_planetary-and-space-science_1991_39_12',
 'https://archive.org/details/sim_journal-of-the-geological-society_2007_164_6',
 'https://archive.org/details/sim_public-health-reports_1900-06_15_24',
 'https://archive.org/details/sim_science_2001-04_292_5517',
 'https://archive.org/details/sim_science_2010_328_5975']

In [66]:
def generate_url_with_journal_year_volume_issue(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    year = "_" + str(row["year"])
    volume = "_" + str(row["volume"])
    issue = "_" + str(row["issue"])
    identifier = journal + year + volume + issue
    return archive_header + identifier

In [74]:
url_journal_year_volume_issue = journal_sim_has_journal_date_volume_issue.apply(
                                        generate_url_with_journal_year_volume_issue, axis = 1)
url_journal_year_volume_issue_list = url_journal_year_volume_issue.tolist()
url_journal_year_volume_issue_list[0:5]


['https://archive.org/details/sim_planetary-and-space-science_1991_39_12',
 'https://archive.org/details/sim_journal-of-the-geological-society_2007_164_6',
 'https://archive.org/details/sim_public-health-reports_1900_15_24',
 'https://archive.org/details/sim_science_2001_292_5517',
 'https://archive.org/details/sim_science_2010_328_5975']

In [76]:
### for example 
### https://archive.org/details/sim_american-political-science-review_1993_87_index

def generate_url_with_year_index(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    year = "_" + str(row["year"])
    volume = "_" + str(row["volume"])
    identifier = journal + year + volume
    return archive_header + identifier + "_index"


def generate_url_with_date_index(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    date = "_" + str(row["date"])
    volume = "_" + str(row["volume"])
    identifier = journal + date + volume
    return archive_header + identifier + "_index"

In [78]:
url_with_year_index = journal_sim_has_journal_date_volume_issue.apply(generate_url_with_year_index, axis = 1)
url_with_year_index_list = url_with_year_index.tolist()
url_with_year_index_list[0:5]

['https://archive.org/details/sim_planetary-and-space-science_1991_39_index',
 'https://archive.org/details/sim_journal-of-the-geological-society_2007_164_index',
 'https://archive.org/details/sim_public-health-reports_1900_15_index',
 'https://archive.org/details/sim_science_2001_292_index',
 'https://archive.org/details/sim_science_2010_328_index']

In [81]:
url_with_date_index = journal_sim_has_journal_date_volume_issue.apply(generate_url_with_date_index, axis = 1)
url_with_date_index_list = url_with_date_index.tolist()
url_with_date_index_list[0:5]

['https://archive.org/details/sim_planetary-and-space-science_1991_39_index',
 'https://archive.org/details/sim_journal-of-the-geological-society_2007_164_index',
 'https://archive.org/details/sim_public-health-reports_1900-06_15_index',
 'https://archive.org/details/sim_science_2001-04_292_index',
 'https://archive.org/details/sim_science_2010_328_index']

## Write Urls to Txt Files
*Run the deadlink checker on it

In [82]:
# 3 Field URL date
url_count = 0
textfile = open("Journal_3_field_date_turk.txt", "w")
for element in url_journal_date_volume_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 2064 urls


In [69]:
# 3 Field URL year
url_count = 0
textfile = open("Journal_3_field_year_turk.txt", "w")
for element in url_journal_year_volume_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 2064 urls


In [70]:
# 4 Field URL date
url_count = 0
textfile = open("Journal_4_field_date_turk.txt", "w")
for element in url_journal_date_volume_issue_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 1907 urls


In [83]:
# 4 Field URL year
url_count = 0
textfile = open("Journal_4_field_year_turk.txt", "w")
for element in url_journal_year_volume_issue_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 1907 urls


In [84]:
# Journal, year, volume, _index
url_count = 0
textfile = open("Journal_year_index_turk.txt", "w")
for element in url_with_year_index_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 1907 urls


In [85]:
# Journal, year, volume, _index
url_count = 0
textfile = open("Journal_date_index_turk.txt", "w")
for element in url_with_date_index_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 1907 urls
