# URL generation with Journal Date Volume Pattern

In [6]:
import numpy as np 
import pandas as pd
import re
import requests
import datetime
import dateparser

### Load SIM Information

In [2]:
# Load SIM information processed by SIM Metadata Parsing.ipynb
sim_info = pd.read_csv("SIM_info.csv")
sim_info.head()

Unnamed: 0,Title,First Volume,Last Volume,Subjects,PubIssueID
0,American Journal of Pharmacy and the Sciences ...,1952.0,1995.0,Health & Medical Sciences,sim_american-journal-of-pharmacy-and-the-sciences
1,National Real Estate and Building Journal,1949.0,1956.0,Building & Construction,sim_national-real-estate-and-building-journal
2,The American Naturalist,1872.0,2015.0,Biology,sim_american-naturalist
3,Alcatel Telecommunications Review,1922.0,2002.0,Communication & Information Sciences,sim_alcatel-telecommunications-review
4,The American Journal of Gastroenterology,1949.0,2011.0,Medical Sciences--Gastroenterology,sim_american-journal-of-gastroenterology


#### Data Cleaning for SIM

In [22]:
# For example, there are overlapping ids
sim_info[sim_info["PubIssueID"] == "sim_science"]

Unnamed: 0,Title,First Volume,Last Volume,Subjects,PubIssueID
1007,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
9855,Science,1979.0,1986.0,Mathematical & Physical Sciences,sim_science


In [31]:
# Concretely show these overlaps
print("SIM Id/title count: " + str(sim_info["PubIssueID"].shape[0]))
print("Unique SIM Id count: " + str(len(sim_info["PubIssueID"].unique())))
print("Unique title count: " + str(len(sim_info["Title"].unique())))

SIM Id/title count: 15119
Unique SIM Id count: 14913
Unique title count: 14797


In [34]:
# Only keep necessary columns
sim_info_concise = sim_info.drop(columns = ["Title", "Subjects"])
sim_info_concise.head()

Unnamed: 0,First Volume,Last Volume,PubIssueID
0,1952.0,1995.0,sim_american-journal-of-pharmacy-and-the-sciences
1,1949.0,1956.0,sim_national-real-estate-and-building-journal
2,1872.0,2015.0,sim_american-naturalist
3,1922.0,2002.0,sim_alcatel-telecommunications-review
4,1949.0,2011.0,sim_american-journal-of-gastroenterology


In [35]:
# Aggregate Pub issue IDs such that the maximum range is included
sim_info.groupby(["PubIssueID"]).agg({'First Volume': 'min', 'Last Volume': 'max',})

Unnamed: 0_level_0,First Volume,Last Volume
PubIssueID,Unnamed: 1_level_1,Unnamed: 2_level_1
sim _annales-academiae-scientiarum-fennicae,,
sim-anatomia-clinica,1978.0,1981.0
sim_-,1826.0,1826.0
sim_1001-home-ideas,1986.0,1991.0
sim_102-monitor,1975.0,1981.0
...,...,...
sim_zoo-biology,1999.0,2000.0
sim_zoologica-scripta,1998.0,2001.0
sim_zoomorphology,1981.0,1996.0
sim_zvezda,1961.0,1974.0


### Load Wiki Dump Data

In [8]:
# load wikipedia citation data from xcite dump 
journal_dump = pd.read_json("en.wikipedia.org.journal.20210606.json", lines = True)
journal_dump.head()

Unnamed: 0,a,c
0,0,{{cite journal |first=R. W. |last=Bemer |title...
1,020413 DOJ White Paper,{{cite document|ssrn=1332096 |title=Combatants...
2,"10,000 Bullets","{{cite journal | author=Nowakowski, Kasper | a..."
3,"10,000 Bullets",{{cite journal | author=''Official UK PlayStat...
4,1000 Plant Genomes Project,"{{cite journal | vauthors = Matasci N, Hung LH..."


In [39]:
# filter columns such that we are only working with journals 
journal_dump["is_journal"] = journal_dump["c"].apply(lambda x: "cite journal" in x)
print(journal_dump.shape)
journal_only = journal_dump[journal_dump["is_journal"]]
journal_only = journal_only.drop(columns = {"is_journal"})
print(journal_only.shape)
journal_only.head()

(2594772, 3)
(1984338, 2)


Unnamed: 0,a,c
0,0,{{cite journal |first=R. W. |last=Bemer |title...
2,"10,000 Bullets","{{cite journal | author=Nowakowski, Kasper | a..."
3,"10,000 Bullets",{{cite journal | author=''Official UK PlayStat...
4,1000 Plant Genomes Project,"{{cite journal | vauthors = Matasci N, Hung LH..."
5,1000 Plant Genomes Project,{{cite journal | author = One Thousand Plant T...


## Sampling Data
*since it takes too long to run it against 2 million

In [94]:
# can change up n to get different results
sample_df = journal_only.sample(n = 50000)
sample_df.head()

Unnamed: 0,a,c
672231,List of Sufi saints,{{cite journal|author=Tasadduq Husain|date=Jul...
2558012,Thrombophilia,"{{cite journal |vauthors=Crowther MA, Kelton J..."
1867923,Evolutionary physiology,{{cite journal|last=Chown|first=S. L.|author2=...
1624028,Promised Land (2012 film),{{cite journal | last=McNary | first=Dave | ur...
422209,Weld tests for friction welding,{{cite journal|last1=Lacki|first1=P.|last2=Kuc...


## Parsing Citation 
Combined and cleaned version of testing in the Wikipedia Journal Citation Dump Parsing Test notebook

In [95]:
# Parsing a wikipedia citation data
def parse_citation_data(citation):
    citation_list = citation.split("|")
    
    journal = ""
    sim_id = ""
    volume = ""
    issue = ""
    
    date = ""
    year = 0
    month = ""
    
    for field in citation_list:
        field = field.strip()
        
        # find journal title
        if re.match("journal(\s{0,})=", field):
            journal = field.split("=")[1].strip()
            journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
            if journal != "":
                sim_id = journal.lower()
                sim_id_lst = sim_id.split()
                sim_id = "-".join(sim_id_lst)
                sim_id = "sim_" + sim_id
   
        # find journal volume 
        if re.match("volume(\s{0,})=", field):
            volume = field.split("=")[1].strip()
            
        # find journal issue
        if re.match("issue(\s{0,})=", field):
            issue = field.split("=")[1].strip()
            
        
        # find journal year
        if re.match("year(\s{0,})=", field):
            year = field.split("=")[1].strip()
            date = re.sub('[^0-9]+', '', year)
            try:
                year = int(date)
            except:
                year = 0
            
        # find journal date
        if re.match("date(\s{0,})=", field):
            date = field.split("=")[1].strip()
            date = re.sub('[^a-zA-Z0-9-]+', '', date)
            
            try:
                year = int(date)
                date = str(year)
            except:
                # use the python library for parsing
                parsed_date = dateparser.parse(date)
                if parsed_date != None:
                    if parsed_date.year < 2021 and parsed_date.year > 1800:
                        year = parsed_date.year
                        date = str(year)

                    if parsed_date.month < 10:
                        month = "0" + str(parsed_date.month)
                    else:
                        month = str(parsed_date.month)

                    if month != "":
                        date = date + "-" + month

            
    return [journal, sim_id, date, year, volume, issue]
        

In [96]:
# parse citation data test 
cite_test = "{{Cite journal | last1 = Lewis | first1 = Herbert | date = June 2001 | title = The Passion of Franz Boas | journal = [[American Anthropologist]] | volume = 103 | issue = 2| pages = 447–467 | url = https://archive.org/details/sim_american-anthropologist_2001-06_103_2/page/447/mode/2up | doi=10.1525/aa.2001.103.2.447}}"
parse_citation_data(cite_test)


['American Anthropologist',
 'sim_american-anthropologist',
 '2001-06',
 2001,
 '103',
 '2']

In [97]:
# filter out desired info for each citation
parsed_citations = sample_df['c'].apply(parse_citation_data)
parsed_citations_dict = parsed_citations.to_dict()
parsed_citations_dict

{672231: ['Social Scientist',
  'sim_social-scientist',
  'JulAug2002',
  0,
  '30',
  '7/8'],
 2558012: ['Ann Intern Med', 'sim_ann-intern-med', '2003', 2003, '138', '2'],
 1867923: ['Functional Ecology',
  'sim_functional-ecology',
  '2004',
  2004,
  '18',
  '2'],
 1624028: ['Variety magazine',
  'sim_variety-magazine',
  'August232012',
  0,
  '',
  ''],
 422209: ['Archives of Metallurgy and Materials',
  'sim_archives-of-metallurgy-and-materials',
  '2013-06',
  2013,
  '58',
  '2'],
 2079748: ['', '', '1966', 1966, '', ''],
 34745: ['The Journal of Neuroscience',
  'sim_the-journal-of-neuroscience',
  '2004-03',
  2004,
  '24',
  '11'],
 196757: ['Les Ailes', 'sim_les-ailes', '1929-07', 1929, '', '422'],
 918074: ['Annals of the Missouri Botanical Garden',
  'sim_annals-of-the-missouri-botanical-garden',
  '1999',
  1999,
  '86',
  '4'],
 816227: ['Astronomical Journal',
  'sim_astronomical-journal',
  '1995-04',
  1995,
  '109',
  '4'],
 1421440: ['Institution of Aeronautical Sc

In [98]:
# Get into a dataframe format
parsed_citations_df = pd.DataFrame.from_dict(parsed_citations_dict, 
                                             orient = 'index',
                                   columns = ['journal_name', 'sim_id', 'date', 'year', 'volume', 'issue'])
parsed_citations_df.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue
672231,Social Scientist,sim_social-scientist,JulAug2002,0,30.0,7/8
2558012,Ann Intern Med,sim_ann-intern-med,2003,2003,138.0,2
1867923,Functional Ecology,sim_functional-ecology,2004,2004,18.0,2
1624028,Variety magazine,sim_variety-magazine,August232012,0,,
422209,Archives of Metallurgy and Materials,sim_archives-of-metallurgy-and-materials,2013-06,2013,58.0,2


## Filter Citations by SIM Collections

In [99]:
# Limit to citations with the exact sim_id match
merged = pd.merge(left=parsed_citations_df, right=sim_info, how="inner", 
                  left_on="sim_id", right_on="PubIssueID")
merged.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
0,Functional Ecology,sim_functional-ecology,2004,2004,18,2,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
1,Functional Ecology,sim_functional-ecology,2010,2010,24,3,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
2,Functional Ecology,sim_functional-ecology,2002,2002,16,5,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
3,Functional Ecology,sim_functional-ecology,2005,2005,19,4,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
4,Functional Ecology,sim_functional-ecology,2005,2005,19,3,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology


In [100]:
print("Number of Citations in SIM : " + str(merged.shape[0]))

Number of Citations in SIM : 14082


In [101]:
# Filter out the journals of years that are not within the range of collection
def filter_year_range(row):
    year = row['year']
    first = row['First Volume']
    last = row['Last Volume']
    if first != np.nan and last != np.nan:
        if year > first and year < last:
            return True
    
    return False

In [102]:
merged["within_yr_range"] = merged.apply(filter_year_range, axis = 1)
journal_year_within_range = merged[merged["within_yr_range"]]
journal_year_within_range = journal_year_within_range.drop(columns = ['within_yr_range'])
journal_year_within_range.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
8,Astronomical Journal,sim_astronomical-journal,1967-11,1967,72,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
9,Astronomical Journal,sim_astronomical-journal,1993,1993,105,5.0,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
10,Astronomical Journal,sim_astronomical-journal,1944-08,1944,51,2.0,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
12,Astronomical Journal,sim_astronomical-journal,1916,1916,29,695.0,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
15,Astronomical Journal,sim_astronomical-journal,1961-03,1961,70,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal


In [103]:
print("Number of Citations in SIM with range of collection years: " + str(journal_year_within_range.shape[0]))

Number of Citations in SIM with range of collection years: 7258


In [104]:
journal_year_not_within_range = merged[merged["within_yr_range"] == False]
journal_year_not_within_range = journal_year_not_within_range.drop(columns = ['within_yr_range'])
journal_year_not_within_range.head()

Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
0,Functional Ecology,sim_functional-ecology,2004,2004,18,2,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
1,Functional Ecology,sim_functional-ecology,2010,2010,24,3,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
2,Functional Ecology,sim_functional-ecology,2002,2002,16,5,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
3,Functional Ecology,sim_functional-ecology,2005,2005,19,4,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology
4,Functional Ecology,sim_functional-ecology,2005,2005,19,3,Functional Ecology,1987.0,2002.0,Biological Sciences,sim_functional-ecology


In [105]:
print("Number of Citations in SIM not in range of collection years: " + str(journal_year_not_within_range.shape[0]))

Number of Citations in SIM not in range of collection years: 6824


## Extract Different Dataframes 

In [106]:
total_count = journal_year_within_range.shape[0]
total_count

7258

In [114]:
# Journals that have sim_id, date, volume
journal_sim_has_journal_date_volume = journal_year_within_range[(journal_year_within_range["date"] != "") &
                                                               (journal_year_within_range["volume"] != "")]
print("Journal, date, volume count: " + str(journal_sim_has_journal_date_volume.shape[0]))
journal_sim_has_journal_date_volume.head()

Journal, date, volume count: 6860


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
8,Astronomical Journal,sim_astronomical-journal,1967-11,1967,72,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
9,Astronomical Journal,sim_astronomical-journal,1993,1993,105,5.0,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
10,Astronomical Journal,sim_astronomical-journal,1944-08,1944,51,2.0,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
12,Astronomical Journal,sim_astronomical-journal,1916,1916,29,695.0,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
15,Astronomical Journal,sim_astronomical-journal,1961-03,1961,70,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal


In [115]:
# Journals that have sim_id, date, no volume
journal_sim_has_journal_date_no_volume = journal_year_within_range[(journal_year_within_range["date"] != "") &
                                                               (journal_year_within_range["volume"] == "")]
print("Journal date, no volume count: " + str(journal_sim_has_journal_date_no_volume.shape[0]))
journal_sim_has_journal_date_no_volume.head()

Journal date, no volume count: 398


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
36,American Scientist,sim_american-scientist,2001,2001,,,American Scientist,1913.0,2016.0,Sciences: Comprehensive Works,sim_american-scientist
468,Science,sim_science,2015-07,2015,,,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
572,Science,sim_science,2015-01,2015,,,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
1080,Science,sim_science,2009-05,2009,,,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science
1220,Science,sim_science,2015-10,2015,,,Science,1883.0,2016.0,Sciences: Comprehensive Works|Technology: Comp...,sim_science


In [117]:
# Journals with sim_id, date, volume, issue
journal_sim_has_journal_date_volume_issue = journal_sim_has_journal_date_volume[journal_sim_has_journal_date_volume["issue"] != ""]
print("Journal, date, volume, issue count: " + str(journal_sim_has_journal_date_volume_issue.shape[0]))
journal_sim_has_journal_date_volume_issue.head()

Journal, date, volume, issue count: 6192


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
9,Astronomical Journal,sim_astronomical-journal,1993,1993,105,5,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
10,Astronomical Journal,sim_astronomical-journal,1944-08,1944,51,2,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
12,Astronomical Journal,sim_astronomical-journal,1916,1916,29,695,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
31,American Scientist,sim_american-scientist,2000,2000,American Scientist Online May–June 2000,3,American Scientist,1913.0,2016.0,Sciences: Comprehensive Works,sim_american-scientist
34,American Scientist,sim_american-scientist,1970,1970,58,6,American Scientist,1913.0,2016.0,Sciences: Comprehensive Works,sim_american-scientist


In [118]:
# Journals with sim_id, date, volume, issue
journal_sim_has_journal_date_volume_no_issue = journal_sim_has_journal_date_volume[journal_sim_has_journal_date_volume["issue"] == ""]
print("Journal, date, volume, no issue count: " + str(journal_sim_has_journal_date_volume_no_issue.shape[0]))
journal_sim_has_journal_date_volume_no_issue.head()

Journal, date, volume, no issue count: 668


Unnamed: 0,journal_name,sim_id,date,year,volume,issue,Title,First Volume,Last Volume,Subjects,PubIssueID
8,Astronomical Journal,sim_astronomical-journal,1967-11,1967,72,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
15,Astronomical Journal,sim_astronomical-journal,1961-03,1961,70,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
17,Astronomical Journal,sim_astronomical-journal,1969-04,1969,74,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
19,Astronomical Journal,sim_astronomical-journal,1966-12,1966,71,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal
21,Astronomical Journal,sim_astronomical-journal,1981,1981,86,,The Astronomical Journal,1849.0,1994.0,Mathematical & Physical Sciences,sim_astronomical-journal


## Generate URLs

- citation should already have name, date, year 
    - citation has volume
        - citation has issue (url with 4 fields)
            - with date
            - with year
        - citation has no issue (url with 3 fields)
            - with date 
            - with year
    - citation has no volume
        - advanced search?

### 3 Field URLs

In [134]:
def generate_url_with_journal_date_volume(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    date = "_" + str(row["date"])
    volume = "_" + str(row["volume"])
    identifier = journal + date + volume
    return archive_header + identifier

In [135]:
url_journal_date_volume = journal_sim_has_journal_date_volume.apply(
                                        generate_url_with_journal_date_volume, axis = 1)
url_journal_date_volume_list = url_journal_date_volume.tolist()
url_journal_date_volume_list[0:5]

['https://archive.org/details/sim_astronomical-journal_1967-11_72',
 'https://archive.org/details/sim_astronomical-journal_1993_105',
 'https://archive.org/details/sim_astronomical-journal_1944-08_51',
 'https://archive.org/details/sim_astronomical-journal_1916_29',
 'https://archive.org/details/sim_astronomical-journal_1961-03_70']

In [136]:
def generate_url_with_journal_year_volume(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    year = "_" + str(row["year"])
    volume = "_" + str(row["volume"])
    identifier = journal + year + volume
    return archive_header + identifier

In [145]:
url_journal_year_volume = journal_sim_has_journal_date_volume.apply(
                                        generate_url_with_journal_year_volume, axis = 1)
url_journal_year_volume_list = url_journal_year_volume.tolist()
url_journal_year_volume_list[0:5]

['https://archive.org/details/sim_astronomical-journal_1967_72',
 'https://archive.org/details/sim_astronomical-journal_1993_105',
 'https://archive.org/details/sim_astronomical-journal_1944_51',
 'https://archive.org/details/sim_astronomical-journal_1916_29',
 'https://archive.org/details/sim_astronomical-journal_1961_70']

### 4 Field URLs

In [138]:
def generate_url_with_journal_date_volume_issue(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    date = "_" + str(row["date"])
    volume = "_" + str(row["volume"])
    issue = "_" + str(row["issue"])
    identifier = journal + date + volume + issue
    return archive_header + identifier

In [139]:
url_journal_date_volume_issue = journal_sim_has_journal_date_volume_issue.apply(
                                        generate_url_with_journal_date_volume_issue, axis = 1)
url_journal_date_volume_issue_list = url_journal_date_volume_issue.tolist()
url_journal_date_volume_issue_list[0:5]

['https://archive.org/details/sim_astronomical-journal_1993_105_5',
 'https://archive.org/details/sim_astronomical-journal_1944-08_51_2',
 'https://archive.org/details/sim_astronomical-journal_1916_29_695',
 'https://archive.org/details/sim_american-scientist_2000_American Scientist Online May–June 2000_3',
 'https://archive.org/details/sim_american-scientist_1970_58_6']

In [140]:
def generate_url_with_journal_year_volume_issue(row):
    archive_header = "https://archive.org/details/"
    journal = str(row["sim_id"])
    year = "_" + str(row["year"])
    volume = "_" + str(row["volume"])
    issue = "_" + str(row["issue"])
    identifier = journal + year + volume + issue
    return archive_header + identifier

In [141]:
url_journal_year_volume_issue = journal_sim_has_journal_date_volume_issue.apply(
                                        generate_url_with_journal_date_volume_issue, axis = 1)
url_journal_year_volume_issue_list = url_journal_year_volume_issue.tolist()
url_journal_year_volume_issue_list[0:5]


['https://archive.org/details/sim_astronomical-journal_1993_105_5',
 'https://archive.org/details/sim_astronomical-journal_1944-08_51_2',
 'https://archive.org/details/sim_astronomical-journal_1916_29_695',
 'https://archive.org/details/sim_american-scientist_2000_American Scientist Online May–June 2000_3',
 'https://archive.org/details/sim_american-scientist_1970_58_6']

## Write Urls to Txt Files
*Run the deadlink checker on it

In [142]:
# 3 Field URL date
url_count = 0
textfile = open("Journal_3_field_date.txt", "w")
for element in url_journal_date_volume_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 6860 urls


In [143]:
# 3 Field URL year
url_count = 0
textfile = open("Journal_3_field_year.txt", "w")
for element in url_journal_year_volume_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 6860 urls


In [146]:
# 4 Field URL date
url_count = 0
textfile = open("Journal_4_field_date.txt", "w")
for element in url_journal_date_volume_issue_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 6192 urls


In [147]:
# 4 Field URL date
url_count = 0
textfile = open("Journal_4_field_year.txt", "w")
for element in url_journal_year_volume_issue_list:
    textfile.write(element + "\n")
    url_count += 1
textfile.close()
print("File should have " + str(url_count) + " urls")

File should have 6192 urls
