# Citation Pipeline

In [1]:
import numpy as np 
import pandas as pd
import re
import requests
import datetime
import dateparser
import json 

from bs4 import BeautifulSoup

import gspread
from gspread_dataframe import set_with_dataframe



In [2]:
### change this part for different alias for different languages 
journal_aliases = ['journal', 'newspaper', 'magazine', 'work','website',  'periodical', 
                       'encyclopedia', 'encyclopaedia', 'dictionary', 'mailinglist','dergi', 'gazete', 
                       'eser', 'çalışma', 'iş', 'websitesi', 'süreliyayın', 'ansiklopedi', 'sözlük', 'program']
    
date_aliases = ['date', 'air-date', 'airdate', 'tarih']
    
year_aliases = ['year', 'yıl', 'sene']
    
volume_aliases = ['volume', 'cilt']
    
issue_aliases = ['issue', 'number', 'sayı', 'numara']

page_aliases = ['p', 'page', 's', 'sayfa']
pages_aliases = ['pp', 'pages', 'ss', 'sayfalar']

url_aliases = ['url', 'URL', 'katkı-url', 'chapter-url', 'contribution-url', 'entry-url', 
               'article-url', 'section-url']

title_aliases= ['title', 'başlık']

In [3]:
# Parsing a wikipedia citation data
def parse_citation_data(citation):

    citation = re.sub('[{}]', '', citation)
    citation_list = citation.split("|")
    
    journal = ""
    sim_id = ""
    volume = ""
    issue = ""
    page = ""
    url = ""
    title = ""
    
    date = ""
    month_str = ""

    year = 0
    month = 0
    
    for field in citation_list:
        field = field.strip()
        
        # find journal title
        for j_a in journal_aliases:
            journal_regex = j_a + "(\s{0,})="
            if re.match(re.compile(journal_regex), field):
                journal = field.split("=")[1].strip()
                journal = re.sub('[^A-Za-z0-9 ]+', '', journal)
                if journal != "":
                    sim_id = journal.lower()
                    sim_id_lst = sim_id.split()
                    sim_id = "-".join(sim_id_lst)
                    sim_id = "sim_" + sim_id
                break
   
        # find journal volume 
        for v_a in volume_aliases:
            volume_regex = v_a + "(\s{0,})="
            if re.match(re.compile(volume_regex), field):
                volume = field.split("=")[1].strip()
                volume = re.sub('[^0-9]+', '', volume)
                break
            
        # find journal issue
        for i_a in issue_aliases:
            issue_regex = i_a + "(\s{0,})="
            if re.match(issue_regex, field):
                issue = field.split("=")[1].strip()
                break
        
        # find journal year
        for y_a in year_aliases:
            year_regex = y_a + "(\s{0,})="
            if re.match(year_regex, field):
                year = field.split("=")[1].strip()
                date = re.sub('[^0-9]+', '', year)
                try:
                    year = int(date)
                except:
                    year = 0
                break
            
        # find journal date
        for d_a in date_aliases:
            date_regex = d_a + "(\s{0,})="
            if re.match(date_regex, field):
                date = field.split("=")[1].strip()
                
#                 print("this should be a date " + date)

                try:
                    year = int(date)
                    date = str(year)
                except:
                    # use the python library for parsing
                    parsed_date = dateparser.parse(date)
                    if parsed_date != None:
                        if parsed_date.year < 2021 and parsed_date.year > 1800:
                            year = parsed_date.year
                            date = str(year)

                        if parsed_date.month < 10:
                            month = parsed_date.month
                            month_str = "0" + str(month)
                        else:
                            month = parsed_date.month
                            month_str = str(month)
                            
#                         print(month_str)

                        if month_str != "":
                            date = date + "-" + month_str 
                break
        
         # find existing url
        for u_a in url_aliases:
            url_regex = u_a + "(\s{0,})="
            if re.match(url_regex, field):
                url = field.split("=")[1].strip()
                break
            
        # find page field 
        for p_a in page_aliases:
            page_regex = p_a + "(\s{0,})="
            if re.match(page_regex, field):
                page = field.split("=")[1].strip()
                if "[" in page:
                    page = ""
                break
                
        # find pages field
        for ps_a in pages_aliases:
            pages_regex = ps_a + "(\s{0,})="
            if re.match(pages_regex, field):
                pages = field.split("=")[1].strip()
                if "[" not in pages:
                    if "-" in pages:
                        page = pages.split("-")[0].strip()
                    elif "–" in pages:
                        page = pages.split("–")[0].strip()
                    else:
                        page = ""
                        
                break
                
        # find page field 
        for t_a in title_aliases:
            title_regex = t_a + "(\s{0,})="
            if re.match(title_regex, field):
                title = field.split("=")[1].strip()
                if "[" in title:
                    title = ""
                break
            
    return {'journal': journal, 'sim_id': sim_id, 'date': date, 'year': year, 'month': month, 
            'volume': volume, 'issue': issue, 'title': title, 'page': page, 'url': url}
        

In [4]:
### make sure year is within range and not one of those na
def within_year_range(row, year):
    first = row['First Volume']
    last = row['Last Volume']
    gaps = row['NA Gaps']
    if first != np.nan and last != np.nan:
        if year > first and year < last:
            if gaps != np.nan and gaps != "":
                gaps = str(gaps)
                gaps_list = gaps.split(";")
                for gap in gaps_list:
                    if gap.strip() == str(year):
                        return False
                return True
            return True
    return False

In [30]:
### Different functions for generating ids
def generate_id_journal_year(cite_info):
    identifier = cite_info['sim_id'] 
    if cite_info['year'] != 0:
        identifier = identifier + "_" + str(cite_info['year'])
    return identifier

def generate_id_journal_date(cite_info):
    identifier = cite_info['sim_id'] 
    if cite_info['date'] != "":
        identifier = identifier + "_" + str(cite_info['date'])
    return identifier

    
def generate_id_journal_year_volume(cite_info):
    identifier = generate_id_journal_year(cite_info)
    if cite_info['volume'] != "":
        identifier = identifier + "_" + cite_info['volume']
    return identifier

def generate_id_journal_date_volume(cite_info):
    identifier = generate_id_journal_date(cite_info)
    if cite_info['volume'] != "":
        identifier = identifier + "_" + cite_info['volume']
    return identifier


def generate_id_journal_year_volume_issue(cite_info):
    identifier = generate_id_journal_year_volume(cite_info)
    if cite_info['issue'] != "":
        identifier = identifier + "_" + cite_info['issue']
    return identifier

def generate_id_journal_date_volume_issue(cite_info):
    identifier = generate_id_journal_date_volume(cite_info)
    if cite_info['issue'] != "":
        identifier = identifier + "_" + cite_info['issue']
    return identifier

## Establish Archive.org Connection

### Method 1 (Python wrapper)
- https://archive.org/services/docs/api/internetarchive/index.html#user-s-guide
- problem: doesn't have same permission as my account

In [5]:
from internetarchive import configure, get_session
configure('email', 'password', config_file='ia-config.ini')

'ia-config.ini'

In [6]:
sess = get_session(config_file = "ia-config.ini")
sess.mount_http_adapter()
# search_results = sess.search_items('sim_australian-journal-of-botany_1998')
search_results = sess.search_items('sim_nature-biotechnology')
for result in search_results:
    print(result)
#     print(type(result))

{'identifier': 'sim_nature-biotechnology_1983_3_index'}
{'identifier': 'sim_nature-biotechnology_1984_1_index'}
{'identifier': 'sim_nature-biotechnology_1984_2_index'}
{'identifier': 'sim_nature-biotechnology_1986_4_index'}
{'identifier': 'sim_nature-biotechnology_1987_5_index'}
{'identifier': 'sim_nature-biotechnology_1988_6_index'}
{'identifier': 'sim_nature-biotechnology_1989_7_index'}
{'identifier': 'sim_nature-biotechnology_1990_8_index'}
{'identifier': 'sim_nature-biotechnology_1991_9_index'}
{'identifier': 'sim_nature-biotechnology_1992_10_index'}
{'identifier': 'sim_nature-biotechnology_1993_11_index'}
{'identifier': 'sim_nature-biotechnology_1994_12_index'}
{'identifier': 'sim_nature-biotechnology_1995_13_index'}
{'identifier': 'sim_nature-biotechnology_1997_15_index'}
{'identifier': 'sim_nature-biotechnology_1998_16_index'}
{'identifier': 'sim_nature-biotechnology_1999_17_index'}
{'identifier': 'sim_nature-biotechnology_2000_18_index'}


In [7]:
def find_ids_for_journal(sim_id):
    sess = get_session(config_file = "ia-config.ini")
    sess.mount_http_adapter()
    search_results = sess.search_items(sim_id)
    
    possible_ids = []
    for result in search_results:
        possible_ids.append(result['identifier'])

    return possible_ids

In [8]:
def find_id_match(cite_info, id_list):
    
    close_matches = []
    
    for possible_id in id_list:
        possible_id_list = possible_id.split("_")
        
        if len(possible_id_list) < 4:
            return ""
        
        if len(possible_id_list) == 4:
            
            # Check that it is sim
            if possible_id_list[0] == "sim":
                
                # Check that journal name matches
                if possible_id_list[1] == cite_info['sim_id'][4:]:
                    
                    # Check that year/date is within other case
                    if str(cite_info["year"]) in possible_id_list[2]:
                        
                        # Check that journal volume matches
                        if possible_id_list[3] == cite_info["volume"]:
                            
                            close_matches.append(possible_id)
                    
        if cite_info["issue"] != "" and len(possible_id_list) >= 5:
            
            # Check that it is sim & journal name matches & and year/date is within possible & 
            # & volume match
            if (possible_id_list[0] == "sim" and possible_id_list[1] == cite_info['sim_id'][4:] 
            and str(cite_info["year"]) in possible_id_list[2] and possible_id_list[3] == cite_info["volume"]):
                      
                # Check that journal issue matches
                if possible_id_list[4] == cite_info["issue"]:
                     close_matches.append(possible_id)
                    
    if not close_matches:
        return ""
    
    return close_matches[0]
    

In [9]:
test_cite1 = "{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}"
print("input: ")
print(test_cite1)

test_cite1_info = parse_citation_data(test_cite1)
print("cite info: ")
print(test_cite1_info)

test_cite1_ids = find_ids_for_journal(test_cite1_info['sim_id'])
print("cite info ids: ")
print(test_cite1_ids)

find_id_match(test_cite1_info, test_cite1_ids)

input: 
{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}
cite info: 
{'journal': 'Nature Biotechnology', 'sim_id': 'sim_nature-biotechnology', 'date': '2001-03', 'year': 2001, 'month': 3, 'volume': '19', 'issue': '3', 'title': 'Large-scale analysis of the yeast proteome by multidimensional protein identification technology', 'page': '242', 'url': ''}
cite info ids: 
['sim_nature-biotechnology_1983_3_index', 'sim_nature-biotechnology_1984_1_index', 'sim_nature-biotechnology_1984_2_index', 'sim_nature-biotechnology_1986_4_index', 'sim_nature-biotechnology_1987_5_index', 'sim_nature-biotechnology_1988_6_index', 'sim_nature-biotechnology_1989_7_index', 'sim_nature-biotechnology_1990_8_index', 'sim_nature-biotechnology_1991_9_index', 'sim_nature-biotechnology_1992_10_index', 'sim_na

''

### Method 2 (Make requests myself) 

In [13]:
import requests.adapters

In [190]:
def initialize_archive_session():
    with open('cookie.txt') as f:
        login_data_raw = f.read()
    
    login_data = json.loads(login_data_raw)
    
    session = requests.Session()
    
#     # Mount it for both http and https usage
#     adapter = TimeoutHTTPAdapter(timeout=10)
#     session.mount("https://", adapter)
#     session.mount("http://", adapter)
    
    
    # get login values through scraping
    login_url = "https://archive.org/account/login"
    res = session.get(login_url, timeout = 5)
    soup = BeautifulSoup(res.content, "html.parser")
    login_data['login'] = soup.find('input', attrs = {'name':'login'})['value']
    
    
    # post cookie values to login 
    res = session.post(login_url, data = login_data, timeout = 10)
    
    return session, login_data


In [112]:
def perform_advanced_search(session, login_data, identifier):
    url_head = "https://archive.org/advancedsearch.php?q="
    url_tail = "&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=5000&page=1&output=json&callback=callback&save=yes"
    
    url = url_head + identifier + url_tail
    
    try:
        request = session.get(url, data = login_data, timeout = 10)
        request.raise_for_status()
        
        response = request.text[9:-1]
        response_json = json.loads(response)['response']

        nums = response_json['numFound']

        # if nothing is found, return empty string
        if nums == 0: 
            return ""

        # if more than one thing is found, return 
        result = dict()
        response_list = response_json['docs']
        for item in response_list:
            temp_id = item['identifier']
            temp_score = item['_score']
            result[temp_id] = temp_score

        return result
    except requests.exceptions.HTTPError as errh:
        print ("Http Error:",errh)
    except requests.exceptions.ConnectionError as errc:
        print ("Error Connecting:",errc)
    except requests.exceptions.Timeout as errt:
        print ("Timeout Error:",errt)
    except requests.exceptions.RequestException as err:
        print ("OOps: Something Else",err)
    
    return ""

In [184]:
def find_close_match_from_cite_info(cite_info, search_result, verbose = False):
    id_list = list(search_result.keys())
#     identifier_list = identifier.split("_")

#     print(cite_info)
    
    close_matches = dict()
    for possible_id in id_list:
        possible_id_list = possible_id.split("_")
        
        
        if verbose: print(possible_id_list)
        
        if len(possible_id_list) < 4:
            continue;
        
        if cite_info["issue"] != "" and len(possible_id_list) >= 5:
            
            if verbose: print("longer than 5")
            
            # Check that it is sim
            if possible_id_list[0] == "sim":
                
                # Check that journal name matches
                if possible_id_list[1] == cite_info['sim_id'][4:]:
                    
                    # Check that year/date is within other case
                    if str(cite_info["year"]) in possible_id_list[2]:
                        
                        # Check that journal volume matches
                        if possible_id_list[3] == cite_info["volume"]:
                            
                            # Check that journal issue matches
                            if possible_id_list[4] == cite_info["issue"]:
                                close_matches[possible_id] = search_result[possible_id]
                                continue;
                                
                            if verbose: print("Not the right issue")
                            continue;
                            
                        if verbose: print("Not the right volume")
                        continue;
                        
                    if verbose: print("Not the right year")
                    continue;
                    
                if verbose: print("Possible id journal name not exact match")
                continue;
                
            if verbose: print("Possible id is not in sim")
            continue;
            
        if len(possible_id_list) == 4:
            
            if verbose: print("equal to 4")
            
            # Check that it is sim
            if possible_id_list[0] == "sim":
                
                # Check that journal name matches
                if possible_id_list[1] == cite_info['sim_id'][4:]:
                    
                    # Check that year/date is within other case
                    if str(cite_info["year"]) in possible_id_list[2]:
                        
                        # Check that journal volume matches
                        if possible_id_list[3] == cite_info["volume"]:
                            
                            close_matches[possible_id] = search_result[possible_id]
                            continue;
                            
                        if verbose: print("Not the right volume")
                        continue;
                        
                    if verbose: print("Not the right year")
                    continue;
                    
                if verbose: print("Possible id journal name not exact match")
                continue;
                            
            if verbose: print("Possible id is not in sim")
            continue;
            
    if verbose: 
        print('close matches: ')
        print(close_matches)
                    
    if not close_matches:
        return ""
    
    close_matches_keys = list(close_matches.keys())
    close_matches_values = list(close_matches.values())
    
    # if 1 close match, return it
    if len(close_matches_keys) == 1:
        return close_matches_keys[0]
    
    
    # if multiple, return the one with the highest score
    index_of_max = close_matches_values.index(max(close_matches_values))
    return close_matches_keys[index_of_max]
    

In [185]:
def generate_url_actual(identifier):
    return "https://archive.org/details/" + identifier

In [34]:
sim_info = pd.read_csv("SIM_info.csv")
sim_info.head()

Unnamed: 0,PubIssueID,Title,NA Gaps,First Volume,Last Volume
0,sim-anatomia-clinica,Anatomia Clinica,,1978.0,1981.0
1,sim_-,The - -,,1826.0,1826.0
2,sim_1001-home-ideas,1001 Home Ideas,,1986.0,1991.0
3,sim_102-monitor,102 Monitor,,1975.0,1981.0
4,sim_20th-century-british-history,20th Century British History,,1990.0,1994.0


In [116]:
### test functions
test_cite1 = "{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}"
print("input: ")
print(test_cite1)

test_cite1_info = parse_citation_data(test_cite1)
print("cite info: ")
print(test_cite1_info)

test_cite1_id = test_cite1_info['sim_id']
print("cite info ids: ")
print(test_cite1_id)


test_session, test_login_data = initialize_archive_session()

test_search_result = perform_advanced_search(test_session, test_login_data, test_cite1_id)
# print("cite search results: ")
# print(test_search_result)

test_actual_id = find_close_match_from_cite_info(test_cite1_info, test_search_result)
test_url = generate_url_actual(test_actual_id)
print("url: ")
print(test_url)

input: 
{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}
cite info: 
{'journal': 'Nature Biotechnology', 'sim_id': 'sim_nature-biotechnology', 'date': '2001-03', 'year': 2001, 'month': 3, 'volume': '19', 'issue': '3', 'title': 'Large-scale analysis of the yeast proteome by multidimensional protein identification technology', 'page': '242', 'url': ''}
cite info ids: 
sim_nature-biotechnology
url: 
https://archive.org/details/sim_nature-biotechnology_2001-03_19_3


In [194]:
def main2(citation, verbose = False):
    
    # get dictionary of desired citation info 
    cite_info = parse_citation_data(citation)

    print("The citation info are as follow: ")
    print(cite_info)
    
    # make sure there's no existing url 
    if cite_info['url'] != '':
        print("There is already an existing url.")
        return ""

    # check citation has all desired info
    if (cite_info['sim_id'] == '' or cite_info['year'] == '' or 
        cite_info['volume'] == '' or cite_info['page'] == ''):
        print("Citation has incomplete info.")
        return ""
    
    # check if journal in SIM and in year range
    df = sim_info[sim_info["PubIssueID"] == cite_info["sim_id"]]
    
    
    if not df.empty:
        row = df.iloc[0]
        
        
        if within_year_range(row, cite_info['year']):

            # generate a id
            gen_id = cite_info['sim_id'] + "_" + str(cite_info['year'])
            
            if verbose: print("Gen id: " + gen_id)

            # initialize session
            session, login_data = initialize_archive_session()
            
            # find all entries with this journal name
            search_result = perform_advanced_search(session, login_data, gen_id)
            
#             if verbose: 
#                 print("search results: ")
#                 print(search_result)

            if search_result != "":

                # find close match on generated id
                real_id = find_close_match_from_cite_info(cite_info, search_result)
                
                if verbose:
                    print("real id: " + real_id)

                if real_id != "":

                    # new citation
                    url = generate_url_actual(real_id)
                    print("url: " + url)
                    return url
                
                print("No close match exist for ids.")
                return ""
                
            print("There's no search result for the id: " + gen_id)
            return ""
                
        print("Citation not in SIM collection year range")
        return ""
    
    print("Citation not in SIM collection")
    return "" 


In [79]:
test_cite1 = "{{cite journal | last1 = Lewis | first1 = Herbert | date = June 2001 | title = The Passion of Franz Boas | journal = American Anthropologist | volume = 103 | issue = 2| pages = 447–467 | doi=10.1525/aa.2001.103.2.447}}"
print("input: ")
print(test_cite1)
test_res1 = main2(test_cite1)

input: 
{{cite journal | last1 = Lewis | first1 = Herbert | date = June 2001 | title = The Passion of Franz Boas | journal = American Anthropologist | volume = 103 | issue = 2| pages = 447–467 | doi=10.1525/aa.2001.103.2.447}}
url: https://archive.org/details/sim_american-anthropologist_2001-06_103_2


In [80]:
test_cite2 = "{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}"
print("input: ")
print(test_cite2)
test_res2 = main2(test_cite2)

input: 
{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}
url: https://archive.org/details/sim_nature-biotechnology_2001-03_19_3


In [88]:
test_cite3 = "{{Akademik dergi kaynağı |soyadı= Roux|ad= Stanley J.|başlık=Ca<sup>2+</sup> and Phytochrome Action in Plants|dergi=BioScience|yıl=1984|cilt=34|sayı=1|jstor=1309422 |doi= 10.2307/1309422| pmid = 11540810 |sayfalar=25-29}}"
print("input: ")
print(test_cite3)
test_res3 = main2(test_cite3)

input: 
{{Akademik dergi kaynağı |soyadı= Roux|ad= Stanley J.|başlık=Ca<sup>2+</sup> and Phytochrome Action in Plants|dergi=BioScience|yıl=1984|cilt=34|sayı=1|jstor=1309422 |doi= 10.2307/1309422| pmid = 11540810 |sayfalar=25-29}}
url: https://archive.org/details/sim_bioscience_1984-01_34_1


In [89]:
test_cite4 = "{{Akademik dergi kaynağı|başlık=Results of TV imaging of phobos (experiment VSK-FREGAT)|yazarlar=Avanesov|sayı=1–2|sayfalar=281-295|çalışma=Planetary and Space Science|yayıncı=Elsevier|yıl=1991|cilt=39|pmid=11538495|doi=10.1016/0032-0633(91)90150-9}}"
print("input: ")
print(test_cite4)
test_res4 = main2(test_cite4)

input: 
{{Akademik dergi kaynağı|başlık=Results of TV imaging of phobos (experiment VSK-FREGAT)|yazarlar=Avanesov|sayı=1–2|sayfalar=281-295|çalışma=Planetary and Space Science|yayıncı=Elsevier|yıl=1991|cilt=39|pmid=11538495|doi=10.1016/0032-0633(91)90150-9}}
There's no search result for the id: sim_planetary-and-space-science_1991
The citation info are as follow: 
{'journal': 'Planetary and Space Science', 'sim_id': 'sim_planetary-and-space-science', 'date': '1991', 'year': 1991, 'month': 0, 'volume': '39', 'issue': '12', 'title': 'Results of TV imaging of phobos (experiment VSK-FREGAT)', 'page': '281', 'url': ''}


In [91]:
test_cite6 = "{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}"
print("input: ")
print(test_cite6)
test_res6 = main2(test_cite6)

input: 
{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}
Citation not in SIM collection
The citation info are as follow: 
{'journal': 'Toxins', 'sim_id': 'sim_toxins', 'date': '2011-01', 'year': 2011, 'month': 1, 'volume': '3', 'issue': '1', 'title': 'The Discodermia calyx Toxin Calyculin A', 'page': '105', 'url': ''}


In [92]:
test_cite7 = "{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}"
print("input: ")
print(test_cite7)
test_res7 = main2(test_cite7)

input: 
{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}
No close match exist for ids.
The citation info are as follow: 
{'journal': 'Canadian Journal of Public Health', 'sim_id': 'sim_canadian-journal-of-public-health', 'date': '1994', 'year': 1994, 'month': 0, 'volume': '85', 'issue': '6', 'title': 'Lifetime probability of developing lung cancer, by smoking status, Canada', 'page': '385', 'url': ''}


In [195]:
test_cite8 = "{{Akademik dergi kaynağı |soyadı1=Berger |ad1=L. R. |soyadı2=de Ruiter |ad2=D. J. |soyadı3=Churchill |ad3=S. E. |soyadı4=Schmid |ad4=P. |soyadı5=Carlson |ad5=K. J. |soyadı6=Dirks |ad6=P. H. G. M. |soyadı7=Kibii |ad7=J. M. |yıl=2010 |başlık=''Australopithecus sediba'': a new species of ''Homo''-like australopith from South Africa |dergi=Science |cilt=328 |sayı=5975 |sayfalar=195-204|doi=10.1126/science.1184944 |pmid=20378811|citeseerx=10.1.1.729.7802 |s2cid=14209370 }}"
print("input: ")
print(test_cite8)
test_res8 = main2(test_cite8)

input: 
{{Akademik dergi kaynağı |soyadı1=Berger |ad1=L. R. |soyadı2=de Ruiter |ad2=D. J. |soyadı3=Churchill |ad3=S. E. |soyadı4=Schmid |ad4=P. |soyadı5=Carlson |ad5=K. J. |soyadı6=Dirks |ad6=P. H. G. M. |soyadı7=Kibii |ad7=J. M. |yıl=2010 |başlık=''Australopithecus sediba'': a new species of ''Homo''-like australopith from South Africa |dergi=Science |cilt=328 |sayı=5975 |sayfalar=195-204|doi=10.1126/science.1184944 |pmid=20378811|citeseerx=10.1.1.729.7802 |s2cid=14209370 }}
The citation info are as follow: 
{'journal': 'Science', 'sim_id': 'sim_science', 'date': '2010', 'year': 2010, 'month': 0, 'volume': '328', 'issue': '5975', 'title': "''Australopithecus sediba'': a new species of ''Homo''-like australopith from South Africa", 'page': '195', 'url': ''}
url: https://archive.org/details/sim_science_2010-04-09_328_5975


#### Dumber way of making multiple requests per search

In [54]:
# ### test function 
# session, login_data = initialize_archive_session()
# identifier = "sim_nature-biotechnology_2001-03_19_3"
# search_result = perform_advanced_search(session, login_data, identifier)
# find_close_match_from_generated_id(parse, search_result)

In [55]:
# ### function can be very complicated
# def generate_new_citation(citation, url):
#     url_field = "| url=" + url
#     return citation[:-2] + url_field + "}}"

In [247]:
# def main3(citation):
    
#     # get dictionary of desired citation info 
#     cite_info = parse_citation_data(citation)
#     print(cite_info)
    
#     # check have minimum set of info
#     has_2_yr = False
#     has_3_yr = False
#     has_4_yr = False
#     has_2_date = False
#     has_3_date = False
#     has_4_date = False

#     if cite_info["sim_id"] != "" and cite_info["volume"] != "":

#         if cite_info['year'] != "":
#             has_3_yr = True
#             has_2_yr = True
#             if cite_info["issue"]:
#                 has_4_yr = True
#         if cite_info['date'] != "":
#             has_3_date = True
#             has_2_date = True
#             if cite_info["issue"]:
#                 has_4_date = True
    
#     # check if journal in SIM and in year range
#     df = sim_info[sim_info["PubIssueID"] == cite_info["sim_id"]]
    
#     if not df.empty:
#         row = df.iloc[0]
        
        
#         if within_year_range(row, cite_info['year']):

#             # generate identifier
#             gen_ids = []
            
#             if has_2_date:
#                 gen_ids.append(generate_id_journal_date(cite_info))
                
#             if has_2_yr:
#                 gen_ids.append(generate_id_journal_year(cite_info))

#             if has_3_date: 
#                 gen_ids.append(generate_id_journal_date_volume(cite_info))

#             if has_3_yr:
#                 gen_ids.append(generate_id_journal_year_volume(cite_info))

#             if has_4_date:
#                 gen_ids.append(generate_id_journal_date_volume_issue(cite_info))

#             if has_4_yr:
#                 gen_ids.append(generate_id_journal_year_volume_issue(cite_info))

#             print("Generated ids: ")
#             print(gen_ids)

#             if gen_ids:

#                 session, login_data = initialize_archive_session()
# #                 with open('cookie.txt') as f:
# #                     login_data_raw = f.read()
    
# #                 login_data = json.loads(login_data_raw)
        
# # #                 with requests.Session() as session:
# #                 session = requests.Session()
                
# #                 login_url = "https://archive.org/account/login"


# #                 res = session.get(login_url)
# #                 soup = BeautifulSoup(res.content, "html.parser")
# #                 login_data['login'] = soup.find('input', attrs = {'name':'login'})['value']

# #                 res = session.post(login_url, data = login_data)

#                 # we want to check the 4 field (more exact) possibilities first
#                 for gen_id in reversed(gen_ids):

#                     print(gen_id)

#                     # perform advanced search on generated identifier
#                     search_result = perform_advanced_search(session, login_data, gen_id)

#                     print("Search Result: ")
#                     print(search_result)

#                     if search_result != "":

#                         # find close match on generated id
#                         real_id = find_close_match_from_generated_id(cite_info, search_result)

#                         print("Actual id: " + real_id)

#                         if real_id != "":

#                             # new citation
#                             url = generate_url_actual(real_id)
#                             session.close()
#                             return url
                
#                 session.close()

    
#     return "" 


In [56]:
# test_cite1 = "{{cite journal | last1 = Lewis | first1 = Herbert | date = June 2001 | title = The Passion of Franz Boas | journal = American Anthropologist | volume = 103 | issue = 2| pages = 447–467 | doi=10.1525/aa.2001.103.2.447}}"
# print("input: ")
# print(test_cite1)
# test_res1 = main3(test_cite1)
# print("output: ")
# print(test_res1)

In [57]:
# %%time
# test_cite2 = "{{Akademik dergi kaynağı|başlık=Toxicologic and Epidemiologic Clues from the Characterization of the 1952 London Smog Fine Particulate Matter in Archival Autopsy Lung Tissues Hunt|yazarlar=Andrew|sayı=9|sayfalar=1209-14|çalışma=Environmental Health Perspectives|yıl=2003|cilt=111|pmc=1241576|pmid=12842775|doi=10.1289/ehp.6114}}"
# print("input: ")
# print(test_cite2)
# test_res2 = main3(test_cite2)
# print("output: ")
# print(test_res2)

In [58]:
# test_cite3 = "{{Akademik dergi kaynağı |soyadı= Roux|ad= Stanley J.|başlık=Ca<sup>2+</sup> and Phytochrome Action in Plants|dergi=BioScience|yıl=1984|cilt=34|sayı=1|jstor=1309422 |doi= 10.2307/1309422| pmid = 11540810 |sayfalar=25-29}}"
# print("input: ")
# print(test_cite3)
# test_res3 = main3(test_cite3)
# print("output: ")
# print(test_res3)

In [59]:
# test_cite4 = "{{Akademik dergi kaynağı|başlık=Results of TV imaging of phobos (experiment VSK-FREGAT)|yazarlar=Avanesov|sayı=1–2|sayfalar=281-295|çalışma=Planetary and Space Science|yayıncı=Elsevier|yıl=1991|cilt=39|pmid=11538495|doi=10.1016/0032-0633(91)90150-9}}"
# print("input: ")
# print(test_cite4)
# test_res4 = main3(test_cite4)
# print("output: ")
# print(test_res4)

In [60]:
# test_cite5 = "{{Akademik dergi kaynağı|url=|başlık=Large-scale analysis of the yeast proteome by multidimensional protein identification technology|erişimtarihi=|yazarlar=Washburn|tarih=Mart 2001|sayı=3|dil=En|sayfalar=242-247|çalışma=Nature Biotechnology|yayıncı=|cilt=19}}"
# print("input: ")
# print(test_cite5)
# test_res5 = main3(test_cite5)
# print("output: ")
# print(test_res5)

In [61]:
# test_cite6 = "{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}"
# print("input: ")
# print(test_cite6)
# test_res6 = main3(test_cite6)
# print("output: ")
# print(test_res6)

In [62]:
# test_cite7 = "{{Akademik dergi kaynağı|başlık=Lifetime probability of developing lung cancer, by smoking status, Canada|sayı=6|sayfalar=385-8|çalışma=Canadian Journal of Public Health|yıl=1994|cilt=85|pmid=7895211}}"
# print("input: ")
# print(test_cite7)
# test_res7 = main3(test_cite7)
# print("output: ")
# print(test_res7)

## Run Program on Citation Data

In [63]:
sheet_cite_id = "1ih5bIk5_d5WLEtArRzEFPzPlwZVA_BI-O8kSxWH1rNU"
sheet_cite_name = "Potential_citations_data"
url_cite = f"https://docs.google.com/spreadsheets/d/{sheet_cite_id}/gviz/tq?tqx=out:csv&sheet={sheet_cite_name}"

In [64]:
citations_df = pd.read_csv(url_cite)
citations_df.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids
0,Uzay Araştırma ve Teknolojisi Enstitüsü,{{Akademik dergi kaynağı|başlık=Results of TV ...,Planetary and Space Science,sim_planetary-and-space-science,1991,1991,0,39,12.0,Results of TV imaging of phobos (experiment VS...,281.0,,sim_planetary-and-space-science,Planetary and Space Science,,1959,2002.0,"['sim_planetary-and-space-science_1991', 'sim_..."
1,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,12,39,15.0,Partial solubility parameters of chlorobenzene...,2607.0,,sim_chemosphere,Chemosphere,,1972,2003.0,"['sim_chemosphere_1999-12', 'sim_chemosphere_1..."
2,İz fosili,{{akademik dergi kaynağı\n | yazar = Seilacher...,Marine Geology,sim_marine-geology,1967,1967,0,5,56.0,Bathymetry of trace fossils,413.0,,sim_marine-geology,Marine Geology,,1964,2000.0,"['sim_marine-geology_1967', 'sim_marine-geolog..."
3,1900’de Arjantin,{{Akademik dergi kaynağı |başlık=ARGENTINA. Re...,Public Health Reports,sim_public-health-reports,1900-06,1900,6,15,24.0,ARGENTINA. Republic declared free from plague,1504.0,,sim_public-health-reports,Public Health Reports,,1878,2015.0,"['sim_public-health-reports_1900-06', 'sim_pub..."
4,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,0,23,8.0,5-HT1F receptor agonists in acute migraine tre...,776.0,,sim_cephalalgia,Cephalalgia,,1989,2004.0,"['sim_cephalalgia_2003', 'sim_cephalalgia_2003..."


In [186]:
sample = citations_df.sample(n = 50)
sample.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids
524,Metaller ile ametalleri ayıran çizgi,{{akademik dergi kaynağı |ilk=A. L. |son=Horva...,Journal of Chemical Education,sim_journal-of-chemical-education,1973,1973,0,50,5.0,Critical temperature of elements and the perio...,335.0,,sim_journal-of-chemical-education,Journal of Chemical Education,,1924,2014.0,"['sim_journal-of-chemical-education_1973', 'si..."
388,Andrea M. Ghez,{{Akademik dergi kaynağı|başlık=High Proper Mo...,Astrophysical Journal,sim_astrophysical-journal,1998,1998,0,509,2.0,High Proper Motions in the Vicinity of Sgr A*:...,678.0,,sim_astrophysical-journal,The Astrophysical Journal,,1895,2009.0,"['sim_astrophysical-journal_1998', 'sim_astrop..."
175,Botanik,{{Akademik dergi kaynağı | soyadı1= Scharleman...,Science,sim_science,2008,2008,0,319,5859.0,How Green are Biofuels?,43.0,,sim_science,Science,,1883,2016.0,"['sim_science_2008', 'sim_science_2008', 'sim_..."
155,5'-Guanilil imidodifosfat,{{Akademik dergi kaynağı|başlık=Requirement of...,Science,sim_science,1991,1991,0,252,5009.0,Requirement of GTP hydrolysis for dissociation...,1171.0,,sim_science,Science,,1883,2016.0,"['sim_science_1991', 'sim_science_1991', 'sim_..."
919,Evanesan dalga,{{Akademik dergi kaynağı |ad1=R. |soyadı1=Coll...,IEEE Transactions on Antennas and Propagation,sim_ieee-transactions-on-antennas-and-propagation,1964,1964,0,12,1.0,Evaluation of antenna Q,23.0,,sim_ieee-transactions-on-antennas-and-propagation,IEEE Transactions on Antennas and Propagation,,1952,2002.0,['sim_ieee-transactions-on-antennas-and-propag...


In [196]:
sample_input_list = sample['c'].tolist()
print(len(sample_input_list))

50


In [248]:
# url_count_total = 0
# url_count_good = 0
# textfile = open("citations_good.txt", "w")
# for cite_element in sample_input_list:
#     citation_new = main2(cite_element)
#     if citation_new != "":
#         textfile.write(citation_new + "\n")
#         url_count_good += 1
#     url_count_total += 1
    
#     print("total so far " +  str(url_count_total))
#     print("good so far " + str(url_count_good))

# textfile.close()
# print("There are a total of " + str(url_count_total) + " urls")
# print("File should have " + str(url_count_good) + " urls")

In [198]:
citations_1_150 = citations_df[1:150]
citations_1_150.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids
1,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,12,39,15.0,Partial solubility parameters of chlorobenzene...,2607.0,,sim_chemosphere,Chemosphere,,1972,2003.0,"['sim_chemosphere_1999-12', 'sim_chemosphere_1..."
2,İz fosili,{{akademik dergi kaynağı\n | yazar = Seilacher...,Marine Geology,sim_marine-geology,1967,1967,0,5,56.0,Bathymetry of trace fossils,413.0,,sim_marine-geology,Marine Geology,,1964,2000.0,"['sim_marine-geology_1967', 'sim_marine-geolog..."
3,1900’de Arjantin,{{Akademik dergi kaynağı |başlık=ARGENTINA. Re...,Public Health Reports,sim_public-health-reports,1900-06,1900,6,15,24.0,ARGENTINA. Republic declared free from plague,1504.0,,sim_public-health-reports,Public Health Reports,,1878,2015.0,"['sim_public-health-reports_1900-06', 'sim_pub..."
4,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,0,23,8.0,5-HT1F receptor agonists in acute migraine tre...,776.0,,sim_cephalalgia,Cephalalgia,,1989,2004.0,"['sim_cephalalgia_2003', 'sim_cephalalgia_2003..."
5,Aaron Halfaker,{{Akademik dergi kaynağı|başlık=The Rise and D...,American Behavioral Scientist,sim_american-behavioral-scientist,2012-12,2012,12,57,5.0,The Rise and Decline of an Open Collaboration ...,664.0,,sim_american-behavioral-scientist,The American Behavioral Scientist,,1957,2016.0,"['sim_american-behavioral-scientist_2012-12', ..."


In [199]:
%%capture cap --no-stderr

citations_1_150['generated_url'] = citations_1_150['c'].apply(main2)

with open('output_log_1_150.txt', 'w') as f:
    f.write(cap.stdout)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [200]:
citations_1_150.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids,generated_url
1,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,12,39,15.0,Partial solubility parameters of chlorobenzene...,2607.0,,sim_chemosphere,Chemosphere,,1972,2003.0,"['sim_chemosphere_1999-12', 'sim_chemosphere_1...",
2,İz fosili,{{akademik dergi kaynağı\n | yazar = Seilacher...,Marine Geology,sim_marine-geology,1967,1967,0,5,56.0,Bathymetry of trace fossils,413.0,,sim_marine-geology,Marine Geology,,1964,2000.0,"['sim_marine-geology_1967', 'sim_marine-geolog...",
3,1900’de Arjantin,{{Akademik dergi kaynağı |başlık=ARGENTINA. Re...,Public Health Reports,sim_public-health-reports,1900-06,1900,6,15,24.0,ARGENTINA. Republic declared free from plague,1504.0,,sim_public-health-reports,Public Health Reports,,1878,2015.0,"['sim_public-health-reports_1900-06', 'sim_pub...",
4,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,0,23,8.0,5-HT1F receptor agonists in acute migraine tre...,776.0,,sim_cephalalgia,Cephalalgia,,1989,2004.0,"['sim_cephalalgia_2003', 'sim_cephalalgia_2003...",
5,Aaron Halfaker,{{Akademik dergi kaynağı|başlık=The Rise and D...,American Behavioral Scientist,sim_american-behavioral-scientist,2012-12,2012,12,57,5.0,The Rise and Decline of an Open Collaboration ...,664.0,,sim_american-behavioral-scientist,The American Behavioral Scientist,,1957,2016.0,"['sim_american-behavioral-scientist_2012-12', ...",


In [201]:
citations_1_150['sim_id'].value_counts()

sim_science                                         67
sim_isis                                            18
sim_journal-of-personality-and-social-psychology    14
sim_developmental-psychology                         9
sim_child-development                                7
sim_journal-of-mammalogy                             5
sim_american-historical-review                       4
sim_european-journal-of-social-psychology            4
sim_classical-quarterly                              3
sim_church-history                                   2
sim_social-psychology-quarterly                      2
sim_social-indicators-research                       1
sim_chemosphere                                      1
sim_cinema-journal                                   1
sim_off-our-backs                                    1
sim_american-journal-of-physiology                   1
sim_plant-disease                                    1
sim_canadian-journal-of-public-health                1
sim_journa

In [202]:
citations_1_150_good = citations_1_150[citations_1_150['generated_url'] != ""]
citations_1_150_good['sim_id'].value_counts()

sim_science                                  64
sim_american-historical-review                4
sim_european-journal-of-social-psychology     4
sim_classical-quarterly                       2
sim_social-psychology-quarterly               2
sim_american-journal-of-physiology            1
sim_economica                                 1
sim_off-our-backs                             1
Name: sim_id, dtype: int64

In [203]:
citations_1_150_bad = citations_1_150[citations_1_150['generated_url'] == ""]
citations_1_150_bad['sim_id'].value_counts()

sim_isis                                            18
sim_journal-of-personality-and-social-psychology    14
sim_developmental-psychology                         9
sim_child-development                                7
sim_journal-of-mammalogy                             5
sim_science                                          3
sim_church-history                                   2
sim_canadian-journal-of-public-health                1
sim_classical-quarterly                              1
sim_marine-geology                                   1
sim_cephalalgia                                      1
sim_american-behavioral-scientist                    1
sim_public-health-reports                            1
sim_journal-of-american-college-health               1
sim_plant-disease                                    1
sim_social-indicators-research                       1
sim_cinema-journal                                   1
sim_adolescence                                      1
sim_chemos

In [231]:
print("Total citation count: " + str(citations_1_150.shape[0]))
print("Has urls: " + str(citations_1_150_good.shape[0]))
print("No urls: " + str(citations_1_150_bad.shape[0]))

Total citation count: 149
Has urls: 79
No urls: 70


##### I manually explored these data points
- sim_isis                                            [can't find]
- sim_journal-of-personality-and-social-psychology    [can't find]
- sim_developmental-psychology                        [can't find]
- sim_child-development                               [can't find]
- sim_journal-of-mammalogy                            [can't find]
- sim_science                                         [incomplete citation, don't have year/volume]
- sim_church-history                                  [can't find]
- sim_canadian-journal-of-public-health               [can't find year]
- sim_classical-quarterly                             [strange format]
- sim_marine-geology                                  [can't find]
- sim_cephalalgia                                     [can't find]
- sim_american-behavioral-scientist                   [can't find]
- sim_public-health-reports                           [can't find year]
- sim_journal-of-american-college-health              [can't find]
- sim_plant-disease                                   [can't find]
- sim_social-indicators-research                      [can't find year]
- sim_cinema-journal                                  [can't find]
- sim_adolescence                                     [incomplete citation]
- sim_chemosphere                                     [can't find]

In [232]:
citations_150_500 = citations_df[150:500]
citations_150_500.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids
150,Tarih öncesi insan kronolojisi,{{Akademik dergi kaynağı | soyadı=Wu | ad=Xiao...,Science,sim_science,2012-06,2012,6,336,6089.0,"Early Pottery at 20,000 Years Ago in Xianrendo...",1696.0,,sim_science,Science,,1883,2016.0,"['sim_science_2012-06', 'sim_science_2012', 's..."
151,Tat alma,{{Akademik dergi kaynağı|soyadı=Miller|ad=Greg...,Science,sim_science,2011-09,2011,9,333,6047.0,"Sweet here, salty there: Evidence of a taste m...",1213.0,,sim_science,Science,,1883,2016.0,"['sim_science_2011-09', 'sim_science_2011', 's..."
152,Tyrannosauroidea,{{Akademik dergi kaynağı|başlık=The evolution ...,Science,sim_science,1999,1999,0,284,5423.0,The evolution of dinosaurs,2137.0,,sim_science,Science,,1883,2016.0,"['sim_science_1999', 'sim_science_1999', 'sim_..."
153,"Waset (World Academy of Science, Engineering a...",{{Akademik dergi kaynağı|başlık=Who's Afraid o...,Science,sim_science,2013-10,2013,10,342,6154.0,Who's Afraid of Peer Review?,60.0,,sim_science,Science,,1883,2016.0,"['sim_science_2013-10', 'sim_science_2013', 's..."
154,Yeniden ağaçlandırma,{{Akademik dergi kaynağı | soyadı = Wood well ...,Science,sim_science,1988-12,1988,12,242,4885.0,CO<sub>2</sub> Reduction and Reforestation,1493.0,,sim_science,Science,,1883,2016.0,"['sim_science_1988-12', 'sim_science_1988', 's..."


In [233]:
%%capture cap --no-stderr

citations_150_500['generated_url'] = citations_150_500['c'].apply(main2)

with open('output_log_150_500.txt', 'w') as f:
    f.write(cap.stdout)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [234]:
citations_150_500.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids,generated_url
150,Tarih öncesi insan kronolojisi,{{Akademik dergi kaynağı | soyadı=Wu | ad=Xiao...,Science,sim_science,2012-06,2012,6,336,6089.0,"Early Pottery at 20,000 Years Ago in Xianrendo...",1696.0,,sim_science,Science,,1883,2016.0,"['sim_science_2012-06', 'sim_science_2012', 's...",https://archive.org/details/sim_science_2012-0...
151,Tat alma,{{Akademik dergi kaynağı|soyadı=Miller|ad=Greg...,Science,sim_science,2011-09,2011,9,333,6047.0,"Sweet here, salty there: Evidence of a taste m...",1213.0,,sim_science,Science,,1883,2016.0,"['sim_science_2011-09', 'sim_science_2011', 's...",https://archive.org/details/sim_science_2011-0...
152,Tyrannosauroidea,{{Akademik dergi kaynağı|başlık=The evolution ...,Science,sim_science,1999,1999,0,284,5423.0,The evolution of dinosaurs,2137.0,,sim_science,Science,,1883,2016.0,"['sim_science_1999', 'sim_science_1999', 'sim_...",https://archive.org/details/sim_science_1999-0...
153,"Waset (World Academy of Science, Engineering a...",{{Akademik dergi kaynağı|başlık=Who's Afraid o...,Science,sim_science,2013-10,2013,10,342,6154.0,Who's Afraid of Peer Review?,60.0,,sim_science,Science,,1883,2016.0,"['sim_science_2013-10', 'sim_science_2013', 's...",https://archive.org/details/sim_science_2013-1...
154,Yeniden ağaçlandırma,{{Akademik dergi kaynağı | soyadı = Wood well ...,Science,sim_science,1988-12,1988,12,242,4885.0,CO<sub>2</sub> Reduction and Reforestation,1493.0,,sim_science,Science,,1883,2016.0,"['sim_science_1988-12', 'sim_science_1988', 's...",https://archive.org/details/sim_science_1988-1...


In [235]:
citations_150_500_good = citations_150_500[citations_150_500['generated_url'] != ""]
citations_150_500_good['sim_id'].value_counts()

sim_science                                              35
sim_american-journal-of-public-health                    11
sim_applied-cognitive-psychology                          8
sim_journal-of-cognitive-neuroscience                     7
sim_astrophysical-journal                                 7
sim_american-journal-of-human-genetics                    7
sim_archives-of-sexual-behavior                           6
sim_annals-of-human-genetics                              5
sim_circulation                                           5
sim_animal-behaviour                                      4
sim_chemical-reviews                                      4
sim_american-ethnologist                                  3
sim_gender-society                                        3
sim_environmental-health-perspectives                     3
sim_journal-of-biological-rhythms                         2
sim_european-journal-of-political-research                2
sim_folklore                            

In [238]:
citations_150_500_bad = citations_150_500[citations_150_500['generated_url'] == ""]
citations_150_500_bad['sim_id'].value_counts()

sim_psychological-review                                 15
sim_psychological-science                                11
sim_the-lancet                                           11
sim_molecular-biology-and-evolution                       9
sim_scientific-american                                   9
sim_jama                                                  9
sim_journal-of-near-eastern-studies                       8
sim_chemical-reviews                                      8
sim_journal-of-the-american-oriental-society              8
sim_journal-of-experimental-psychology                    8
sim_psychological-bulletin                                8
sim_international-journal-of-eating-disorders             7
sim_physical-review-letters                               6
sim_neuropsychologia                                      6
sim_chest                                                 5
sim_forensic-science-international                        5
sim_psychology-and-aging                

In [239]:
print("Total citation count: " + str(citations_150_500.shape[0]))
print("Has urls: " + str(citations_150_500_good.shape[0]))
print("No urls: " + str(citations_150_500_bad.shape[0]))

Total citation count: 350
Has urls: 139
No urls: 211


### Join Dataframes together

In [240]:
citations_1_500 = citations_1_150.append(citations_150_500, ignore_index = True)
citations_1_500.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids,generated_url
0,"1,3,5-Triklorobenzen",{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...,Chemosphere,sim_chemosphere,1999-12,1999,12,39,15.0,Partial solubility parameters of chlorobenzene...,2607.0,,sim_chemosphere,Chemosphere,,1972,2003.0,"['sim_chemosphere_1999-12', 'sim_chemosphere_1...",
1,İz fosili,{{akademik dergi kaynağı\n | yazar = Seilacher...,Marine Geology,sim_marine-geology,1967,1967,0,5,56.0,Bathymetry of trace fossils,413.0,,sim_marine-geology,Marine Geology,,1964,2000.0,"['sim_marine-geology_1967', 'sim_marine-geolog...",
2,1900’de Arjantin,{{Akademik dergi kaynağı |başlık=ARGENTINA. Re...,Public Health Reports,sim_public-health-reports,1900-06,1900,6,15,24.0,ARGENTINA. Republic declared free from plague,1504.0,,sim_public-health-reports,Public Health Reports,,1878,2015.0,"['sim_public-health-reports_1900-06', 'sim_pub...",
3,5-HT-Reseptörü,"{{Akademik dergi kaynağı|yazar=Ramadan NM, Skl...",Cephalalgia,sim_cephalalgia,2003,2003,0,23,8.0,5-HT1F receptor agonists in acute migraine tre...,776.0,,sim_cephalalgia,Cephalalgia,,1989,2004.0,"['sim_cephalalgia_2003', 'sim_cephalalgia_2003...",
4,Aaron Halfaker,{{Akademik dergi kaynağı|başlık=The Rise and D...,American Behavioral Scientist,sim_american-behavioral-scientist,2012-12,2012,12,57,5.0,The Rise and Decline of an Open Collaboration ...,664.0,,sim_american-behavioral-scientist,The American Behavioral Scientist,,1957,2016.0,"['sim_american-behavioral-scientist_2012-12', ...",


In [242]:
citations_1_500_good = citations_1_500[citations_1_500['generated_url'] != ""]

In [243]:
citations_1_500_bad = citations_1_500[citations_1_500['generated_url'] == ""]

In [244]:
print("Total citation count: " + str(citations_1_500.shape[0]))
print("Has urls: " + str(citations_1_500_good.shape[0]))
print("No urls: " + str(citations_1_500_bad.shape[0]))

Total citation count: 499
Has urls: 218
No urls: 281


In [246]:
citations_1_500_bad['sim_id'].value_counts()

sim_isis                                            18
sim_psychological-review                            15
sim_journal-of-personality-and-social-psychology    14
sim_psychological-science                           11
sim_the-lancet                                      11
                                                    ..
sim_journal-of-anxiety-disorders                     1
sim_classical-quarterly                              1
sim_physiological-reviews                            1
sim_international-journal-of-obesity                 1
sim_chemosphere                                      1
Name: sim_id, Length: 77, dtype: int64


### Write data to google sheet

In [223]:
gc = gspread.service_account("service_account.json")
sh = gc.open("Citations SIM Test")
print(sh.sheet1.get('A1'))

[['a']]


In [229]:
# ACCESS GOOGLE SHEET
gc = gspread.service_account(filename='service_account.json')
sh = gc.open_by_key('1ih5bIk5_d5WLEtArRzEFPzPlwZVA_BI-O8kSxWH1rNU')
worksheet = sh.get_worksheet(1) #-> 0 - first sheet, 1 - second sheet etc. 

# APPEND DATA TO SHEET
df_to_write = citations_1_150[['a', 'c', 'journal', 'year', 'volume', 'issue', 'title', 'page', 'generated_url']]
set_with_dataframe(worksheet, df_to_write) #-> THIS EXPORTS YOUR DATAFRAME TO THE GOOGLE SHEET

In [222]:
# no issue number in citation 
citations_1_150_bad[citations_1_150_bad['sim_id'] == "sim_science"]

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids,generated_url
84,Desulforudis audaxviator,"{{Akademik dergi kaynağı| yazarlar=Chivian D, ...",Science,sim_science,2008,2008,0,322,,Environmental genomics reveals a single-specie...,275.0,,sim_science,Science,,1883,2016.0,"['sim_science_2008', 'sim_science_2008', 'sim_...",
120,B12 vitamini,{{Akademik dergi kaynağı | vauthors = Eschenmo...,Science,sim_science,1977-06,1977,6,196,4297.0,Natural product synthesis and vitamin B12,1410.0,,sim_science,Science,,1883,2016.0,"['sim_science_1977-06', 'sim_science_1977', 's...",
128,Kısa süreli bellek,{{Akademik dergi kaynağı|başlık=Dynamic shifts...,Science,sim_science,2008,2008,0,321,5890.0,Dynamic shifts of limited working memory resou...,851.0,,sim_science,Science,,1883,2016.0,"['sim_science_2008', 'sim_science_2008', 'sim_...",


In [204]:
citations_1_150_bad_sim_isis = citations_1_150_bad[citations_1_150_bad['sim_id'] == "sim_isis"]
citations_1_150_bad_sim_isis.head()

Unnamed: 0,a,c,journal,sim_id,date,year,month,volume,issue,title,page,url,PubIssueID,Title,NA Gaps,First Volume,Last Volume,generated_ids,generated_url
59,Asger Aaboe,{{Akademik dergi kaynağı |doi= 10.1086/529271 ...,Isis,sim_isis,2007,2007,0,98,4.0,Eloge: Asger Hartvig Aaboe (1922–2007),796.0,,sim_isis,Isis,,1913,2014.0,"['sim_isis_2007', 'sim_isis_2007', 'sim_isis_2...",
60,Sara B. Pritchard,{{Akademik dergi kaynağı|url=|başlık=Review of...,Isis,sim_isis,2011-12,2011,12,102,,Review of Confluence: The Nature of Technology...,809.0,,sim_isis,Isis,,1913,2014.0,"['sim_isis_2011-12', 'sim_isis_2011', 'sim_isi...",
61,André-Louis Debierne,{{Akademik dergi kaynağı|başlık=The Discovery ...,Isis,sim_isis,1971,1971,0,62,3.0,The Discovery of Actinium,290.0,,sim_isis,Isis,,1913,2014.0,"['sim_isis_1971', 'sim_isis_1971', 'sim_isis_1...",
62,Eratosthenes,{{Akademik dergi kaynağı |ad1=D. H. |soyadı1=F...,Isis,sim_isis,1983,1983,0,74,274.0,Eratosthenes' ratio for the obliquity of the e...,556.0,,sim_isis,Isis,,1913,2014.0,"['sim_isis_1983', 'sim_isis_1983', 'sim_isis_1...",
63,Eratosthenes,{{Akademik dergi kaynağı |ad=D. |soyadı=Rawlin...,Isis,sim_isis,1982,1982,0,73,2.0,Eratosthenes' geodesy unraveled : was there a ...,259.0,,sim_isis,Isis,,1913,2014.0,"['sim_isis_1982', 'sim_isis_1982', 'sim_isis_1...",


In [249]:
# # Journal, year, volume, _index
# url_count_total = 0
# url_count_good = 0

# url_good_lst = []

# for cite_element in citations_lst:
#     url_new = main(cite_element)
#     if url_new != "":
#         url_good_lst.append(url_new)
#         url_count_good += 1
#     url_count_total += 1
    
#     print("total so far " +  str(url_count_total))
#     print("good so far " + str(url_count_good))

# print("There are a total of " + str(url_count_total) + " urls")
# print("File should have " + str(url_count_good) + " urls")
# print(url_good_lst)

In [250]:
# # Journal, year, volume, _index
# url_count_total = 0
# url_count_good = 0
# textfile = open("citations_good.txt", "w")
# for cite_element in citations_lst:
#     citation_new = main(cite_element)
#     if citation_new != "":
#         textfile.write(citation_new + "\n")
#         url_count_good += 1
#     url_count_total += 1
# textfile.close()
# print("There are a total of " + str(url_count_total) + " urls")
# print("File should have " + str(url_count_good) + " urls")

## Draft Attempts for loading files with cookies 

In [None]:
# ### Establish a session for the requests

# curSession = requests.Session() 
# # all cookies received will be stored in the session 

# payload={'username': "gracec@archive.org",'password': "graceCXY"}
# res1 = curSession.post(firstUrl, data=payload)
# print("response 1")
# print(res1.status_code)
# print(res1.text)
# # internally return your expected cookies, can use for following auth

# # # internally use previously generated cookies, can access the resources
# secondUrl = "https://archive.org/advancedsearch.php?q=sim_nature-biotechnology_2001-03_19_3&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes"
# res2 = curSession.get(secondUrl, cookies = res1.cookies)
# print("response 2")
# print(res2.text)
# # curSession.get(thirdUrl)

In [None]:
# s = requests.Session() 
# # all cookies received will be stored in the session object
# payload = {'username':'gracec@archive.org','password': 'graceCXY'}
# res1 = s.post('https://archive.org/account/login',data=payload)
# print(res1.text)
# res2 = s.get('https://archive.org/advancedsearch.php?q=sim_australian-journal-of-botany_1996_44_3&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&callback=callback&save=yes&output=json')
# print(res2.text)

In [None]:
# import requests
# # login_data =  {'username':'gracec@archive.org','password': 'graceCXY', 'remember':'True'}
# login_data = {'username': 'gracec._', 'password': 'gracec'}
# r = requests.post("https://www.instagram.com/", login_data)

# print(r.status_code)
# print(r.cookies)

# # r2 = requests.get('https://localhost/profile_data.json', ...)

In [None]:
# s = requests.Session()
# s.post(firstUrl, payload)
# #logged in! cookies saved for future requests.
# r2 = s.get(secondUrl, )
# print(r2.text)
# print(r2.cookies)
# #cookies sent automatically!
# #do whatever, s will keep your cookies intact :)

In [None]:
# import requests, pickle
# session = requests.session()

# with open('somefile', 'wb') as f:
#     pickle.dump(session.cookies, f)

In [None]:
# import requests

# # Fill in your details here to be posted to the login form.
# payload={'username': "gracec@archive.org",'password': "graceCXY"}


# # Use 'with' to ensure the session context is closed after use.
# with requests.Session() as s:
#     p = s.post(firstUrl, data=payload)
#     # print the html returned or something more intelligent to see if it's a successful login page.
# #     print(p.text)

#     # An authorised request.
#     r = s.get(secondUrl)
#     print(r.text)

In [None]:
# import pickle
# def save_cookies(requests_cookiejar, filename):
#     with open(filename, 'w', encoding='UTF-8') as f:
#         pickle.dump(requests_cookiejar, f)

# def load_cookies(filename):
#     with open(filename, 'r') as f:
#         return pickle.load(f)


# filename = "cookies.txt"
# firstUrl = "https://archive.org/account/login"
# secondUrl = "https://archive.org/advancedsearch.php?q=sim_nature-biotechnology_2001-03_19_3&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes"
# payload={'username': "gracec@archive.org",'password': "graceCXY"}

# #save cookies
# resp1 = requests.post(firstUrl, data=payload)
# print(resp1.cookies)
# save_cookies(resp1.cookies, filename)

# #load cookies and do a request
# resp2 = requests.get(secondUrl, cookies=load_cookies(filename))
# print(resp2.text)

In [251]:
# ### working
# headers = {
#     'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
# }

# login_data = {
#     'username': 'email',
#     'password': 'password',
#     'remember': 'true',
#     'referer': 'https://archive.org/advancedsearch.php',
#     'submit-to-login': 'Log in'
# }

# # s = requests.Session()
# with requests.Session() as s:
#     url = "https://archive.org/account/login"
#     r = s.get(url, headers = headers)
#     soup = BeautifulSoup(r.content, "html.parser")
#     login_data['login'] = soup.find('input', attrs = {'name':'login'})['value']

#     r = s.post(url, data = login_data, headers = headers)

#     url2 = "https://archive.org/advancedsearch.php?q=sim_nature-biotechnology_2001-03_19_3&fl%5B%5D=identifier&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes"
#     r = s.get(url2, data = login_data, headers = headers)

#     print(r.text)