# Autourls 

Goal: 
- https://en.wikipedia.org/wiki/User:GreenC/testcases/autourl 
- we don't want to overwrite auto generated urls

### Approach 1
- remove url and doi
- not reliable to changing conditions

### Approach 2 
- webscrape wikipedia page
- find location of citation 
- check if there is autogeneration 
- takes too long

### Approach 3
- use wikimedia's parse endpoint 
   - example: https://en.wikipedia.org/w/api.php?action=parse&text=%7B%7Bcite%20journal%20%7Ctitle%3DThe%20Discodermia%20calyx%20Toxin%20Calyculin%20A%20%7Clast1%3DEdelson%20%7Cfirst1%3DJessica%20R.%20%7Clast2%3DBrautigan%20%7Cfirst2%3DDavid%20L.%20%7Cdate%3D24%20January%202011%20%7Cjournal%3DToxins%20%7Cvolume%3D3%20%7Cissue%3D1%20%7Cpages%3D105%E2%80%93119%20%7Cdoi%3D10.3390%2Ftoxins3010105%20%7Cdoi-access%3Dfree%20%7Cpmid%3D22069692%20%7Cpmc%3D3210456%7D%7D&contentmodel=wikitext
- convert the template into HTML 
- find link in HTML
- the most stable

In [1]:
import requests
import urllib.parse
import json

In [2]:
### perform http request 
# INPUT:
## language: wikipedia language (ex.en, tr)
## citation: the citation input ({{cite journal}})
## verbose: debug mode 
# OUTPUT:
## json object 
def get_wikimedia_json(language, citation, verbose = False):
    
    ### build url
    url_header = "https://" + language + ".wikipedia.org/w/api.php?action=parse&text="
    url_content = urllib.parse.quote(citation, safe = "")
    url_param = "&contentmodel=wikitext&format=json"
    
    url = url_header + url_content + url_param
    
    ### debug
    if verbose: print(url)
    
    ### make http request
    response = requests.get(url, timeout = 20)
    if response.status_code != 200:
        return ""
    
    res_json = json.loads(response.text)
    return res_json

In [3]:
test_cite = "{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}"
test_json = get_wikimedia_json("en", test_cite, True)

https://en.wikipedia.org/w/api.php?action=parse&text=%7B%7Bcite%20journal%20%7Ctitle%3DThe%20Discodermia%20calyx%20Toxin%20Calyculin%20A%20%7Clast1%3DEdelson%20%7Cfirst1%3DJessica%20R.%20%7Clast2%3DBrautigan%20%7Cfirst2%3DDavid%20L.%20%7Cdate%3D24%20January%202011%20%7Cjournal%3DToxins%20%7Cvolume%3D3%20%7Cissue%3D1%20%7Cpages%3D105%E2%80%93119%20%7Cdoi%3D10.3390%2Ftoxins3010105%20%7Cdoi-access%3Dfree%20%7Cpmid%3D22069692%20%7Cpmc%3D3210456%7D%7D&contentmodel=wikitext&format=json


In [5]:
## un-comment to see result
# test_json

In [6]:
import re

In [7]:
### find html element strings with href 
# INPUT:
## json: json object returned by wikipedia parse
# OUTPUT:
## list of html strings with href 
def find_html_lst_from_json(json, verbose = False):
    html_str = json["parse"]["text"]["*"]
        
    if verbose: print(html_str)
        
    html_tags = [m.span() for m in re.finditer(r'<[^>]*>', html_str)]
    
    if verbose: print(html_tags)
    has_href = []
    for t_loc in html_tags:
        start = t_loc[0]
        end = t_loc[1]
        substr = html_str[start:end]
        if "href" in substr:
            has_href.append(substr)
    
    return has_href

In [8]:
test_html_lst = find_html_lst_from_json(test_json)
test_html_lst

['<a rel="nofollow" class="external text" href="//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456">',
 '<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">',
 '<a rel="nofollow" class="external text" href="https://doi.org/10.3390%2Ftoxins3010105">',
 '<a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">',
 '<a rel="nofollow" class="external text" href="//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456">',
 '<a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">',
 '<a rel="nofollow" class="external text" href="//pubmed.ncbi.nlm.nih.gov/22069692">']

In [55]:
### find html element strings with href 
# INPUT:
## list of html strings with href 
# OUTPUT:
## list of urls
def find_urls(html_lst, verbose = False):
    
    urls = []
    for element in html_lst:
        element = re.sub("<", "", element)
        element = re.sub(">", "", element)
        attr_lst = element.split()
        
        if verbose: print(attr_lst)
            
        for attr in attr_lst:
            
            if "=" in attr:
                field_name = attr.split("=")[0].strip()
                field_content = attr.split("=")[1].strip()
                
                if verbose:
                    print(attr)
                    print(field_name)
                    print(field_content)
                    
                if "href" == field_name or "href" in field_name:
                    
                    if verbose: print("it's href")
                    
#                     url_regex = "(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}"
#                     url_regex += "|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}" 
#                     url_regex += "|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}" 
#                     url_regex += "|www\.[a-zA-Z0-9]+\.[^\s]{2,}(\/\[a-zA-Z0-9]+){1,}"
#                     url_regex += "|[a-zA-Z0-9]+\.[^\s]{2,}(\/\[a-zA-Z0-9]+){1,})" 
                    
                    url_regex = r"\/\/[a-zA-Z0-9]+\.[^\s]{2,}"
        
                    if re.search(url_regex, field_content):
                
                        if verbose: 
                            print("match")
#                             print(field_content)
                        
                        urls.append(field_content)
                        break
                    else: 
                        if verbose: print('href content is not url')
                        
    return urls
                

In [56]:
url_regex = "\/\/[a-zA-Z0-9]+\.[^\s]{2,}"
field_content = "//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456"
if re.search(url_regex, field_content):
    print("yay")
else: 
    print("nay")

yay


In [58]:
find_urls(test_html_lst)

['"//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456"',
 '"https://doi.org/10.3390%2Ftoxins3010105"',
 '"//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456"',
 '"//pubmed.ncbi.nlm.nih.gov/22069692"']

In [59]:
### Main function to checking if auto url exists
# Input:
## language: wikipedia language (ex.en, tr)
## citation: the citation input ({{cite journal}})
# Output:
## Boolean: True or False

def autourl_exists(citation, language = "en", verbose = False):
    
    res_json = get_wikimedia_json(language, citation, verbose)
    
    html_lst = find_html_lst_from_json(res_json, verbose)
    
    urls = find_urls(html_lst, verbose)
    
    if urls == []:
        return False
    
    return True
        

In [60]:
autourl_exists(test_cite)

True

### Mass test this

In [62]:
import pandas as pd



In [66]:
journal_dump = pd.read_json("turkish_wiki_0621_first_3000_with_doi.json", lines = True)
journal_dump.head()

Unnamed: 0,a,u,c
0,1 + 2 + 3 + 4 + · · ·,,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...
1,12 Victoria,,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...
2,"1,3,5-Triklorobenzen",,{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...
3,141 Likya depremi,,{{Akademik dergi kaynağı | url=http://blackmed...
4,141 Likya depremi,,{{Akademik dergi kaynağı | url=http://www.nat-...


In [69]:
df_100 = journal_dump.sample(n = 100)
df_100.shape

(100, 3)

In [70]:
%%time

df_100["has_autourl"] = df_100["c"].apply(lambda x: autourl_exists(x, "tr"))

CPU times: user 2.39 s, sys: 229 ms, total: 2.62 s
Wall time: 55.4 s


In [72]:
df_100.head()

Unnamed: 0,a,u,c,has_autourl
148,Aşı karşıtlığı,,{{Akademik dergi kaynağı|başlık=Association Be...,True
312,Bordigizm,,{{Akademik dergi kaynağı|soyadı1=Goldner|ad1=L...,True
130,Aristid von Grosse,https://archive.org/details/sim_science_1934-1...,{{akademik dergi kaynağı |ilk=Aristid |son=von...,True
221,Banksia marginata,,{{Akademik dergi kaynağı|url=https://www.publi...,True
1730,Richard N. Zare,,{{Akademik dergi kaynağı|başlık=Synchronized D...,False


### Has autourl

In [77]:
df_100_auto = df_100[df_100["has_autourl"]]
df_100_auto.head()

Unnamed: 0,a,u,c,has_autourl
148,Aşı karşıtlığı,,{{Akademik dergi kaynağı|başlık=Association Be...,True
312,Bordigizm,,{{Akademik dergi kaynağı|soyadı1=Goldner|ad1=L...,True
130,Aristid von Grosse,https://archive.org/details/sim_science_1934-1...,{{akademik dergi kaynağı |ilk=Aristid |son=von...,True
221,Banksia marginata,,{{Akademik dergi kaynağı|url=https://www.publi...,True
1747,RNA aşısı,,{{Akademik dergi kaynağı|url=https://www.ncbi....,True


#### I generated url for it

In [81]:
df_100_auto[df_100_auto["u"] != ""]

Unnamed: 0,a,u,c,has_autourl
130,Aristid von Grosse,https://archive.org/details/sim_science_1934-1...,{{akademik dergi kaynağı |ilk=Aristid |son=von...,True
2263,Tütünün sağlığa etkileri,https://archive.org/details/sim_epidemiology-a...,{{Akademik dergi kaynağı|başlık=Influenza A am...,True


In [82]:
print(df_100_auto.loc[130,"a"])
print(df_100_auto.loc[130, "u"])
print(df_100_auto.loc[130, "c"])

Aristid von Grosse
https://archive.org/details/sim_science_1934-12-07_80_2084
{{akademik dergi kaynağı |ilk=Aristid |son=von Grosse |dergi=Science |yıl=1934 |başlık=Element 91 |cilt=80 |sayı=2084 |sayfalar=512-516|dil=İngilizce |doi=10.1126/science.80.2084.512 |pmid=17734249}}


In [83]:
print(df_100_auto.loc[2263,"a"])
print(df_100_auto.loc[2263, "u"])
print(df_100_auto.loc[2263, "c"])

Tütünün sağlığa etkileri
https://archive.org/details/sim_epidemiology-and-infection_1999-08_123_1
{{Akademik dergi kaynağı|başlık=Influenza A among community-dwelling elderly persons in Leicestershire during winter 1993-4; cigarette smoking as a risk factor and the efficacy of influenza vaccination|tarih=August 1999|sayı=1|sayfalar=103-8|çalışma=Epidemiology and Infection|cilt=123|pmc=2810733|pmid=10487646|doi=10.1017/S095026889900271X}}


### Does not have url

In [84]:
df_100_no_auto = df_100[df_100["has_autourl"]==False]
df_100_no_auto.head()

Unnamed: 0,a,u,c,has_autourl
1730,Richard N. Zare,,{{Akademik dergi kaynağı|başlık=Synchronized D...,False
1698,Radyasyon hasarı,,{{Akademik dergi kaynağı|başlık=Mechanism for ...,False
1512,Pedro de Palol,,{{Akademik dergi kaynağı|başlık=Arqueología cr...,False
734,Hazırlama etkisi,,{{Akademik dergi kaynağı|başlık=Masked priming...,False
1465,Pedro de Palol,,{{Akademik dergi kaynağı|başlık=Cabeza femenin...,False


In [87]:
df_100_no_auto[df_100_no_auto["u"] != ""]

Unnamed: 0,a,u,c,has_autourl


In [86]:
print(df_100_no_auto.loc[1730, "a"])
print(df_100_no_auto.loc[1730, "u"])
print(df_100_no_auto.loc[1730, "c"])

Richard N. Zare

{{Akademik dergi kaynağı|başlık=Synchronized Desorption Electrospray Ionization Mass Spectrometry Imaging|yazarlar=Comi|tarih=19 Ocak 2016|sayı=2|sayfalar=1169-1175|çalışma=Analytical Chemistry|cilt=88}}


In [88]:
print(df_100_no_auto.loc[734, "a"])
print(df_100_no_auto.loc[734, "u"])
print(df_100_no_auto.loc[734, "c"])

Hazırlama etkisi

{{Akademik dergi kaynağı|başlık=Masked priming with graphemically related forms: Repetition or partial activation?|sayı=2|sayfalar=211-251|çalışma=The Quarterly Journal of Experimental Psychology A|yıl=1987|cilt=39A}}
