# Autourls 

Goal: 
- https://en.wikipedia.org/wiki/User:GreenC/testcases/autourl 
- we don't want to overwrite auto generated urls

### Approach 1
- remove url and doi
- not robust to changing conditions

### Approach 2 
- webscrape wikipedia page
- find location of citation 
- check if there is autogeneration 
- takes too long

### Approach 3
- use wikimedia's parse endpoint 
   - example: https://en.wikipedia.org/w/api.php?action=parse&text=%7B%7Bcite%20journal%20%7Ctitle%3DThe%20Discodermia%20calyx%20Toxin%20Calyculin%20A%20%7Clast1%3DEdelson%20%7Cfirst1%3DJessica%20R.%20%7Clast2%3DBrautigan%20%7Cfirst2%3DDavid%20L.%20%7Cdate%3D24%20January%202011%20%7Cjournal%3DToxins%20%7Cvolume%3D3%20%7Cissue%3D1%20%7Cpages%3D105%E2%80%93119%20%7Cdoi%3D10.3390%2Ftoxins3010105%20%7Cdoi-access%3Dfree%20%7Cpmid%3D22069692%20%7Cpmc%3D3210456%7D%7D&contentmodel=wikitext
- convert the template into HTML 
- find link in HTML
- the most stable approach

In [1]:
import requests
import urllib.parse
import json

In [2]:
import time

In [3]:
def get_wikimedia_json(language, source, citation, verbose = False,
                       retries = 3, wait = 15, timeout = 20):
    """Render wikitext through a wiki's ``action=parse`` API and return the JSON.

    Args:
        language: wiki language subdomain (ex. "en", "tr").
        source: wiki project name used as the domain (ex. "wikipedia").
        citation: the wikitext to render (ex. "{{cite journal ...}}").
        verbose: debug mode; prints the request url.
        retries: number of additional attempts after the first one fails.
        wait: seconds to sleep before each additional attempt.
        timeout: per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The decoded JSON response, or "" when no attempt produced an
        HTTP 200 response or the response body was not valid JSON.
    """
    ### build url (the citation is fully percent-encoded, including "/")
    url = (
        "https://" + language + "." + source + ".org/w/api.php?action=parse&text="
        + urllib.parse.quote(citation, safe = "")
        + "&contentmodel=wikitext&format=json"
    )

    ### debug
    if verbose: print(url)

    ### make http requests
    response = None
    for attempt in range(retries + 1):
        if attempt:
            time.sleep(wait)
        try:
            response = requests.get(url, timeout = timeout)
        except requests.RequestException:
            # A timeout or connection failure counts as a failed attempt
            # instead of aborting the caller's whole batch.
            continue
        if response.status_code == 200:
            break
    else:
        # Every attempt failed (or no attempt was allowed).
        return ""

    try:
        return json.loads(response.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError: the body was not JSON.
        return ""

In [4]:
# Sample {{cite journal}} wikitext carrying doi, pmid and pmc identifiers but no url parameter.
test_cite = "{{cite journal |title=The Discodermia calyx Toxin Calyculin A |last1=Edelson |first1=Jessica R. |last2=Brautigan |first2=David L. |date=24 January 2011 |journal=Toxins |volume=3 |issue=1 |pages=105–119 |doi=10.3390/toxins3010105 |doi-access=free |pmid=22069692 |pmc=3210456}}"
# Sends the sample to the en.wikipedia.org parse endpoint (live HTTP request).
test_json = get_wikimedia_json("en", "wikipedia",test_cite)

In [5]:
## un-comment to see result
# test_json

In [6]:
import re

In [7]:
def find_html_lst_from_json(json, verbose = False):
    """Collect the HTML tags that contain "href" from a parse response.

    Args:
        json: decoded response of the wiki parse API; the rendered HTML is
            read from ``json["parse"]["text"]["*"]``. The parameter name
            shadows the ``json`` module inside this function only; it is
            kept so existing callers are unaffected.
        verbose: debug mode; prints the rendered HTML and the kept tags.

    Returns:
        List of tag strings (ex. '<a href="...">') that contain "href", in
        document order. An empty list when ``json`` is "" or does not hold
        rendered HTML at that path (ex. an error payload).
    """
    try:
        html_str = json["parse"]["text"]["*"]
    except (KeyError, TypeError, IndexError):
        # "" (failed request), None, or a payload without parse/text/*.
        return []
    if not isinstance(html_str, str):
        return []

    if verbose: print(html_str)

    # Every "<...>" span is a candidate tag; keep the ones mentioning href.
    has_href = [tag for tag in re.findall(r'<[^>]*>', html_str) if "href" in tag]

    if verbose: print(has_href)

    return has_href

In [8]:
# Tags containing "href" extracted from the parsed sample response.
test_html_lst = find_html_lst_from_json(test_json)

In [9]:
## un-comment to see result
# test_html_lst

['<a rel="nofollow" class="external text" href="//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456">',
 '<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">',
 '<a rel="nofollow" class="external text" href="https://doi.org/10.3390%2Ftoxins3010105">',
 '<a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">',
 '<a rel="nofollow" class="external text" href="//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456">',
 '<a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">',
 '<a rel="nofollow" class="external text" href="//pubmed.ncbi.nlm.nih.gov/22069692">']

In [10]:
def find_urls(html_lst, verbose = False):
    """Extract the href values that look like urls from a list of html tags.

    Args:
        html_lst: list of html tag strings (ex. '<a href="...">'); "" or
            None is treated as an empty list.
        verbose: debug mode; prints each href examined and the verdict.

    Returns:
        List of href values without their surrounding quotes, at most one
        per tag and in input order. A value is kept when it contains
        "//" followed by a host label, a dot and at least two more
        non-space characters, so absolute and protocol-relative links are
        kept while site-relative ones such as "/wiki/..." are dropped.
        Values are not HTML-unescaped.
    """
    # The whole quoted value is captured, so an "=" inside a query string
    # does not cut the url short. The lookbehind keeps attributes that merely
    # end in "href" (ex. data-href) from being read as the link target.
    href_pattern = re.compile(
        r"""(?<![\w-])href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))"""
    )
    # Hyphens are allowed in the first host label (ex. //e-dergi.example.org).
    url_pattern = re.compile(r"//[A-Za-z0-9-]+\.\S{2,}")

    urls = []
    for element in html_lst or []:
        for match in href_pattern.finditer(element):
            # Exactly one of the three alternatives captured the value.
            href = next(group for group in match.groups() if group is not None)

            if verbose: print(href)

            if url_pattern.search(href):
                if verbose: print("match")
                urls.append(href)
                break  # one url per tag is enough
            elif verbose:
                print('href content is not url')

    return urls
                

In [11]:
# Scratch check of a url-detection pattern against one protocol-relative link.
# Raw string: "\/", "\." and "\s" are not valid string escapes, so the
# non-raw spelling triggers an invalid-escape warning; the value is unchanged.
url_regex = r"\/\/[a-zA-Z0-9]+\.[^\s]{2,}"
field_content = "//www.ncbi.nlm.nih.gov/pmc/articles/PMC3210456"

In [12]:
## Uncomment to see result
# if re.search(url_regex, field_content):
#     print("yay")
# else: 
#     print("nay")

In [13]:
## Uncomment to see result
#find_urls(test_html_lst)

In [14]:
def autourl_exists(citation, language = "en", source = "wikipedia", verbose = False):
    """Main function: check whether a rendered citation contains a url link.

    The citation is rendered through the wiki parse API, the tags containing
    "href" are collected, and their link targets are tested for url shape.

    Args:
        citation: the citation input (ex. "{{cite journal ...}}").
        language: wikipedia language (ex. "en", "tr").
        source: wiki project name used as the domain (ex. "wikipedia").
        verbose: debug mode, forwarded to the helper functions.

    Returns:
        True when at least one url-like link target is found; False when
        there is none or when the request yielded no usable response.

    NOTE(review): any url-like href counts, so this alone does not appear
    to tell an explicit url parameter apart from a link generated from an
    identifier -- confirm that is the intended meaning.
    """
    res_json = get_wikimedia_json(language, source, citation, verbose)
    if not res_json:
        # "" signals a failed request or an undecodable body.
        return False

    html_lst = find_html_lst_from_json(res_json, verbose)
    if not html_lst:
        # Covers an empty list, "" and None alike.
        return False

    return bool(find_urls(html_lst, verbose))
        

In [15]:
## Uncomment to see result
# autourl_exists(test_cite)

### Testing

In [16]:
import pandas as pd



In [17]:
# Load the citation dump; lines = True reads the file as one JSON record per line.
# NOTE(review): judging by the preview and later cells, column a looks like the
# article title, u the url I generated and c the citation wikitext -- confirm.
journal_dump = pd.read_json("turkish_wiki_0621_first_3000_with_doi.json", lines = True)
journal_dump.head()

Unnamed: 0,a,u,c
0,1 + 2 + 3 + 4 + · · ·,,{{Akademik dergi kaynağı\n| soyadı = Lepowsky ...
1,12 Victoria,,{{Akademik dergi kaynağı\n | ad1 = B.\n | s...
2,"1,3,5-Triklorobenzen",,{{Akademik dergi kaynağı|soyadı=Jaw|ad=Ching-G...
3,141 Likya depremi,,{{Akademik dergi kaynağı | url=http://blackmed...
4,141 Likya depremi,,{{Akademik dergi kaynağı | url=http://www.nat-...


In [18]:
# Draw 100 random rows. No random_state is passed, so the sample (and every
# row label shown in later cells) differs from run to run.
df_100 = journal_dump.sample(n = 100)
df_100.shape

(100, 3)

In [19]:
# %%time

# One call to autourl_exists per sampled citation (column c), rendered on tr.wikipedia.org.
df_100["has_autourl"] = df_100["c"].apply(lambda x: autourl_exists(x, language = "tr", source ="wikipedia"))

In [20]:
# Preview the sample together with its has_autourl column.
df_100.head()

Unnamed: 0,a,u,c,has_autourl
1597,Piroliz,,{{Akademik dergi kaynağı|başlık=Interactions o...,True
356,Çapraz bağlama reaksiyonu,https://archive.org/details/sim_chemical-revie...,{{Akademik dergi kaynağı|başlık=Advances in Tr...,True
1374,Pedro de Palol,,{{Akademik dergi kaynağı|url=|başlık=Avance de...,False
1904,Siyah Mansur Beyliği,,{{Akademik dergi kaynağı|url=https://www.acade...,True
1771,Roma Opera Binası,,{{Akademik dergi kaynağı|url=https://www.jstor...,True


### Has autourl

In [21]:
# Keep the sampled rows for which autourl_exists returned True.
df_100_auto = df_100[df_100["has_autourl"]]
df_100_auto.head()

Unnamed: 0,a,u,c,has_autourl
1597,Piroliz,,{{Akademik dergi kaynağı|başlık=Interactions o...,True
356,Çapraz bağlama reaksiyonu,https://archive.org/details/sim_chemical-revie...,{{Akademik dergi kaynağı|başlık=Advances in Tr...,True
1904,Siyah Mansur Beyliği,,{{Akademik dergi kaynağı|url=https://www.acade...,True
1771,Roma Opera Binası,,{{Akademik dergi kaynağı|url=https://www.jstor...,True
2694,Bayi sırası,,{{Akademik dergi kaynağı|url=http://www.planet...,True


#### Has autourl, and I also generated a url for it

In [22]:
# Rows with has_autourl == True whose column u is not the empty string.
df_100_auto[df_100_auto["u"] != ""]

Unnamed: 0,a,u,c,has_autourl
356,Çapraz bağlama reaksiyonu,https://archive.org/details/sim_chemical-revie...,{{Akademik dergi kaynağı|başlık=Advances in Tr...,True
2350,Tütünün sağlığa etkileri,https://archive.org/details/sim_psychopharmaco...,{{Akademik dergi kaynağı|başlık=The effects of...,True
2147,Trans erkek,https://archive.org/details/sim_american-journ...,{{Akademik dergi kaynağı|başlık=Demographic an...,True
1745,Richard Posner,https://archive.org/details/sim_journal-of-leg...,{{Akademik dergi kaynağı|başlık=The Most-Cited...,True


In [23]:
# print(df_100_auto.loc[130,"a"])
# # print(df_100_auto.loc[130, "u"])
# print(df_100_auto.loc[130, "c"])

In [24]:
# print(df_100_auto.loc[2263,"a"])
# print(df_100_auto.loc[2263, "u"])
# print(df_100_auto.loc[2263, "c"])

### Does not have autourl

In [25]:
# Keep the sampled rows for which autourl_exists returned False.
# "~" negates the boolean mask directly instead of comparing it with False.
df_100_no_auto = df_100[~df_100["has_autourl"]]
df_100_no_auto.head()

Unnamed: 0,a,u,c,has_autourl
1374,Pedro de Palol,,{{Akademik dergi kaynağı|url=|başlık=Avance de...,False
465,Counter-Strike (video oyunu),,{{Akademik dergi kaynağı|başlık=Army of One|ya...,False
1666,Prosopagnozi,,{{Akademik dergi kaynağı|url=|başlık=Holistic ...,False
744,Hazırlama etkisi,,{{Akademik dergi kaynağı|başlık=Paired-associa...,False
729,Hazırlama etkisi,,{{Akademik dergi kaynağı|başlık=On priming by ...,False


In [26]:
# Rows with has_autourl == False whose column u is not the empty string.
df_100_no_auto[df_100_no_auto["u"] != ""]

Unnamed: 0,a,u,c,has_autourl
760,Hazırlama etkisi,https://archive.org/details/sim_science_2008-1...,{{Akademik dergi kaynağı|başlık=Experiencing p...,False


In [27]:
# Print columns a, u and c of every row that has a non-empty u but no autourl.
# Looking a row up by a fixed label (ex. 760) raises KeyError whenever the
# random sample does not happen to contain that label, so iterate instead.
for _, row in df_100_no_auto[df_100_no_auto["u"] != ""].iterrows():
    print(row["a"])
    print(row["u"])
    print(row["c"])

Hazırlama etkisi
https://archive.org/details/sim_science_2008-10-24_322_5901
{{Akademik dergi kaynağı|başlık=Experiencing physical warmth promotes interpersonal warmth|tarih=Ekim 2008|sayı=5901|sayfalar=606-7|çalışma=Science|cilt=322}}


In [28]:
# print(df_100_no_auto.loc[748, "a"])
# print(df_100_no_auto.loc[748, "u"])
# print(df_100_no_auto.loc[748, "c"])