# Pulling info from Wikimedia

In [41]:
import pandas as pd
from pandas import read_csv
import json
import requests
import mwclient
from mwclient import Site
import time
import datetime
import os

# Template for anyone re-running this notebook with their own contact address:
#useragent = {
#    'User-Agent': 'Gene Wiki Review Impact (youremail@domain)'
#}

# Header dict passed to the HTTP calls below; also supplies the mwclient user agent.
useragent = {
    'User-Agent': 'Gene Wiki Review Impact (gtsueng@scripps.edu)'
}

# mwclient connection used by get_wiki_volume_info
site = Site('en.wikipedia.org', clients_useragent=useragent['User-Agent'])

datapath = 'data/'    # directory the input tables are read from
exppath = 'results/'  # directory the exported tables are written to

# DataFrame.to_csv does not create missing directories, so make sure the
# export directory exists before any later cell tries to write into it.
os.makedirs(exppath, exist_ok=True)

# Functions

In [62]:
###############################################################################
## Request nicely
###############################################################################
import json
import time
import os

import requests
from dateutil import parser
from datetime import date
import pathlib

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

DEFAULT_TIMEOUT = 5 # seconds

class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout to requests sent without one.

    Accepts an extra ``timeout`` keyword argument; when it is absent the
    module-level ``DEFAULT_TIMEOUT`` is used. All other arguments are passed
    through to ``HTTPAdapter`` unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Strip our own keyword before delegating: the parent constructor
        # is not given a ``timeout`` argument.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        """Send ``request``, filling in the adapter timeout if none was given."""
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)

## Set time outs, backoff, retries
httprequests = requests.Session()

# Settings shared by both spellings of the Retry constructor below.
_retry_settings = dict(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
_retry_methods = ["HEAD", "GET", "OPTIONS"]

try:
    # urllib3 >= 1.26 names this argument ``allowed_methods``; the old
    # ``method_whitelist`` spelling was removed in urllib3 2.0.
    retry_strategy = Retry(allowed_methods=_retry_methods, **_retry_settings)
except TypeError:
    # Older urllib3 releases only understand the original keyword.
    retry_strategy = Retry(method_whitelist=_retry_methods, **_retry_settings)

# One adapter (timeout + retry policy) serves both URL schemes.
adapter = TimeoutHTTPAdapter(timeout=5,max_retries=retry_strategy)
httprequests.mount("https://", adapter)
httprequests.mount("http://", adapter)

In [2]:
###############################################################################
## This module uses mwclient to pull page size and edit stats on wikipedia pages  
## for each gene given a list of gene wikipedia titles
###############################################################################
def get_wiki_volume_info (site,titlelist,pause=1):
    """Collect size and last-revision metadata for a list of wiki page titles.

    Parameters
    ----------
    site : object exposing ``api(action, **params)`` (an mwclient ``Site`` in
        this notebook).
    titlelist : iterable of str
        Page titles; each one is sent as the ``titles`` value of a
        ``prop=info`` query.
    pause : float, optional
        Seconds to sleep after every title (default 1, the original delay).

    Returns
    -------
    (pageinfo, pagefails)
        ``pageinfo`` is a list of dicts with keys ``title``, ``page_length``,
        ``last_touched`` and ``lastrevid``; ``pagefails`` lists the titles
        whose lookup raised or whose response lacked the expected fields.
    """
    print('obtaining wikipedia volume information')
    pageinfo=[]
    pagefails = []
    for eachpage in titlelist:
        try:
            checkitem = site.api('query', prop='info', titles=eachpage)
            for page in checkitem['query']['pages'].values():
                # A fresh dict per page: reusing one dict would make every
                # appended entry alias the last page of the response.
                pageinfo.append({
                    'title': str(page['title']),
                    'page_length': int(page['length']),
                    'last_touched': str(page['touched']),
                    'lastrevid': str(page['lastrevid']),
                })
        except Exception:
            # Best effort: remember the title and carry on. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            pagefails.append(eachpage)
        time.sleep(pause)
    return(pageinfo,pagefails)

In [52]:
###############################################################################
## This module pulls pageview data from the Wikimedia Pageviews API
## More on the API here: https://wikimedia.org/api/rest_v1/#/Pageviews%20data/
## The module pulls in a parameter dictionary, and the list of wiki titles
## Parameters include:
## project: en.wikipedia.org, other wikimedia projects
## access: all-access, desktop, mobile-app, mobile-web
## agent: all-agents, user, spider, bot
## granularity: daily, monthly
###############################################################################
def get_monthly_pvs(page_view_parameters, useragent, titlelist, pause=1, session=None):
    """Fetch per-article pageview counts for every title in ``titlelist``.

    Parameters
    ----------
    page_view_parameters : dict
        Must provide ``access``, ``agent`` and ``granularity`` (each already
        ending in ``/``) plus ``start`` and ``end``. An optional ``project``
        entry selects the wiki and defaults to ``en.wikipedia``.
    useragent : dict
        Header dict sent with every request.
    titlelist : iterable of str
        Article titles, inserted into the URL as given.
    pause : float, optional
        Seconds to sleep after every title (default 1, the original delay).
    session : optional
        Object with a ``get(url, headers=...)`` method; defaults to the
        notebook-level ``httprequests`` session.

    Returns
    -------
    (pginfodf, pgfails)
        ``pginfodf`` is a DataFrame with columns ``title``, ``views``,
        ``granularity``, ``timestamp``, ``access``, ``agent``. A title whose
        response has no usable ``items`` gets one placeholder row with
        ``views == -1``. ``pgfails`` lists titles whose request or JSON
        decoding raised.
    """
    if session is None:
        session = httprequests
    columns = ['title', 'views', 'granularity', 'timestamp', 'access', 'agent']
    pginfo = []
    pgfails = []
    timestart = datetime.datetime.now().time()
    print(timestart,'obtaining wikipedia pageview information')
    # Honour the caller's project instead of always querying en.wikipedia.
    project = page_view_parameters.get('project', 'en.wikipedia').strip('/')
    pv_api_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/" + project + "/"
    for eachtitle in titlelist:
        try:
            # Build the URL from the function argument, not from a global.
            url = (pv_api_url
                   + page_view_parameters['access']
                   + page_view_parameters['agent']
                   + eachtitle + "/"
                   + page_view_parameters['granularity']
                   + page_view_parameters['start'] + "/"
                   + page_view_parameters['end'])
            r = session.get(url, headers=useragent)
            payload = r.json()
            try:
                rows = [{'title':item["article"], 'views':int(item["views"]),
                         'granularity':item['granularity'], 'timestamp':item["timestamp"],
                         'access':item['access'], 'agent':item['agent']}
                        for item in payload["items"]]
            except (KeyError, TypeError, ValueError):
                # The response carried no usable items: record a placeholder
                # row so the title still shows up in the output. The access
                # value is "no data", matching the other placeholder fields.
                rows = [{'title':eachtitle, 'views':-1, 'granularity':"no data",
                         'timestamp':"00000000", 'access':"no data", 'agent':"no data"}]
            pginfo.extend(rows)
        except Exception:
            pgfails.append(eachtitle)
        time.sleep(pause)

    # Explicit columns keep the frame well-formed even when nothing was fetched.
    pginfodf = pd.DataFrame(pginfo, columns=columns)

    return(pginfodf, pgfails)

# Pulling gene-specific info by Wikipedia titles

In [53]:
## Load the Gene Wiki review table and derive the Wikipedia page titles
gene_wiki_info = read_csv(datapath+'GeneWikiReviewlist.tsv',delimiter='\t', header=0)

# Rows without a Gene Wiki page URL are left out
pagelist = gene_wiki_info['Gene Wiki Page'].dropna().tolist()

# URL -> title: underscores for spaces, scheme normalised to http, wiki prefix removed
titlelist = []
for page_url in pagelist:
    page_title = page_url.replace(" ","_")
    page_title = page_title.replace("https://","http://")
    page_title = page_title.replace("http://en.wikipedia.org/wiki/","")
    titlelist.append(page_title)
print(titlelist[0])

Surfactant_protein_A1


In [54]:
## Get Wikipedia info for gene wiki articles
pageinfo,pagefails = get_wiki_volume_info(site,titlelist)
# pandas is imported as ``pd`` in this notebook; the bare name ``pandas``
# is not bound on a fresh kernel.
wikiinfo = pd.DataFrame(pageinfo)
print(wikiinfo.head(n=2))

wikiinfo.to_csv(exppath+'gene_wiki_vol_info.tsv',sep='\t',header=True)

obtaining wikipedia volume information
                   title  page_length          last_touched   lastrevid
0  Surfactant protein A1        31194  2021-11-20T12:42:45Z  1056213617
1  Surfactant protein A2        31602  2021-11-19T17:21:15Z  1056087140


In [55]:
#### Get Page views for each Gene Wiki Review wikipedia entry

#pages = ["Cyclin-dependent kinase 1", "Reelin"] ## for unit test

# The access / agent / granularity values carry their trailing "/" because
# get_monthly_pvs concatenates them straight into the request URL.
pv_params = {'project':'en.wikipedia',
             'access':'all-access/',
             'agent':'user/',
             'granularity':'monthly/',
             'start':'20130101',
             'end':'20211115'}

gene_monthly_pvs,pgfails = get_monthly_pvs(pv_params,useragent, titlelist)
print(gene_monthly_pvs.head(n=2))

# Long format: one row per title and timestamp
gene_monthly_pvs.to_csv(exppath+'gene_wiki_views.tsv',sep='\t',header=True)

# Wide format: one row per title, one column per timestamp.
# pandas is imported as ``pd``; the bare name ``pandas`` is not bound.
gene_monthly_views = pd.pivot_table(gene_monthly_pvs[['timestamp','title','views']],
                                    values='views',index='title',columns='timestamp')
gene_pvs = gene_monthly_views.reset_index().rename(columns={'title':'wikipedia'})
#print(gene_pvs)
gene_pvs.to_csv(exppath+'gw_pvs.tsv',sep='\t',header=True)

11:31:27.970202 obtaining wikipedia pageview information
                   title  views granularity   timestamp      access agent
0  Surfactant_protein_A1    146     monthly  2017090100  all-access  user
1  Surfactant_protein_A1    125     monthly  2017100100  all-access  user


# Pull all statements added for series via SPARQL queries

Query Wikidata for items whose P179 (part of the series) value is Q108807010 (Gene Wiki Review Series), then identify statements that cite any of the returned items as a reference.

In [56]:
## Run the sparql query to retrieve all Articles in this series

def fetch_gwreviews_wd():
    """Fetch the items that are part of series Q108807010 from Wikidata.

    Runs a SPARQL query through the notebook-level ``httprequests`` session.

    Returns
    -------
    pandas.DataFrame
        Columns ``uri`` (entity URI), ``label`` (item label) and ``QID``
        (the URI with the entity prefix removed). The frame is empty, with
        the same columns, when the query returns no rows.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    url = 'https://query.wikidata.org/sparql'
    query = """
    SELECT ?item ?itemLabel 
    WHERE 
    {
      ?item wdt:P179 wd:Q108807010. 
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
    }
    """
    r = httprequests.get(url, params = {'format': 'json', 'query': query})
    # Surface HTTP errors directly rather than as a JSON decoding failure.
    r.raise_for_status()
    bindings = r.json()['results']['bindings']
    if not bindings:
        # An empty binding list gives a frame without 'item'/'itemLabel'
        # columns, so the lookups below would raise KeyError.
        return(pd.DataFrame(columns=['uri','label','QID']))
    datadf = pd.DataFrame(bindings)
    datadf['uri'] = [x['value'] for x in datadf['item']]
    datadf['label'] = [x['value'] for x in datadf['itemLabel']]
    datadf['QID'] = [x.replace('http://www.wikidata.org/entity/','') for x in datadf['uri']]
    cleandata = datadf[['uri','label','QID']].copy()
    return(cleandata)


def load_props_to_check(DATAPATH):
    """Read ``propertylist.tsv`` from ``DATAPATH`` into a DataFrame.

    The file is parsed as tab-separated text with its first row as the header.
    """
    property_file = os.path.join(DATAPATH, 'propertylist.tsv')
    return read_csv(property_file, sep='\t', header=0)


def clean_up_results(wdjson,pid):
    """Flatten SPARQL result bindings into subject / predicate / object rows.

    ``wdjson`` is the decoded JSON response; its ``results.bindings`` entries
    are read for the ``item``, ``value``, ``itemLabel`` and ``valueLabel``
    fields. ``pid`` is written unchanged into every row as ``predicatePID``.

    Returns a new DataFrame with the columns ``subjectQID``, ``predicatePID``,
    ``objectQID``, ``subject`` and ``object``.
    """
    entity_prefix = "http://www.wikidata.org/entity/"
    bindings = pd.DataFrame(wdjson['results']['bindings'])

    def _values(column):
        # Each cell is a dict; only its 'value' entry is wanted
        return [cell['value'] for cell in bindings[column]]

    def _qids(column):
        # Reduce entity URIs to bare identifiers
        return [uri.replace(entity_prefix, "") for uri in _values(column)]

    annotated = bindings.assign(
        subjectQID=_qids('item'),
        objectQID=_qids('value'),
        subject=_values('itemLabel'),
        object=_values('valueLabel'),
        predicatePID=pid,
    )
    ordered_columns = ['subjectQID','predicatePID','objectQID','subject','object']
    return annotated[ordered_columns].copy()

In [64]:
## Run query to retrieve all statements that reference the above articles
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
propinfo = load_props_to_check(DATAPATH)
# Only properties flagged as 'main' are queried
usefulpids = propinfo['Property'].loc[propinfo['PropertyUse']=='main'].unique().tolist()

cleandata = fetch_gwreviews_wd()
refids = cleandata['QID'].unique().tolist()

url = 'https://query.wikidata.org/sparql'

querybase = """
SELECT DISTiNCT ?item ?itemLabel ?value ?valueLabel
WHERE {
  ?item ?p ?statement.
  ?statement prov:wasDerivedFrom ?ref . 
  """

queryend = """    
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

result_columns = ['subjectQID','predicatePID','objectQID','subject','object']
result_frames = []    # one cleaned frame per (article, property) pair with hits
failed_queries = []   # (refqid, pid, error) for every request that raised
for refqid in refids:
    for pid in usefulpids:
        #refqid = 'Q65950306' ## For testing
        #pid = 'P1916' ## For testing
        refquery = f"  ?ref ?prop wd:{refqid} ." 
        propquery = f"  ?item wdt:{pid} ?value"  
        query = querybase+refquery+propquery+queryend
        try:
            r = httprequests.get(url, params = {'format': 'json', 'query': query},
                                 headers=useragent)
            tmpdata = r.json()
            if tmpdata['results']['bindings']:
                result_frames.append(clean_up_results(tmpdata,pid))
        except Exception as err:
            # Keep going, but remember what failed instead of dropping it silently.
            failed_queries.append((refqid, pid, repr(err)))
        finally:
            # Pause after every request, including failed ones, so that a run
            # of errors does not turn into back-to-back requests.
            time.sleep(2)

# Concatenate once at the end; growing a frame inside the loop re-copies
# every earlier row on each iteration.
if result_frames:
    resultdf = pd.concat(result_frames, ignore_index=True)
else:
    resultdf = pd.DataFrame(columns=result_columns)

if failed_queries:
    print(f"{len(failed_queries)} queries failed; see failed_queries for details")

resultdf.to_csv('results/wd_statements_added.tsv',sep='\t',header=True)

In [34]:
# Spot check: run the statement query for one article / property pair and
# show the raw bindings next to the cleaned table.
url = 'https://query.wikidata.org/sparql'
query = """
SELECT DISTiNCT ?item ?itemLabel ?value ?valueLabel
WHERE {
  ?item ?p ?statement.
  ?statement prov:wasDerivedFrom ?ref . 
  ?ref ?prop wd:Q102060922 . # Replace with specific QID of article as a reference under 'stated in' within a statement
  ?item wdt:P2293 ?value  # Specify property of statement (in this example, Genetic Assocation)
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
# Use the shared session so this request gets the same timeout and retry
# policy as the other cells (a plain requests.get has no timeout), and
# identify the client to the query service.
r = httprequests.get(url, params = {'format': 'json', 'query': query}, headers=useragent)
r.raise_for_status()
test = r.json()
print(pd.DataFrame(test['results']['bindings']))


cleantest = clean_up_results(test,'P2293')
print(cleantest)

                                                item  \
0  {'type': 'uri', 'value': 'http://www.wikidata....   

                                               value  \
0  {'type': 'uri', 'value': 'http://www.wikidata....   

                                           itemLabel  \
0  {'xml:lang': 'en', 'type': 'literal', 'value':...   

                                          valueLabel  
0  {'xml:lang': 'en', 'type': 'literal', 'value':...  
  subjectQID predicatePID objectQID subject         object
0  Q18033696        P2293    Q41112   KALRN  schizophrenia
