# Pulling info from Wikimedia

In [1]:
import os
import pandas as pd
from pandas import read_csv
import json
import mwclient as mw
import pywikibot as pwb
import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pathlib
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


In [None]:
# Headers that identify this client to Wikimedia; replace the placeholder
# address with a real contact before running.
useragent = {'User-Agent': 'Gene Wiki Review Impact (youremail@domain)'}

# Input and output folders. Both keep their trailing slash.
datapath = 'data/'
exppath = 'results/'

# mwclient handle for English Wikipedia, announcing the same User-Agent string.
mwsite = mw.Site('en.wikipedia.org', clients_useragent=useragent['User-Agent'])

# Functions

In [2]:
###############################################################################
## Request nicely
###############################################################################

DEFAULT_TIMEOUT = 5 # seconds


class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that supplies a fallback timeout for requests sent without one.

    The adapter accepts an extra ``timeout`` keyword (default DEFAULT_TIMEOUT);
    every other argument is passed on to HTTPAdapter unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Consume "timeout" here so it is not forwarded to HTTPAdapter.__init__.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        """Send *request*, filling in the adapter timeout when none (or None) was given."""
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)

## Set time outs, backoff, retries
## Shared session used by the request helpers in this notebook.
httprequests = requests.Session()

retry_methods = ["HEAD", "GET", "OPTIONS"]
retry_settings = {
    "total": 3,
    "backoff_factor": 1,
    "status_forcelist": [429, 500, 502, 503, 504],
}
try:
    # urllib3 >= 1.26 names this option "allowed_methods"; the old
    # "method_whitelist" spelling was removed in urllib3 2.0.
    retry_strategy = Retry(allowed_methods=retry_methods, **retry_settings)
except TypeError:
    # Older urllib3 releases only understand the original keyword.
    retry_strategy = Retry(method_whitelist=retry_methods, **retry_settings)

# One adapter instance serves both schemes so timeouts and retries match.
adapter = TimeoutHTTPAdapter(timeout=DEFAULT_TIMEOUT, max_retries=retry_strategy)
httprequests.mount("https://", adapter)
httprequests.mount("http://", adapter)

In [23]:
###############################################################################
## This module pulls pageview data from the Wikimedia Pageviews API
## More on the API here: https://wikimedia.org/api/rest_v1/#/Pageviews%20data/
## The module pulls in a parameter dictionary, and the list of wiki titles
## Parameters include:
## project: en.wikipedia.org, other wikimedia projects
## access: all-access, desktop, mobile-app, mobile-web
## agent: all-agents, user, spider, bot
## granularity: daily, monthly
###############################################################################
def get_monthly_pvs(page_view_parameters, useragent, no_missing, session=None, pause=1):
    """Fetch page-view counts for every gene page from the Pageviews REST API.

    Args:
        page_view_parameters: dict holding the URL path pieces 'access',
            'agent' and 'granularity' (each already ending in "/") plus the
            'start' and 'end' dates. The endpoint is fixed to en.wikipedia;
            a 'project' entry, if present, is not used.
        useragent: headers dict sent with every request.
        no_missing: DataFrame with a 'Gene Wiki Page' column. A 'titlelist'
            column holding the normalised titles is added to it in place.
        session: object with a requests-style ``get(url, headers=...)``
            method. Defaults to the notebook-level ``httprequests`` session.
        pause: seconds to sleep after each title.

    Returns:
        (pginfodf, pgfails): a DataFrame with one row per returned item (or a
        single placeholder row with views == -1 when the response carries no
        usable "items"), and the list of titles whose request or JSON decoding
        failed.

    Raises:
        KeyError: if a required entry is missing from page_view_parameters.
    """
    if session is None:
        session = httprequests
    print('obtaining wikipedia pageview information')
    no_missing['titlelist'] = [x.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","") for x in no_missing['Gene Wiki Page']]

    # The URL pieces around the title do not change per page, so build them
    # once from the parameters that were actually passed in.
    pv_api_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/"
    url_prefix = pv_api_url + page_view_parameters['access'] + page_view_parameters['agent']
    url_suffix = ("/" + page_view_parameters['granularity'] + page_view_parameters['start']
                  + "/" + page_view_parameters['end'])

    pginfo = []
    pgfails = []
    for eachtitle in no_missing['titlelist']:
        try:
            r = session.get(url_prefix + eachtitle + url_suffix, headers=useragent)
            items = r.json()
        except Exception:
            # Best effort: remember the title and keep going. Unlike a bare
            # except, this still lets Ctrl-C stop the loop.
            pgfails.append(eachtitle)
        else:
            try:
                for item in items["items"]:
                    pginfo.append({'title':item["article"], 'views':int(item["views"]),
                                   'granularity':item['granularity'], 'timestamp':item["timestamp"],
                                   'access':item['access'], 'agent':item['agent']})
            except (KeyError, TypeError, ValueError):
                # The response decoded but holds no usable items: record an
                # explicit "no data" row for this title.
                pginfo.append({'title':eachtitle, 'views':-1, 'granularity':"no data",
                               'timestamp':"00000000", 'access':"no data", 'agent':"no data"})
        time.sleep(pause)

    pginfodf = pd.DataFrame(pginfo)

    return(pginfodf, pgfails)

In [4]:
###############################################################################
## This module uses mwclient to pull page size and edit stats on wikipedia pages  
## for each gene given a list of gene wikipedia titles
###############################################################################
def get_wiki_volume_info (mwsite,no_missing,pause=1):
    """Collect size and last-edit metadata for each gene page.

    Args:
        mwsite: object exposing ``api('query', prop='info', titles=...)``
            (the notebook passes an mwclient Site).
        no_missing: DataFrame with a 'Gene Wiki Page' column. A 'titlelist'
            column holding the normalised titles is added to it in place.
        pause: seconds to sleep after each title.

    Returns:
        (pageinfo, pagefails): a list of dicts with the keys 'title',
        'page_length', 'last_touched' and 'lastrevid', and the list of titles
        whose lookup failed or whose reply lacked one of those fields.
    """
    print('obtaining wikipedia volume information')
    no_missing['titlelist'] = [x.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","") for x in no_missing['Gene Wiki Page']]
    pageinfo = []
    pagefails = []
    for eachpage in no_missing['titlelist'].tolist():
        try:
            checkitem = mwsite.api('query', prop='info', titles=eachpage)
            for page in checkitem['query']['pages'].values():
                # A fresh dict per returned page, so several pages in one
                # reply can never share (and overwrite) a single record.
                pageinfo.append({
                    'title': str(page['title']),
                    'page_length': int(page['length']),
                    'last_touched': str(page['touched']),
                    'lastrevid': str(page['lastrevid']),
                })
        except Exception:
            # Best effort: note the title and continue. Unlike a bare except,
            # this still lets Ctrl-C stop the loop.
            pagefails.append(eachpage)
        time.sleep(pause)
    return(pageinfo,pagefails)

In [5]:
###############################################################################
## This module uses mwclient to get revision ids
###############################################################################
from time import mktime

def get_revid(site,pagetitle,starttime):
    """Return the id of the first yielded revision dated on or before *starttime*.

    Args:
        site: object whose ``pages[pagetitle].revisions()`` yields dicts with
            a 'timestamp' (time.struct_time) and a 'revid'.
        pagetitle: title of the page to inspect.
        starttime: cut-off date as a 'YYYYMMDD' string.

    Returns:
        The 'revid' of the first revision in iteration order whose timestamp
        is not later than the cut-off. Revisions are expected newest-first
        (mwclient's default ordering), so this is the revision that was
        current at the cut-off.

    Raises:
        IndexError: if no revision is dated on or before the cut-off.
        ValueError: if *starttime* does not match '%Y%m%d'.
    """
    cutoff = datetime.strptime(starttime, '%Y%m%d')
    for revision in site.pages[pagetitle].revisions():
        # Build the datetime straight from the struct_time fields; going
        # through mktime() would reinterpret them in the local time zone.
        revised_at = datetime(*revision['timestamp'][:6])
        if revised_at <= cutoff:
            # First match wins, so the rest of the history is never fetched.
            return revision['revid']
    raise IndexError(f'no revision of {pagetitle!r} on or before {starttime}')

def get_latest_revid(site,pagetitle):
    """Return the id of the most recent revision of *pagetitle*.

    Args:
        site: object whose ``pages[pagetitle].revisions(prop='ids')`` yields
            dicts with a 'revid', newest revision first (mwclient's default
            ordering).
        pagetitle: title of the page to inspect.

    Returns:
        The 'revid' of the first revision yielded.

    Raises:
        IndexError: if the page has no revisions.
    """
    page = site.pages[pagetitle]
    # Revisions arrive newest-first, so the latest one is the first item;
    # taking the last item would return the page's very first revision.
    for revision in page.revisions(prop='ids'):
        return revision['revid']
    raise IndexError(f'no revisions found for {pagetitle!r}')

In [6]:
###############################################################################
## This module uses mwclient to compare revisions
###############################################################################
def compare_revisions(mwsite,pagetitle,oldrevid,latestrevid):
    """Return the diff body the 'compare' API action reports for two revisions.

    Args:
        mwsite: object exposing ``get(action, **params)`` (the notebook passes
            an mwclient Site).
        pagetitle: page title, sent as both 'fromtitle' and 'totitle'.
        oldrevid: revision id sent as 'torev'.
        latestrevid: revision id sent as 'fromrev'.

    Returns:
        The value stored under ['compare']['*'] in the API response.
    """
    # The original referenced an undefined name here; the parameter that
    # carries the newer revision id is latestrevid.
    compare_result = mwsite.get('compare', fromrev=latestrevid, torev=oldrevid,
                                fromtitle=pagetitle, totitle=pagetitle)
    return(compare_result['compare']['*'])

In [14]:
###############################################################################
## This module uses pywikibot to get the text from old versions of wikipedia
## pages
###############################################################################

def get_six_months_prior(adatestring):
    """Return the 'YYYYMMDD' date string six calendar months before *adatestring*.

    *adatestring* must itself be in 'YYYYMMDD' form.
    """
    date_format = "%Y%m%d"
    shifted = datetime.strptime(adatestring, date_format) - relativedelta(months = 6)
    return(shifted.strftime(date_format))


def get_old_page_length(pagetitle, oldrevid):
    """Return the length of the text pywikibot returns for revision *oldrevid*.

    The page is looked up by *pagetitle* on the English Wikipedia site.
    """
    old_page = pwb.Page(pwb.Site("en", "wikipedia"), pagetitle)
    return(len(old_page.getOldVersion(oldid = oldrevid)))


def get_old_page_volumes(mwsite,no_missing,pause=1):
    """Measure each gene page as it stood six months before its update date.

    Args:
        mwsite: site handle forwarded to get_revid.
        no_missing: DataFrame with 'Gene Wiki Page' and 'Wikipedia update
            period' (a 'YYYYMMDD' string) columns. A 'titlelist' column
            holding the normalised titles is added to it in place.
        pause: seconds to sleep after each page.

    Returns:
        A list with one dict per row holding the title, the update date, the
        date six months earlier, the revision id found for that date and the
        text length of that revision. The id and length are both 0 when they
        could not be determined.
    """
    print('obtaining old wikipedia volume information')
    no_missing['titlelist'] = [x.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","") for x in no_missing['Gene Wiki Page']]
    pageinfo = []
    for pagetitle, updatedate in zip(no_missing['titlelist'], no_missing['Wikipedia update period']):
        starttime = get_six_months_prior(updatedate)
        tempdict = {'title':pagetitle,'Wikipedia update period':updatedate,'6 months before update':starttime}
        try:
            oldrevid = get_revid(mwsite,pagetitle,starttime)
            oldpagevolume = get_old_page_length(pagetitle,oldrevid)
        except Exception as err:
            # get_revid raises IndexError when no revision is that old: the
            # page did not exist six months prior, so its volume is 0.
            # Any other failure is still recorded as 0 to keep the output
            # shape unchanged, but is reported so it is not mistaken for a
            # missing page. Unlike a bare except, Ctrl-C still stops the loop.
            if not isinstance(err, IndexError):
                print(f'warning: could not measure {pagetitle} at {starttime}; recording 0 ({err!r})')
            oldrevid = 0
            oldpagevolume = 0
        tempdict['first revision prior to 6 month date'] = oldrevid
        tempdict['corresponding length'] = oldpagevolume
        pageinfo.append(tempdict)
        time.sleep(pause)
    return(pageinfo)

# Pulling gene-specific info by Wikipedia titles

In [8]:
## Load the review list and keep only the genes that have a Wikipedia page
gene_wiki_info = read_csv(datapath+'GeneWikiReviewlist.tsv', sep='\t', header=0)
has_wiki_page = gene_wiki_info['Gene Wiki Page'].notna()
no_missing = gene_wiki_info.loc[has_wiki_page].copy()

## Reduce each page URL to its bare, underscore-separated title
no_missing['titlelist'] = [
    page_url.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","")
    for page_url in no_missing['Gene Wiki Page']
]

## Store the update period as a string of digits (cast to int first, then str)
no_missing['Wikipedia update period'] = no_missing['Wikipedia update period'].astype(int).astype(str)
print(no_missing.iloc[0]['titlelist'])

Surfactant_protein_A1


In [9]:
%%time
## Current size / last-edit metadata for every gene wiki article
volume_results = get_wiki_volume_info(mwsite, no_missing)
pageinfo, pagefails = volume_results
wikiinfo = pd.DataFrame(pageinfo)
print(wikiinfo.head(2))

## Save the table as a tab-separated file in the results folder
current_volume_file = exppath + 'gene_wiki_vol_info.tsv'
wikiinfo.to_csv(current_volume_file, sep='\t', header=True)

obtaining wikipedia volume information
                   title  page_length          last_touched   lastrevid
0  Surfactant protein A1        31194  2021-11-25T06:20:36Z  1056213617
1  Surfactant protein A2        31602  2021-11-19T17:21:15Z  1056087140
Wall time: 2min 34s


In [15]:
%%time
## Page sizes as they stood six months before each article's update date
pageinfo = get_old_page_volumes(mwsite, no_missing)
wikiinfo = pd.DataFrame(pageinfo)
print(wikiinfo.head(2))

## Save the table as a tab-separated file in the results folder
prior_volume_file = exppath + 'gene_wiki_vol_info-BEFORE.tsv'
wikiinfo.to_csv(prior_volume_file, sep='\t', header=True)

obtaining old wikipedia volume information
                   title Wikipedia update period 6 months before update  \
0  Surfactant_protein_A1                20120830               20120229   
1  Surfactant_protein_A2                20120830               20120229   

   first revision prior to 6 month date  corresponding length  
0                             462319030                  5903  
1                             204784710                    21  
Wall time: 3min 50s


In [16]:
#### Spot check of get_revid / get_old_page_length on a single page
pagetitle, starttime = 'Surfactant_protein_A1', '20120229'

# Revision in force at the start date ...
oldrevid = get_revid(mwsite, pagetitle, starttime)
print(oldrevid)

# ... and the length of the page text at that revision.
oldpagevolume = get_old_page_length(pagetitle, oldrevid)
print(oldpagevolume)

462319030
5903


In [24]:
%%time
#### Get Page views for each Gene Wiki Review wikipedia entry

# For a quick trial, a short list such as
# ["Cyclin-dependent kinase 1", "Reelin"] can stand in for the full table.

# Path pieces for the Pageviews API; the segment values carry their own "/".
pv_params = dict(
    project='en.wikipedia',
    access='all-access/',
    agent='user/',
    granularity='monthly/',
    start='20130101',
    end='20211115',
)

gene_monthly_pvs, pgfails = get_monthly_pvs(pv_params, useragent, no_missing)
print(gene_monthly_pvs.head(2))

# Long format: one row per title and timestamp.
gene_monthly_pvs.to_csv(exppath+'gene_wiki_views.tsv', sep='\t', header=True)

# Wide format: one row per title, one column per timestamp.
view_columns = gene_monthly_pvs[['timestamp','title','views']]
gene_monthly_views = view_columns.pivot_table(values='views', index='title', columns='timestamp')
gene_pvs = gene_monthly_views.reset_index().rename(columns={'title':'wikipedia'})
gene_pvs.to_csv(exppath+'gw_pvs.tsv', sep='\t', header=True)

obtaining wikipedia pageview information
                   title  views granularity   timestamp      access agent
0  Surfactant_protein_A1    146     monthly  2017090100  all-access  user
1  Surfactant_protein_A1    125     monthly  2017100100  all-access  user


NameError: name 'pandas' is not defined

# Pull all statements added for series via SPARQL queries

Query Wikidata for P179 (part of series) of Q108807010 (Gene Wiki Review Series). Then identify statements that use any member of the query results as a reference.

In [26]:
## Run the sparql query to retrieve all Articles in this series

def fetch_gwreviews_wd():
    """Fetch the Wikidata items whose P179 value is Q108807010.

    The query is sent through the notebook-level ``httprequests`` session.

    Returns:
        A DataFrame with the columns 'uri', 'label' and 'QID', one row per
        binding returned by the query. The frame is empty (but keeps those
        columns) when the query returns nothing.

    Raises:
        requests.HTTPError: if the query service answers with an error status.
    """
    url = 'https://query.wikidata.org/sparql'
    query = """
    SELECT ?item ?itemLabel 
    WHERE 
    {
      ?item wdt:P179 wd:Q108807010. 
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
    }
    """
    r = httprequests.get(url, params = {'format': 'json', 'query': query})
    # Fail with the HTTP status rather than with a confusing JSON decode
    # error when the service returns an error page.
    r.raise_for_status()
    bindings = r.json()['results']['bindings']

    # Building the columns from plain lists keeps an empty result valid,
    # where column lookups on an empty frame would raise KeyError.
    uris = [row['item']['value'] for row in bindings]
    labels = [row['itemLabel']['value'] for row in bindings]
    qids = [uri.replace('http://www.wikidata.org/entity/','') for uri in uris]
    cleandata = pd.DataFrame({'uri': uris, 'label': labels, 'QID': qids},
                             columns=['uri','label','QID'])
    return(cleandata)


def load_props_to_check(DATAPATH):
    """Read 'propertylist.tsv' from *DATAPATH* into a DataFrame.

    The file is parsed as tab-separated text with its first line as header.
    """
    property_file = os.path.join(DATAPATH, 'propertylist.tsv')
    return(read_csv(property_file, sep='\t', header=0))


def clean_up_results(wdjson,pid):
    """Flatten SPARQL JSON bindings into subject / predicate / object columns.

    Args:
        wdjson: decoded response whose ['results']['bindings'] entries carry
            'item', 'value', 'itemLabel' and 'valueLabel', each a dict with
            a 'value' key.
        pid: property id written to every row as 'predicatePID'.

    Returns:
        A DataFrame with the columns 'subjectQID', 'predicatePID',
        'objectQID', 'subject' and 'object', in that order.
    """
    bindings = pd.DataFrame(wdjson['results']['bindings'])
    entity_prefix = "http://www.wikidata.org/entity/"

    def raw_values(column):
        # Each cell is a dict; only its 'value' entry is kept.
        return [cell['value'] for cell in bindings[column]]

    cleandf = pd.DataFrame(
        {
            'subjectQID': [uri.replace(entity_prefix, "") for uri in raw_values('item')],
            'predicatePID': pid,
            'objectQID': [uri.replace(entity_prefix, "") for uri in raw_values('value')],
            'subject': raw_values('itemLabel'),
            'object': raw_values('valueLabel'),
        },
        index=bindings.index,
    )
    return(cleandf)

In [27]:
%%time
## Run query to retrieve all statements that reference the above articles
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
propinfo = load_props_to_check(DATAPATH)
usefulpids = propinfo['Property'].loc[propinfo['PropertyUse']=='main'].unique().tolist()

cleandata = fetch_gwreviews_wd()
refids = cleandata['QID'].unique().tolist()

url = 'https://query.wikidata.org/sparql'

## NOTE(review): the pattern below does not tie ?statement to the wdt: value
## that is selected, so rows may include values that are not themselves backed
## by the reference -- confirm that this is the intended result set.
querybase = """
SELECT DISTiNCT ?item ?itemLabel ?value ?valueLabel
WHERE {
  ?item ?p ?statement.
  ?statement prov:wasDerivedFrom ?ref . 
  """

queryend = """    
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""

result_columns = ['subjectQID','predicatePID','objectQID','subject','object']
result_frames = []    # one cleaned frame per query that returned rows
failed_queries = []   # (refqid, pid) pairs whose request or parsing failed
for refqid in refids:
    for pid in usefulpids:
        #refqid = 'Q65950306' ## For testing
        #pid = 'P1916' ## For testing
        refquery = f"  ?ref ?prop wd:{refqid} ." 
        propquery = f"  ?item wdt:{pid} ?value"  
        query = querybase+refquery+propquery+queryend
        try:
            r = httprequests.get(url, params = {'format': 'json', 'query': query})
            tmpdata = r.json()
            if tmpdata['results']['bindings']:
                result_frames.append(clean_up_results(tmpdata,pid))
        except Exception as err:
            # Best effort: remember the failure and keep going. Unlike a bare
            # except, this still lets Ctrl-C stop this multi-hour loop.
            failed_queries.append((refqid, pid))
            print(f'query failed for {refqid} / {pid}: {err!r}')
        # Pause after every query, failures included, so errors never turn
        # into back-to-back requests against the endpoint.
        time.sleep(2)

## Concatenate once at the end instead of re-copying the table every iteration
if result_frames:
    resultdf = pd.concat(result_frames, ignore_index=True)
else:
    resultdf = pd.DataFrame(columns=result_columns)

if failed_queries:
    print(f'{len(failed_queries)} queries failed; see failed_queries')

resultdf.to_csv('results/wd_statements_added.tsv',sep='\t',header=True)

Wall time: 2h 26min 56s


In [None]:
## This is a test
## Runs one reference/property query and shows the raw and cleaned tables.
url = 'https://query.wikidata.org/sparql'
query = """
SELECT DISTiNCT ?item ?itemLabel ?value ?valueLabel
WHERE {
  ?item ?p ?statement.
  ?statement prov:wasDerivedFrom ?ref . 
  ?ref ?prop wd:Q102060922 . # Replace with specific QID of article as a reference under 'stated in' within a statement
  ?item wdt:P2293 ?value  # Specify property of statement (in this example, Genetic Assocation)
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
## Use the shared session so this request gets the same timeout and retry
## settings as the other requests; a bare requests.get has no timeout.
r = httprequests.get(url, params = {'format': 'json', 'query': query})
test = r.json()
print(pd.DataFrame(test['results']['bindings']))


cleantest = clean_up_results(test,'P2293')
print(cleantest)