# Pulling info from Wikimedia

In [1]:
import os
import pandas as pd
from pandas import read_csv
import json
import mwclient as mw
import pywikibot as pwb
import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pathlib
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


In [2]:
useragent = {
    'User-Agent': 'Gene Wiki Review Impact (youremail@domain)'
}

mwsite = mw.Site('en.wikipedia.org', clients_useragent=useragent['User-Agent'])

datapath = 'data/'
exppath = 'results/'

# Functions

In [3]:
###############################################################################
## Request nicely
###############################################################################

DEFAULT_TIMEOUT = 5 # seconds

class TimeoutHTTPAdapter(HTTPAdapter):
    def __init__(self, *args, **kwargs):
        self.timeout = DEFAULT_TIMEOUT
        if "timeout" in kwargs:
            self.timeout = kwargs["timeout"]
            del kwargs["timeout"]
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        timeout = kwargs.get("timeout")
        if timeout is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)

## Set time outs, backoff, retries
httprequests = requests.Session()
retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
    method_whitelist=["HEAD", "GET", "OPTIONS"] ## Note this method is deprecated and replaced with `allowed_methods` for newer releases of requests library
)
adapter = TimeoutHTTPAdapter(timeout=5,max_retries=retry_strategy)
httprequests.mount("https://", adapter)
httprequests.mount("http://", adapter)

In [4]:
###############################################################################
## This module uses pulls pageview data from the Media Wiki PageViews API
## More on the API here: https://wikimedia.org/api/rest_v1/#/Pageviews%20data/
## The module pulls in a parameter dictionary, and the list of wiki titles
## Parameters include:
## project: en.wikipedia.org, other wikimedia projects
## access: all-access, desktop, mobile-app, mobile-web
## agent: all-agents, user, spider, bot
## granularity: daily, monthly
###############################################################################
def get_monthly_pvs(page_view_parameters, useragent, no_missing):
    no_missing['titlelist'] = [x.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","") for x in no_missing['Gene Wiki Page']]
    pginfo = []
    pgfails = []
    print('obtaining wikipedia pageview information')
    pv_api_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/"
    for eachtitle in no_missing['titlelist']:
        try:
            url = pv_api_url+pv_params['access']+pv_params['agent']+eachtitle+"/"+pv_params['granularity']+pv_params['start']+"/"+pv_params['end']
            r = httprequests.get(url, headers=useragent)
            items = r.json()
            try:
                for item in items["items"]:
                    tmpdict = {'title':item["article"], 'views':int(item["views"]), 'granularity':item['granularity'],
                               'timestamp':item["timestamp"],'access':item['access'],'agent':item['agent']}
                    pginfo.append(tmpdict)
            except:
                tmpdict = {'title':title, 'views':-1, 'granularity':"no data",
                               'timestamp':"00000000",'access':"not data",'agent':"no data"}
                pginfo.append(tmpdict)            
        except:
            pgfails.append(eachtitle)
        time.sleep(1)

    pginfodf = pd.DataFrame(pginfo)
    
    return(pginfodf, pgfails)    

In [5]:
###############################################################################
## This module uses mwclient to pull page size and edit stats on wikipedia pages  
## for each gene given a list of gene wikipedia titles
###############################################################################
def get_wiki_volume_info (mwsite,no_missing):
    print('obtaining wikipedia volume information')
    no_missing['titlelist'] = [x.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","") for x in no_missing['Gene Wiki Page']]
    pageinfo=[]
    pagefails = []
    for eachpage in no_missing['titlelist'].tolist():
        tempdict={} #title, length/size, last_revised, last_revision_id
        try:
            checkitem = mwsite.api('query', prop='info', titles=eachpage)
            results1 = checkitem['query']['pages']
            for item in results1:
                base = str(item)
                results2 = results1[base]
                tempdict['title']=str(results2['title'])
                tempdict['page_length']=int(results2['length'])
                tempdict['last_touched']=str(results2['touched'])
                tempdict['lastrevid']=str(results2['lastrevid'])
                pageinfo.append(tempdict)               
        except:
            pagefails.append(eachpage)
            pass 
        time.sleep(1)
    return(pageinfo,pagefails)

In [6]:
###############################################################################
## This module uses mwclient to get revision ids
###############################################################################
from time import mktime

def get_revid(site,pagetitle,starttime):
    page = site.pages[pagetitle]
    revidlist = []
    for revision in page.revisions():
        dt = datetime.fromtimestamp(mktime(revision['timestamp']))
        if dt <= datetime.strptime(starttime,'%Y%m%d'):
            revidlist.append(revision['revid'])
    return(revidlist[0])

def get_latest_revid(site,pagetitle):
    page = site.pages[pagetitle]
    allrevisions = list(page.revisions(prop='ids'))  
    last_revision_id = allrevisions[-1]['revid']
    return(last_revision_id)

In [7]:
###############################################################################
## This module uses mwclient to compare revisions
###############################################################################
def compare_revisions(mwsite,pagetitle,oldrevid,latestrevid):
    compare_result = mwsite.get('compare', fromrev=latestid, torev=oldrevid, fromtitle=pagetitle,
                              totitle=pagetitle)
    return(compare_result['compare']['*'])

In [8]:
###############################################################################
## This module uses pywikibot to get the text from old versions of wikipedia
## pages
###############################################################################

def get_six_months_prior(adatestring):
    dateinfo = datetime.strptime(adatestring, "%Y%m%d")
    six_months_prior = dateinfo - relativedelta(months = 6)
    starttime = datetime.strftime(six_months_prior,"%Y%m%d")
    return(starttime)


def get_old_page_length(pagetitle, oldrevid):
    pwsite = pwb.Site("en", "wikipedia")
    pwpage = pwb.Page(pwsite, pagetitle)
    text = pwpage.getOldVersion(oldid = oldrevid)
    return(len(text))


def get_old_page_volumes(mwsite,no_missing):
    print('obtaining old wikipedia volume information')
    no_missing['titlelist'] = [x.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","") for x in no_missing['Gene Wiki Page']]
    pageinfo=[]
    for i in range(len(no_missing)):
        pagetitle = no_missing.iloc[i]['titlelist']
        updatedate = no_missing.iloc[i]['Wikipedia update period']
        starttime = get_six_months_prior(updatedate)
        tempdict={'title':pagetitle,'Wikipedia update period':updatedate,'6 months before update':starttime}
        try:
            oldrevid = get_revid(mwsite,pagetitle,starttime)
            oldpagevolume = get_old_page_length(pagetitle,oldrevid)
            tempdict['first revision prior to 6 month date'] = oldrevid
            tempdict['corresponding length'] = oldpagevolume
            pageinfo.append(tempdict)
        except:
            ## The page did not exist six months prior to the author adding, so page volume prior is 0
            tempdict['first revision prior to 6 month date'] = 0
            tempdict['corresponding length'] = 0
            pageinfo.append(tempdict)               
        time.sleep(1)
    return(pageinfo)    

# Pulling gene specific infor by Wikipedia titles

In [9]:
## Import the urls for the genes
gene_wiki_info = read_csv(datapath+'GeneWikiReviewlist.tsv',delimiter='\t', header=0)
no_missing = gene_wiki_info.loc[~gene_wiki_info['Gene Wiki Page'].isna()].copy()
no_missing['titlelist'] = [x.replace(" ","_").replace("https://","http://").replace("http://en.wikipedia.org/wiki/","") for x in no_missing['Gene Wiki Page']]
no_missing['Wikipedia update period'] = no_missing['Wikipedia update period'].astype(int)
no_missing['Wikipedia update period'] = no_missing['Wikipedia update period'].astype(str)
print(no_missing.iloc[0]['titlelist'])


Surfactant_protein_A1


In [5]:
print(no_missing.head(n=2))

  Batch  Gene_id Date anticipated Latest email sent (MJ) GW_title  \
0   NaN   653509   July--COMPLETE              7/18/2012   SFTPA1   
1   NaN   729238   July--COMPLETE              7/18/2012   SFTPA2   

  corresponding author Journal state  \
0               Floros     published   
1               Floros     published   

                                              status  \
0  http://www.sciencedirect.com/science/article/p...   
1  http://www.sciencedirect.com/science/article/p...   

                                        Pubmed Gene Wiki Status  \
0  http://www.ncbi.nlm.nih.gov/pubmed/23069847         Complete   
1  http://www.ncbi.nlm.nih.gov/pubmed/23069847         Complete   

                                      Gene Wiki Page  \
0  https://en.wikipedia.org/wiki/Surfactant_prote...   
1  https://en.wikipedia.org/wiki/Surfactant_prote...   

                         Notes Acknowledgements Grant ID Editor    PMCID  \
0  Completed along with SFTPA2                       Na

In [6]:
basic_info = no_missing[['Batch','Gene Wiki Page','status','Wikipedia update period']]
basic_info.to_csv('results/basic_info.tsv',sep='\t',header=True)

In [15]:
%%time
## Get Wikipedia info for gene wiki articles
pageinfo,pagefails = get_wiki_volume_info(mwsite,no_missing)
wikiinfo = pd.DataFrame(pageinfo)
print(wikiinfo.head(n=2))

wikiinfo.to_csv(exppath+'gene_wiki_vol_info.tsv',sep='\t',header=True)

obtaining wikipedia volume information
                   title  page_length          last_touched   lastrevid
0  Surfactant protein A1        31194  2021-12-01T01:46:22Z  1056213617
1  Surfactant protein A2        31602  2021-12-07T14:12:19Z  1056087140
Wall time: 2min 33s


In [16]:
%%time
## Get past Wikipedia info for gene wiki articles
pageinfo = get_old_page_volumes(mwsite,no_missing)
wikiinfo = pd.DataFrame(pageinfo)
print(wikiinfo.head(n=2))

wikiinfo.to_csv(exppath+'gene_wiki_vol_info-BEFORE.tsv',sep='\t',header=True)

obtaining old wikipedia volume information
                   title Wikipedia update period 6 months before update  \
0  Surfactant_protein_A1                20120830               20120229   
1  Surfactant_protein_A2                20120830               20120229   

   first revision prior to 6 month date  corresponding length  
0                             462319030                  5903  
1                             204784710                    21  
Wall time: 3min 53s


In [None]:
#### Test of functions
pagetitle = 'Surfactant_protein_A1'
starttime = '20120229'
oldrevid = get_revid(mwsite,pagetitle,starttime)
print(oldrevid)
oldpagevolume = get_old_page_length(pagetitle,oldrevid)
print(oldpagevolume)

In [17]:
%%time
#### Get Page views for each Gene Wiki Review wikipedia entry

#pages = ["Cyclin-dependent kinase 1", "Reelin"] ## for unit test

pv_params = {'project':'en.wikipedia',
             'access':'all-access/',
             'agent':'user/',
             'granularity':'monthly/',
             'start':'20130101',
             'end':'20211115'}

gene_monthly_pvs,pgfails = get_monthly_pvs(pv_params,useragent, no_missing)
print(gene_monthly_pvs.head(n=2))

gene_monthly_pvs.to_csv(exppath+'gene_wiki_views.tsv',sep='\t',header=True)

gene_monthly_views = pd.pivot_table(gene_monthly_pvs[['timestamp','title','views']],
                                        values='views',index='title',columns='timestamp')
gene_pvs = gene_monthly_views.reset_index()
gene_pvs.rename(columns={'title':'wikipedia'},inplace=True)
#print(gene_pvs)
gene_pvs.to_csv(exppath+'gw_pvs.tsv',sep='\t',header=True)

obtaining wikipedia pageview information
                   title  views granularity   timestamp      access agent
0  Surfactant_protein_A1    146     monthly  2017090100  all-access  user
1  Surfactant_protein_A1    125     monthly  2017100100  all-access  user
Wall time: 2min 47s


# Pull all statements added for series via SPARQL queries

Query Wikidata for P179 (part of series) of Q108807010 (Gene Wiki Review Series). Then identify statements that use any member of the query results as a reference

In [10]:
## Run the sparql query to retrieve all Articles in this series

def fetch_gwreviews_wd():
    url = 'https://query.wikidata.org/sparql'
    query = """
    SELECT ?item ?itemLabel ?PubMedCentID 
    WHERE 
    {
      ?item wdt:P179 wd:Q108807010.
        ?item wdt:P932 ?PubMedCentID
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } # Helps get the label in your language, if not, then en language
    }
    """
    r = httprequests.get(url, params = {'format': 'json', 'query': query})
    data = r.json()
    datadf = pd.DataFrame(data['results']['bindings'])
    datadf['uri'] = [x['value'] for x in datadf['item']]
    datadf['label'] = [x['value'] for x in datadf['itemLabel']]
    datadf['QID'] = [x.replace('http://www.wikidata.org/entity/','') for x in datadf['uri']]
    datadf['PMCID'] = [x['value'] for x in datadf['PubMedCentID']]
    cleandata = datadf[['uri','label','QID','PMCID']].copy()
    return(cleandata)


def load_props_to_check(DATAPATH):
    propinfo = read_csv(os.path.join(DATAPATH,'propertylist.tsv'),delimiter='\t',header=0)
    return(propinfo)


def clean_up_results(wdjson,pid):
    tmpdf = pd.DataFrame(wdjson['results']['bindings'])
    tmpdf['subjectQID'] = [x['value'].replace("http://www.wikidata.org/entity/","") for x in tmpdf['item']]
    tmpdf['objectQID'] = [x['value'].replace("http://www.wikidata.org/entity/","") for x in tmpdf['value']]
    tmpdf['subject'] = [x['value'] for x in tmpdf['itemLabel']]
    tmpdf['object'] = [x['value'] for x in tmpdf['valueLabel']]
    tmpdf['predicatePID'] = pid
    try:
        tmpdf['qualifierID'] = [x['value'].replace("http://www.wikidata.org/entity/","") for x in tmpdf['qualifier']]
        tmpdf['qualifier'] = [x['value'] for x in tmpdf['qualifierLabel']]
    except:
        tmpdf['qualifierID'] = "None"
        tmpdf['qualifier'] = "None"
    cleandf = tmpdf[['subjectQID','predicatePID','objectQID','subject','object','qualifierID','qualifier']].copy()
    return(cleandf)

In [37]:
%%time
#### With Qualifiers
## Run query to retrieve all statements that reference the above articles
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
propinfo = load_props_to_check(DATAPATH)
usefulpids = propinfo['Property'].loc[propinfo['PropertyUse']=='main'].unique().tolist()

cleandata = fetch_gwreviews_wd()
refids = cleandata['QID'].unique().tolist()

url = 'https://query.wikidata.org/sparql'

resultdf = pd.DataFrame(columns = (['subjectQID','predicatePID','objectQID','subject','object','qualifierID','qualifier']))
for refqid in refids:
    for pid in usefulpids:
        #refqid = 'Q65950306' ## For testing
        #pid = 'P1916' ## For testing
        querybase = f"""
        SELECT DISTiNCT ?item ?itemLabel ?value ?valueLabel ?qualifier ?qualifierLabel
        WHERE {{
          ?item ?p ?statement.
          ?statement prov:wasDerivedFrom ?ref . 
          ?ref pr:P248 wd:{refqid} .
          ?statement pq:P459 ?qualifier .
          ?item wdt:{pid} ?value
                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        """        
        try:
            r = httprequests.get(url, params = {'format': 'json', 'query': querybase})
            tmpdata = r.json()
            if len(tmpdata['results']['bindings']) <= 0:
                no_result_flag = True
            else:
                cleandf = clean_up_results(tmpdata,pid)
                resultdf = pd.concat((resultdf,cleandf),ignore_index=True)
        except:
            continue
        time.sleep(2)

print(resultdf.head(n=2))
resultdf.to_csv('results/wd_statements_added.tsv',sep='\t',header=True)



  subjectQID predicatePID objectQID      subject       object qualifierID  \
0     Q57055         P769   Q423364  paracetamol  propranolol   Q23173789   
1     Q57055         P769  Q1135705  paracetamol    rifabutin   Q23173789   

  qualifier  
0       EXP  
1       EXP  
Wall time: 2h 50min


In [36]:
## This is a test
url = 'https://query.wikidata.org/sparql'
query = """
SELECT DISTiNCT ?item ?itemLabel ?value ?valueLabel ?qualifier ?qualifierLabel
WHERE {
  ?item ?p ?statement.
  ?statement prov:wasDerivedFrom ?ref . 
  ?ref pr:P248 wd:Q102060922 . # Replace with specific QID of article as a reference under 'stated in' within a statement
  ?statement pq:P459 ?qualifier .
  ?item wdt:P2293 ?value  # Specify property of statement (in this example, Genetic Assocation)
        SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
r = httprequests.get(url, params = {'format': 'json', 'query': query})
test = r.json()
check = pd.DataFrame(test['results']['bindings'])
qlabel = check.iloc[0]['qualifierLabel']
print(check)


cleantest = clean_up_results(test,'P2293')
print(cleantest)

                                           qualifier  \
0  {'type': 'uri', 'value': 'http://www.wikidata....   

                                                item  \
0  {'type': 'uri', 'value': 'http://www.wikidata....   

                                               value  \
0  {'type': 'uri', 'value': 'http://www.wikidata....   

                                           itemLabel  \
0  {'xml:lang': 'en', 'type': 'literal', 'value':...   

                                          valueLabel  \
0  {'xml:lang': 'en', 'type': 'literal', 'value':...   

                                      qualifierLabel  
0  {'xml:lang': 'en', 'type': 'literal', 'value':...  
  subjectQID predicatePID objectQID subject         object qualifierID  \
0  Q18033696        P2293    Q41112   KALRN  schizophrenia   Q23174671   

  qualifier  
0       IMP  


### Alternative approach
The above sparql query is giving a lot of false positives. To bypass, try simplifying the query to pull all Wikidata items that have statements which reference a Gene Wiki Review. Request the entire content for each item from the MediaWiki API (https://www.wikidata.org/w/api.php?action=wbgetclaims&entity=Q108&format=json) then parse it to extract only the statements with the correct references. Note that these may be very nested; however, this may be more comprehensive that trying to query for specific properties.

In [11]:
%%time
## Run query to retrieve all items statements that reference the above articles
script_path = ''
DATAPATH = os.path.join(script_path,'data/')
propinfo = load_props_to_check(DATAPATH)
usefulpids = propinfo['Property'].loc[propinfo['PropertyUse']=='main'].unique().tolist()

cleandata = fetch_gwreviews_wd()
refids = cleandata['QID'].unique().tolist()

url = 'https://query.wikidata.org/sparql'

resultdf = pd.DataFrame(columns = (['subjectQID','subject']))
for refqid in refids:
    querybase = f"""
    SELECT DISTiNCT ?item ?itemLabel
    WHERE {{
      ?item ?p ?statement.
      ?statement prov:wasDerivedFrom [pr:P248 wd:{refqid}] . 
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """        
    try:
        r = httprequests.get(url, params = {'format': 'json', 'query': querybase})
        tmpdata = r.json()
        if len(tmpdata['results']['bindings']) <= 0:
            no_result_flag = True
        else:
            tmpdf = pd.DataFrame(tmpdata['results']['bindings'])
            tmpdf['subjectQID'] = [x['value'].replace("http://www.wikidata.org/entity/","") for x in tmpdf['item']]
            tmpdf['subject'] = [x['value'] for x in tmpdf['itemLabel']]
            resultdf = pd.concat((resultdf,tmpdf[['subjectQID','subject']].copy()),ignore_index=True)
    except:
        continue
    time.sleep(2)

print(resultdf.head(n=2))
print(len(resultdf['subjectQID'].unique().tolist()))

  subjectQID       subject
0    Q155746  paricalcitol
1    Q415571     Bleomycin
37
Wall time: 3min 6s


In [None]:
def extract_q(childict):
    try:
        found_qid = childict['datavalue']['value']['id']
    except:
        found_qid = False
    return(found_qid)


def search_records(jsonchild):
    try: ##test if jsonchild is a list or dictionary
        keylist = list(jsonchild.keys())
    except:
        keylist = range(len(jsonchild))
    for eachkey in keylist:
        childict = jsonchild[eachkey]
        found_qid = extract_q(childict)
        if found_qid != False:
            break
    return(found_qid)

def search_dict(grandparentdict):

In [12]:
def get_gw_qualifier(qualifierlist):
    qualinfo = []
    qualifier_props = list(qualifierlist.keys())
    for eachprop in qualifier_props:
        print(eachprop)
        for i in len(range(qualifierlist[eachprop])):
            try:
                qual_prop = qualifierlist[eachprop][i]['property']
                qual_qid = qualifierlist[eachprop][i]['datavalue']['value']['id']
                qualinfo.append({"qualifier_property":qual_prop,
                                 "qualifierID":qual_id})
            except:
                continue
    if len(qualinfo)<1:
        qualinfo = False
    return(qualinfo)


def get_gw_reference(referencelist,refids):
    for j in range(len(referencelist)):
        a_reference = referencelist[j]['snaks']
        refprops = list(referencelist[j]['snaks'].keys())
        for eachrefprop in refprops:
            for k in range(len(eachrefprop)):
                try:
                    reference_qid = a_reference[eachrefprop][k]['datavalue']['value']['id']
                except:
                    reference_qid = False
                if reference_qid in refids:
                    break_out_flag = True
                    break
                else: 
                    reference_qid = False
            if break_out_flag == True:
                break
        if break_out_flag == True:
            break
    return(eachrefprop, reference_qid)


def parse_qjson(eachentity,qjson,refids):
    saved_results = []
    propertylist = list(qjson['claims'].keys())
    for eachprop in propertylist:
        statementprop = eachprop
        for i in range(len(qjson['claims'])):
            try:
                statementobject = qjson['claims'][eachprop][i]['mainsnak']['datavalue']['value']['id']
            except:
                try:
                    statementobject = qjson['claims'][eachprop][i]['mainsnak']['datavalue']['value']
                except:
                    continue
                    #print(i, qjson['claims'][eachprop][i].keys())
            try:
                qualifierlist = qjson['claims'][eachprop][i]['qualifiers']
                qualinfo = get_gw_qualifier(qualifierlist)
                print(qualinfo)
            except:
                qualinfo = False
            try:
                referencelist = qjson['claims'][eachprop][i]['references']
                reference_property, reference_qid = get_gw_reference(referencelist,refids)
            except:
                reference_qid = False
            if ((reference_qid != False) and (qualinfo) != False):
                tmpdict = {'subjectID':eachentity,
                           'predicateID':statementprop,
                           'objectID':statementobject,
                           'referenceProp':reference_property,
                           'referenceID':reference_qid}
                for eachqual in qualinfo:
                    saved_results.append(tmpdict.update(eachqual))
            elif ((reference_qid != False) and (qualinfo) == False):
                tmpdict = {'subjectID':eachentity,
                           'predicateID':statementprop,
                           'objectID':statementobject,
                           'referenceProp':reference_property,
                           'referenceID':reference_qid,
                           'qualifier_property':False,
                           'qualifierID':False
                          }
    if len(saved_results)>0:
        saved_resultdf = pd.DataFrame(saved_results)
    else:
        saved_resultdf = False
    return(saved_resultdf)

In [72]:
entitylist = resultdf['subjectQID'].unique().tolist()

for eachentity in entitylist[0:1]:
    ##fetch the json
    print(eachentity)
    r = httprequests.get(f'https://www.wikidata.org/w/api.php?action=wbgetclaims&entity={eachentity}&format=json')
    qjson = r.json()
    testdf = pd.DataFrame(qjson)
    testdf.reset_index(inplace=True)
    testdf.rename(columns={'index':'property'},inplace=True)
    expandeddf = testdf.explode('claims').copy()
    print(expandeddf.iloc[0]['claims'])
    expandeddf['keyinfo'] = [x.keys() for x in expandeddf['claims']]
    expandeddf['objectVal'] = [x['mainsnak']['datavalue']['value'] for x in expandeddf['claims']]
    #print(len(expandeddf),len(explodeddf))
    #expandeddf['type'] = [x['type']['value'] for x in expandeddf['claims']]
    #expandeddf['objectID'] = [x['id'] for x in expandeddf['claims']]
    print(expandeddf)


Q155746
{'mainsnak': {'snaktype': 'value', 'property': 'P117', 'hash': '5333d73dc6d61b73530cbad4828d21983f30f1ea', 'datavalue': {'value': 'Paricalcitol.svg', 'type': 'string'}, 'datatype': 'commonsMedia'}, 'type': 'statement', 'id': 'Q155746$b7e0eaaf-4a18-48b5-3baf-a544db2728a4', 'rank': 'normal'}
   property                                             claims  \
0      P117  {'mainsnak': {'snaktype': 'value', 'property':...   
1      P129  {'mainsnak': {'snaktype': 'value', 'property':...   
2     P2017  {'mainsnak': {'snaktype': 'value', 'property':...   
3     P2062  {'mainsnak': {'snaktype': 'value', 'property':...   
4     P2063  {'mainsnak': {'snaktype': 'value', 'property':...   
5     P2067  {'mainsnak': {'snaktype': 'value', 'property':...   
6     P2115  {'mainsnak': {'snaktype': 'value', 'property':...   
7     P2175  {'mainsnak': {'snaktype': 'value', 'property':...   
7     P2175  {'mainsnak': {'snaktype': 'value', 'property':...   
7     P2175  {'mainsnak': {'snaktype': 'v

In [13]:
entitylist = resultdf['subjectQID'].unique().tolist()

statements_added = pd.DataFrame(columns = ['subjectID', 'predicateID',
                                            'objectID', 'referenceProp', 
                                            'referenceID', 'qualifier_property',
                                            'qualifierID'])
for eachentity in entitylist[0:1]:
    ##fetch the json
    print(eachentity)
    r = httprequests.get(f'https://www.wikidata.org/w/api.php?action=wbgetclaims&entity={eachentity}&format=json')
    qjson = r.json()
    retrieved_df = parse_qjson(eachentity,qjson,refids)
    if retrieved_df != False:
        statements_added = pd.concat((statements_added,retrieved_df))

print(statements_added)  

Q155746
P4390
P2868
P459
P1810
P642
Empty DataFrame
Columns: [subjectID, predicateID, objectID, referenceProp, referenceID, qualifier_property, qualifierID]
Index: []


In [50]:
entitylist = resultdf['subjectQID'].unique().tolist()

for eachentity in entitylist[0:1]:
    ##fetch the json
    print(eachentity)
    r = httprequests.get(f'https://www.wikidata.org/w/api.php?action=wbgetclaims&entity={eachentity}&format=json')
    qjson = r.json()
    print(qjson['claims']['P231'][0].keys())
    print(qjson['claims']['P231'][0]['mainsnak']['property'])
    print(qjson['claims']['P231'][0]['mainsnak']['datavalue']['value'])
    print(qjson['claims']['P231'][0]['references'][0]['snaks'].keys())
    print(qjson['claims']['P231'][0]['references'][0]['snaks']['P248'][0].keys())
    print(qjson['claims']['P231'][0]['references'][0]['snaks']['P248'][0]['datavalue']['value']['id'])

Q155746
dict_keys(['mainsnak', 'type', 'id', 'rank', 'references'])
P231
131918-61-1
dict_keys(['P248', 'P652', 'P407', 'P1476', 'P813'])
dict_keys(['snaktype', 'property', 'hash', 'datavalue', 'datatype'])
Q6593799


In [65]:
print(1,qjson['claims']['P3771'][0].keys())
print(2,qjson['claims']['P3771'][0]['mainsnak']['property'])
print(3,qjson['claims']['P3771'][0]['mainsnak']['datavalue']['value'])
print(4,qjson['claims']['P3771'][0]['qualifiers'].keys())
print(5,qjson['claims']['P3771'][0]['qualifiers']['P459'])
print(6,qjson['claims']['P3771'][0]['qualifiers']['P459'][0].keys())
print(7,qjson['claims']['P3771'][0]['qualifiers']['P459'][0]['property'])
print(8,qjson['claims']['P3771'][0]['qualifiers']['P459'][0]['datavalue']['value']['id'])

1 dict_keys(['mainsnak', 'type', 'qualifiers', 'qualifiers-order', 'id', 'rank', 'references'])
2 P3771
3 {'entity-type': 'item', 'numeric-id': 14906849, 'id': 'Q14906849'}
4 dict_keys(['P459', 'P3811'])
5 [{'snaktype': 'value', 'property': 'P459', 'hash': '61a4ff3177c88526c2a0b9c033c3e5f101a1c4f5', 'datavalue': {'value': {'entity-type': 'item', 'numeric-id': 23173789, 'id': 'Q23173789'}, 'type': 'wikibase-entityid'}, 'datatype': 'wikibase-item'}]
6 dict_keys(['snaktype', 'property', 'hash', 'datavalue', 'datatype'])
7 P459
8 Q23173789


## Pull Citation Metrics for All Gene Wiki Reviews
There is a SERP API for accessing information from google scholar. This API can potentially be used to pull the number of citations garnered by each Gene Wiki Review article.

Example API Call - https://serpapi.com/playground?engine=google_scholar&q=Gene+Wiki+Reviews-Raising+the+quality+and+accessibility+of+information+about+the+human+genome&hl=en

**Note that it appears the results for the API call are not machine-readable as it's a playground API meant for user exploration. Since it's unclear whether or not the Gene Wiki Review editorial will constitute a commercial use, we'l just manually check google scholar and pull the information.**

In [16]:
## Test the use of ther SERPAPI
## As evidenced by the results, the API playground is not meant for any real use
baseapiurl = 'https://serpapi.com/playground?engine=google_scholar&q='
apicallend = '&hl=en'
title = cleandata.iloc[0]['label'].replace(" ","+")
testurl = f"{baseapiurl}{title}{apicallend}"
results = httprequests.get(testurl)
print(results.text)

The KCNE2 K+ channel regulatory subunit: Ubiquitous influence, complex pathobiology


In [25]:
#### Step 1, pull all gw reviews from the file
gene_wiki_info = read_csv(datapath+'GeneWikiReviewlist.tsv',delimiter='\t', header=0)
#print(gene_wiki_info)
publishlist = gene_wiki_info['status'].unique().tolist()
pubsinfo = gene_wiki_info[['status','Pubmed','PMCID']].loc[gene_wiki_info['status'].isin(publishlist)].copy()
pubsinfo.drop_duplicates(inplace=True,keep='first')
pubsinfo['PMCID']=pubsinfo['PMCID'].astype(str)
print(pubsinfo.head(n=2))

                                              status  \
0  http://www.sciencedirect.com/science/article/p...   
2  http://www.sciencedirect.com/science/article/p...   

                                        Pubmed            PMCID  
0  http://www.ncbi.nlm.nih.gov/pubmed/23069847          3570704  
2  http://www.ncbi.nlm.nih.gov/pubmed/23246696  NIHMS ID 909445  


In [26]:
#### Step 2, pull all Gene Wiki Reviews from Wikidata and their Titles (ie- the label)
cleandata = fetch_gwreviews_wd()
cleandata['PMCID'] = cleandata['PMCID'].astype(str)
print(cleandata.head(n=2))

                                        uri  \
0  http://www.wikidata.org/entity/Q21710689   
1  http://www.wikidata.org/entity/Q21710694   

                                               label        QID    PMCID  
0  The KCNE2 K+ channel regulatory subunit: Ubiqu...  Q21710689  4917011  
1  Structural and functional biology of arachidon...  Q21710694  6728142  


In [29]:
#### Step 3- Merge the tables from Step 1 and Step 2
allpubinfo = pubsinfo.merge(cleandata, on='PMCID',how = 'outer')
print(allpubinfo.tail(n=2))
#allpubinfo.to_csv(os.path.join(exppath,'article_citations.tsv'),sep='\t',header=True)
## Note, citations will be pulled manually, so this file exists only to make manual pulls easier to track

                                               status  \
95  https://www.sciencedirect.com/science/article/...   
96                                                NaN   

                                       Pubmed    PMCID  \
95  https://pubmed.ncbi.nlm.nih.gov/34252531/  8318780   
96                                        NaN  6660134   

                                          uri  \
95  http://www.wikidata.org/entity/Q108806643   
96   http://www.wikidata.org/entity/Q38584470   

                                                label         QID  
95  A role for zinc transporter gene SLC39A12 in t...  Q108806643  
96  Cardiac myosin-binding protein C (MYBPC3) in c...   Q38584470  
