# Creation of the Document Repository (Wikipedia)

## 1 Identification of the corpus from DBPedia

### 1.1 Get DBPedia concepts from the root concept

In [10]:
from SPARQLWrapper import SPARQLWrapper, JSON, RDF, XML
#from bd import BDdatos
import codecs
import json
from datetime import date

# Variables to start the query process

e = "http://live.dbpedia.org/sparql"  # set the EndPoint: DBPedia Live

childConcepts = []

In [11]:
def getSubConcepts_byLevel(rootConcept, q, l):
    """
        Get sub-concepts of the root concept(s). Retrieve subconcepts located in three hops.
    """
    
    varP = ['c', 'level']
    sparql = SPARQLWrapper(e)
    
    query = """select distinct ?level ?r ?c1 ?c2 ?c3
    {
        {VALUES (?r ?level) {(<%s> 1)}
            ?r ^skos:broader ?c1.}
        union
        {VALUES (?r ?level) {(<%s> 2)}
            ?r ^skos:broader ?c1. ?c1 ^skos:broader ?c2.}
        union
        {VALUES (?r ?level) {(<%s> 3)}
            ?r ^skos:broader ?c1. ?c1 ^skos:broader ?c2. ?c2 ^skos:broader ?c3.}
    }
    """
    numSC = 0 # number of subconcepts
    print("\t Processing...")
    for res in rootConcept:
        print("Root concept:", res)  
        sparql.setQuery(query%(res, res, res))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            numSC += 1
            var1 = var2 = var3 = var4 = ''
            if varP[0] in result:
                var1 = result[varP[0]]["value"]
            if varP[1] in result:
                var2 = result[varP[1]]["value"]
            if varP[2] in result:
                var3 = result[varP[2]]["value"]
            if varP[3] in result:
                var4 = result[varP[3]]["value"]
            childConcepts.append([date.today().strftime("%d/%m/%Y"), res, var1, var2, var3, var4])
    
    print("Number of results:", numSC)
    

In [12]:
def getSubConcepts(rootConcept):
    """
        Get sub-concepts of the root concept(s). Retrieve subconcepts located in three hops.
    """
    
    varP = ['c1', 'c2', 'c3', 'level']
    sparql = SPARQLWrapper(e)
    
    query = """select distinct ?level ?r ?c1 ?c2 ?c3
    {
        {VALUES (?r ?level) {(<%s> 1)}
            ?r ^skos:broader ?c1.}
        union
        {VALUES (?r ?level) {(<%s> 2)}
            ?r ^skos:broader ?c1. ?c1 ^skos:broader ?c2.}
        union
        {VALUES (?r ?level) {(<%s> 3)}
            ?r ^skos:broader ?c1. ?c1 ^skos:broader ?c2. ?c2 ^skos:broader ?c3.}
    }
    """
    numSC = 0 # number of subconcepts
    print("\t Processing...")
    for res in rootConcept:
        print("Root concept:", res)  
        sparql.setQuery(query%(res, res, res))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            numSC += 1
            var1 = var2 = var3 = var4 = ''
            if varP[0] in result:
                var1 = result[varP[0]]["value"]
            if varP[1] in result:
                var2 = result[varP[1]]["value"]
            if varP[2] in result:
                var3 = result[varP[2]]["value"]
            if varP[3] in result:
                var4 = result[varP[3]]["value"]
            childConcepts.append([date.today().strftime("%d/%m/%Y"), res, var1, var2, var3, var4])
    
    print("Number of results:", numSC)
    

In [13]:
# To RUN:
getSubConcepts(["http://dbpedia.org/resource/Category:COVID-19"])  # Get subconcepts from the root concept

	 Processing...
Root concept: http://dbpedia.org/resource/Category:COVID-19
Number of results: 170


In [33]:
childConcepts[44:50]

# lists, by_location

[['16/05/2022',
  'http://dbpedia.org/resource/Category:COVID-19',
  'http://dbpedia.org/resource/Category:Medical_responses_to_the_COVID-19_pandemic',
  'http://dbpedia.org/resource/Category:Hospitals_established_for_the_COVID-19_pandemic',
  '',
  '2'],
 ['16/05/2022',
  'http://dbpedia.org/resource/Category:COVID-19',
  'http://dbpedia.org/resource/Category:COVID-19_pandemic',
  'http://dbpedia.org/resource/Category:COVID-19_pandemic_in_popular_culture',
  'http://dbpedia.org/resource/Category:Books_about_the_COVID-19_pandemic',
  '3'],
 ['16/05/2022',
  'http://dbpedia.org/resource/Category:COVID-19',
  'http://dbpedia.org/resource/Category:COVID-19_pandemic',
  'http://dbpedia.org/resource/Category:COVID-19_pandemic_in_popular_culture',
  'http://dbpedia.org/resource/Category:Works_about_the_COVID-19_pandemic',
  '3'],
 ['16/05/2022',
  'http://dbpedia.org/resource/Category:COVID-19',
  'http://dbpedia.org/resource/Category:COVID-19_pandemic',
  'http://dbpedia.org/resource/Catego

### 1.2 Getting a unique list of concepts:

In [34]:
import pandas as pd

print("...Getting unique concepts")

blackList = ['organization', 'deaths', 'books', 'timelines', 'events', 'monuments', 'songs', 'film', 'album',  'television', 'films', 'company', 'list', 'http://dbpedia.org/ontology/']

#childConceptsDF = pd.DataFrame(childConcepts, columns = ['date', 'root', 'childLl1', 'childLl2', 'childLl3', 'level']) # ORIGINAL

childConceptsDF = pd.DataFrame(childConcepts[0:44], columns = ['date', 'root', 'childLl1', 'childLl2', 'childLl3', 'level'])

conceptList = list(childConceptsDF.childLl1.unique()) + list(childConceptsDF.childLl2.unique()) + list(childConceptsDF.childLl3.unique())



...Getting unique concepts


In [36]:
def verify(c):
    for l in blackList:
        if l in c.lower():
            return False
    return True

conceptListSel = [c for c in conceptList if verify(c) and len(c)>0]
print(len(conceptListSel))

conceptListSel = pd.DataFrame(set(conceptListSel), columns=['concept'])
conceptListSel.tail()

39


Unnamed: 0,concept
32,http://dbpedia.org/resource/Category:Israeli_C...
33,http://dbpedia.org/resource/Category:Vietnames...
34,http://dbpedia.org/resource/Category:Statistic...
35,http://dbpedia.org/resource/Category:COVID-19_...
36,http://dbpedia.org/resource/Category:Medical_r...


In [38]:
# Depuración de conceptos
conceptListSel.to_csv('categories.csv', index = False)

# http://dbpedia.org/resource/Category:Songs_about_the_COVID-19_pandemic
#http://dbpedia.org/resource/Category:COVID-19_pandemic_in_the_United_States_in_popular_culture
#http://dbpedia.org/resource/Category:COVID-19_pandemic_in_popular_culture

In [45]:
# Depuración manual de categorías (conceptListSel):

conceptListSel = pd.read_csv('categories.csv')

In [47]:
conceptListSel

Unnamed: 0,concept
0,http://dbpedia.org/resource/Category:COVID-19
1,http://dbpedia.org/resource/Category:Works_abo...
2,http://dbpedia.org/resource/Category:Impact_of...
3,http://dbpedia.org/resource/Category:COVID-19_...
4,http://dbpedia.org/resource/Category:COVID-19_...
5,http://dbpedia.org/resource/Category:Deploymen...
6,http://dbpedia.org/resource/Category:COVID-19_...
7,http://dbpedia.org/resource/Category:COVID-19_...
8,http://dbpedia.org/resource/Category:COVID-19_...
9,http://dbpedia.org/resource/Category:COVID-19_...


### 1.3. Getting Wikipage links and their metadata from DBPedia Live:

<b>a. Creating the table:</b>

In [48]:
wikiPages = []

In [49]:
###########
# Get Pages:

import time, random

#S = requests.Session()

def getWikiPagesFromDBConcepts():
    #cur = con.cursor()
    print("Getting links from DBpedia:")
    # Insert data into new table:
    #cur.execute("""select distinct id, c from """ + stable + """ where id >""" +str(l1)+' and id <= ' + str(l2))
    #rowList = cur.fetchall()
    for row in conceptListSel.iterrows():
        idC = row[0]
        cat = row[1][0]
        print(idC, cat)
        results = getMetadata(cat)
        if results:
            for result in results:
                res = page = max_modif  = ''
                max_outDegree = 0
                res = result['r']["value"]
                if 'wikiPage' in result:
                    page = result['wikiPage']["value"]
                if 'classDBR' in result:
                    classDBR = result['classDBR']["value"]
                if 'wikiPageID' in result:
                    pageId = result['wikiPageID']["value"]                    
                if 'max_outDegree' in result:
                    max_outDegree = result['max_outDegree']["value"]
            
                wikiPages.append([date.today().strftime("%Y-%m-%d"), cat, res, classDBR, page, pageId, max_outDegree])
        time.sleep(2+random.uniform(0, 10))

def getMetadata(cat, user_agent='Mozilla/5.0', num_retries=3):
    sparql = SPARQLWrapper(e)
    try:
        # Pendiente el type de cada recurso:
        q = """SELECT DISTINCT ?r (GROUP_CONCAT(DISTINCT ?class; SEPARATOR=';') AS ?classDBR) ?wikiPage ?wikiPageID
        (str(max(?outDegree)) AS ?max_outDegree)
        WHERE{
        VALUES ?c {<%s>}
        ?r dct:subject ?c .
        OPTIONAL {?r dbo:wikiPageModified ?modif.}
        OPTIONAL {?r dbo:wikiPageOutDegree ?outDegree.}
        OPTIONAL {?r foaf:isPrimaryTopicOf ?wikiPage.}
        OPTIONAL {?r rdf:type ?class.}
        OPTIONAL {?r dbo:wikiPageID ?wikiPageID}
        } GROUP BY ?r ?wikiPageID ?wikiPage"""
        sparql.setQuery(q%(cat))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
    except socket.error as error:
        print('Error querying:', e.reason)
        #html = None
        if num_retries > 0:
            if hasattr(error, 'code') and 500 <= error.code < 600:
                return getMetadata(cat, user_agent, num_retries-1)
    return results["results"]["bindings"]

In [50]:
getWikiPagesFromDBConcepts() # Send range of ids to get pages: 281 (max id from tableDBIDs) len(conceptListSel)

Getting links from DBpedia:
0 http://dbpedia.org/resource/Category:COVID-19
1 http://dbpedia.org/resource/Category:Works_about_the_COVID-19_pandemic
2 http://dbpedia.org/resource/Category:Impact_of_the_COVID-19_pandemic
3 http://dbpedia.org/resource/Category:COVID-19_conspiracy_theorists
4 http://dbpedia.org/resource/Category:COVID-19_pandemic
5 http://dbpedia.org/resource/Category:Deployment_of_COVID-19_vaccines
6 http://dbpedia.org/resource/Category:COVID-19_models
7 http://dbpedia.org/resource/Category:COVID-19_drug_development
8 http://dbpedia.org/resource/Category:COVID-19_vaccines
9 http://dbpedia.org/resource/Category:COVID-19_misinformation
10 http://dbpedia.org/resource/Category:Responses_to_the_COVID-19_pandemic
11 http://dbpedia.org/resource/Category:COVID-19_symptoms
12 http://dbpedia.org/resource/Category:Vietnamese_COVID-19_vaccines
13 http://dbpedia.org/resource/Category:Statistics_of_the_COVID-19_pandemic
14 http://dbpedia.org/resource/Category:COVID-19_pandemic_by_loca

In [51]:
len(wikiPages) 

blackList = ['organization', 'deaths', 'books', 'timelines', 'events', 'monuments', 'song', 'album', 'awards',  'television', 'films', 'company', 'list', 'person']

wikiPagesDF = pd.DataFrame(wikiPages, columns = ['date', 'concept', 'dbr', 'class', 'wikipage', 'wikiPageID', 'max_outDegree'])

wikiPagesDF = wikiPagesDF[['class', 'wikipage', 'wikiPageID']].drop_duplicates()

#wikiPageID = wikiPagesDF.wikiPageID.unique()
wikiPagesDF.head()

Unnamed: 0,class,wikipage,wikiPageID
0,http://www.w3.org/2002/07/owl#Thing,http://en.wikipedia.org/wiki/COVID-19_drug_rep...,63430824
1,http://dbpedia.org/class/yago/Abstraction10000...,http://en.wikipedia.org/wiki/Favipiravir,40872327
2,,http://en.wikipedia.org/wiki/Tika_Utsav,67366102
3,,http://en.wikipedia.org/wiki/Solidarity_trial,63544717
4,http://dbpedia.org/ontology/Agent;http://dbped...,http://en.wikipedia.org/wiki/Vaxart,63256201


In [52]:
wikiPageSel = pd.DataFrame(set([c[1][2] for c in wikiPagesDF.iterrows() if verify(c[1][0]+c[1][1])]), columns=['wikiPageID'])
print(len(wikiPageSel)) # 968
wikiPageSel[0:5]

276


Unnamed: 0,wikiPageID
0,63571861
1,67155086
2,67708405
3,65802682
4,64504133


In [53]:
pd.merge(wikiPageSel, wikiPagesDF[['wikiPageID', 'wikipage', 'class']]).to_csv('pages.csv')

wikiPageSel.wikiPageID.to_csv('wikiPages2022.csv')

## Getting metadata of pages:

In [22]:
#from bd import BDdatos
import codecs
import json
from datetime import date
import re 
#from bs4 import BeautifulSoup
import urllib
#import wikipedia
import requests

URL = "https://en.wikipedia.org/w/api.php"

S = requests.Session()

import pandas as pd

wikiPages = pd.read_csv('wikiPages.csv')
wikiPages = list(wikiPages.wikiPageID); wikiPages[5]

66188669

In [23]:
def getMetadata(pageid):
    pageid = str(pageid)
    PARAMS = {'action':'query',
        'pageids': pageid,
        'prop':'revisions|pageterms|info|pageviews',
        'rvprop':'ids|flags|timestamp|userid|user|size|comment|tags',
        'rvlimit':'500',
        'inprop':'url',
        "format": "json"
    }
    req = S.get(url=URL, params=PARAMS)
    data = req.json()
    #date = data["query"]['pages'][pageid]['revisions'][0]['timestamp'] 
    return data 

In [24]:
metadataPages = []  # está guardado en wikiPagesMetadata.csv
pageviews = []   # No está bien guardado

In [25]:
#t = [66188669]

from itertools import cycle

for pageid in wikiPages:
    print(pageid)
    description = firstRev = lastRev = revid = title = url = None
    numRev = 0
    metadata = getMetadata(pageid)
    if 'title' in metadata["query"]['pages'][str(pageid)]:
        title = metadata["query"]['pages'][str(pageid)]['title']
    if 'revisions' in metadata["query"]['pages'][str(pageid)]:
        numRev = len(metadata["query"]['pages'][str(pageid)]['revisions'])
        lastRev = metadata["query"]['pages'][str(pageid)]['revisions'][0]['timestamp'][0:10]
        firstRev = metadata["query"]['pages'][str(pageid)]['revisions'][numRev-1]['timestamp'][0:10]
        revid = metadata["query"]['pages'][str(pageid)]['revisions'][0]['revid']
    if 'terms' in metadata["query"]['pages'][str(pageid)]:
        if 'description' in metadata["query"]['pages'][str(pageid)]['terms']:
            description = ';'.join(metadata["query"]['pages'][str(pageid)]['terms']['description'])
    if 'pageviews' in metadata["query"]['pages'][str(pageid)]:
        pageviews.append(list(zip(cycle([pageid]), metadata["query"]['pages'][str(pageid)]['pageviews'])))
    if 'fullurl' in metadata["query"]['pages'][str(pageid)]:
        url = metadata["query"]['pages'][str(pageid)]['fullurl']
    metadataPages.append([pageid, title, description, numRev, firstRev, lastRev, revid, url])


66453368
65700804
63416817
63365057
67276075
66188669
63648639
67108361
63431821
63336518
63272141
63795020
63843248
68341990
63316213
68179014
65084949
67521729
63353421
63354227
63366021
63283461
63285738
62907551
68389568
63346121
63346138
65225706
63724991
67268564
65297373
63335287
67872083
65093951
63183247
59858954
67551548
63617842
65383730
63178636
63344613
63366078
63606106
68542482
63272917
64107722
65187376
56459486
67231916
53724240
63895130
64123343
66852270
67023308
63364910
63214876
66485227
66195384
66496391
62858715
65060446
63343158
68103267
63402681
61333254
68831719
64008008
64598873
66280945
63347679
66481246
63300653
63366066
67759831
63241849
65830027
63379589
67288169
63540207
66365133
66580737
63415229
63299502
67345750
67278940
64079576
67091267
68816648
68213064
63379158
63948936
62750956
63255772
63913842
63590050
65682288
64462283
63811805
61753359
66834168
63808158
48410011
66041794
68076170
63366079
67757201
67999171
65314337
878899
63627364
65527712
682

In [26]:
metadataPagesDF = pd.DataFrame(metadataPages, columns=['wikipageID', 'title', 'description', 'numRev', 'firstRev', 'lastRev', 'revid', 'url'])
metadataPagesDF.head(5) #[metadataPagesDF['url'] == None]

# To save metadata
metadataPagesDF.to_csv('wikiPagesMetadata2.csv')


<b> c. Validating Wikipage links (there are some redirects)</b>

## 2. Getting  data from Wikipedia

### 2.1 Extracting metadata of pages

<b> a. Creating tables </b>

In [27]:
wikiPageError = [] # etable 
wikiPageCategories = [] # ctable
wikiPageWikiProjects  = [] # ptable
wikiPageTOC  = [] # toctable

<b> b. Extracting metadata of Wikipages </b>

In [28]:
#from bd import BDdatos
import codecs
import json
from datetime import date
import re 
#from bs4 import BeautifulSoup
import urllib
#import wikipedia
import requests

URL = "https://en.wikipedia.org/w/api.php"

S = requests.Session()

#db = BDdatos()
#con = db.conectar()

################
# Get TOC:
def getPageID(l1, l2):
    #cur = con.cursor()
    print("Downloading information from Wikipedia:")
    #cur = con.cursor()
    #cur.execute("""SELECT distinct id, wikipageUrl FROM """ + otable + """ WHERE id >""" +str(l1)+' and id <= ' + str(l2))
    #rowList = cur.fetchall()
    wikipageList = metadataPagesDF[['wikipageID', 'url']]
    for row in range(l1, l2):
        idP = wikipageList.iloc[row][0]
        page =  wikipageList.iloc[row][1]
        print(idP, page)
        getMoreMetadata(idP, page)


def getMoreMetadata(idP, page):
    S = requests.Session() 
     
    #urllib.parse.unquote(page),
    PARAMS = {
        "action": "parse",
        "pageid": str(idP), 
        "format": "json"
    }

    R = S.get(url=URL, params=PARAMS)
    DATAC = R.json()
    if 'error' in DATAC:
        error = DATAC['error']['info']
        print(error)
        wikiPageError.append([date.today().strftime("%Y-%m-%d"), idP])
    else:    
        DATAC = DATAC['parse']

        #Wikipedia Categories:
        for l in DATAC['categories']:
            if 'hidden' not in l:
                wikiPageCategories.append([date.today().strftime("%Y-%m-%d"), idP, l['*']])

        #iwlinks, wikipedia projects:
        for l in DATAC['iwlinks']:
            wikiPageWikiProjects.append([date.today().strftime("%Y-%m-%d"), idP, l['prefix'], l['url'], l['*']])
    
        # TOC:
        toc = DATAC['sections']
        for i in toc:
            fromtitle = ''
            if 'fromtitle' in i:
                fromtitle = i['fromtitle']
            wikiPageTOC.append([date.today().strftime("%Y-%m-%d"), idP, i['toclevel'], i['level'], i['line'], i['number'], i['index'], fromtitle, i['anchor']])            


            

In [29]:

# To run:
metadataPagesDF.tail()
getPageID(0, 968) # 2637 validated pages


Downloading information from Wikipedia:
66453368 https://en.wikipedia.org/wiki/Responses_to_the_COVID-19_pandemic_in_April_2021
65700804 https://en.wikipedia.org/wiki/Statistics_of_the_COVID-19_pandemic_in_Malaysia
63416817 https://en.wikipedia.org/wiki/Media_coverage_of_the_COVID-19_pandemic
63365057 https://en.wikipedia.org/wiki/COVID-19_pandemic_in_the_Marshall_Islands
67276075 https://en.wikipedia.org/wiki/Sinopharm_WIBP_COVID-19_vaccine
66188669 https://en.wikipedia.org/wiki/COVID-19_vaccination_in_Switzerland
63648639 https://en.wikipedia.org/wiki/Statistics_of_the_COVID-19_pandemic_in_Brazil
67108361 https://en.wikipedia.org/wiki/Statistics_of_the_COVID-19_pandemic
63431821 https://en.wikipedia.org/wiki/Coronavirus_Tech_Handbook
63336518 https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Oman
63272141 https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Israel
63795020 https://en.wikipedia.org/wiki/Trikini
63843248 https://en.wikipedia.org/wiki/COVID-19_anti-lockdown_protests_in_t

In [30]:
#wikiPageCategories[0:10]

wikiPageError[0:5]  # solo un error: [['2021-11-26', 68542482], ['2021-11-29', 65225706, 'Page error']]

wikiPageCategories[0:5] #
wikiPageCategoriesDF = pd.DataFrame(wikiPageCategories, columns=['date', 'wikipageID', 'category']).to_csv('wikiPageCategories2.csv')
wikiPageWikiProjects[0:5] # ptable
wikiPageWikiProjectsDF = pd.DataFrame(wikiPageWikiProjects, columns=['date', 'wikipageID', 'project', 'link', 'linkName']).to_csv('wikiPageWikiProjects2.csv')


In [31]:
wikiPageTOCDF = pd.DataFrame(wikiPageTOC, columns=['date', 'wikipageID', 'toclevel', 'level', 'line', 'number', 'index', 'fromtitle', 'anchor'])
#wikiPageTOCDF[wikiPageTOCDF['line'].str.contains("<i>", case=False)]

# Clean html tag:
wikiPageTOCDF["line"].replace({"<i>": "", "</i>": ""}, inplace=True)

In [32]:
wikiPageTOCDF.to_csv('wikiPageTOC2.csv')

# ********************Desde aquí ejecutar y cargar wikiPageTOC desde wikiPageTOC.csv

In [2]:
import pandas as pd
wikiPageTOC =pd.read_csv('wikiPageTOC2.csv').values.tolist()
wikiPageTOC[0:5]

[[0,
  '2021-12-02',
  66453368,
  1,
  2,
  'Reactions and measures in Africa',
  '1',
  '1',
  'Responses_to_the_COVID-19_pandemic_in_April_2021',
  'Reactions_and_measures_in_Africa'],
 [1,
  '2021-12-02',
  66453368,
  1,
  2,
  'Reactions and measures in the Americas',
  '2',
  '2',
  'Responses_to_the_COVID-19_pandemic_in_April_2021',
  'Reactions_and_measures_in_the_Americas'],
 [2,
  '2021-12-02',
  66453368,
  1,
  2,
  'Reactions and measures in the Eastern Mediterranean',
  '3',
  '3',
  'Responses_to_the_COVID-19_pandemic_in_April_2021',
  'Reactions_and_measures_in_the_Eastern_Mediterranean'],
 [3,
  '2021-12-02',
  66453368,
  1,
  2,
  'Reactions and measures in Europe',
  '4',
  '4',
  'Responses_to_the_COVID-19_pandemic_in_April_2021',
  'Reactions_and_measures_in_Europe'],
 [4,
  '2021-12-02',
  66453368,
  1,
  2,
  'Reactions and measures in South and Southeast Asia',
  '5',
  '5',
  'Responses_to_the_COVID-19_pandemic_in_April_2021',
  'Reactions_and_measures_in_So

In [3]:
sections = ['See also', 'REFERENCES','External links', 'Further reading', 'Notes','Bibliography', 'Filmography%', 'Timeline%', 'Statistics%', '%2020',
            '%2021', 'Release history', 'Charts', 'Gallery'] # , 'Timeline', 'Statistics', '2020', '2021']
#page LIKE 'http://en.wikipedia.org/wiki/COVID-19_pandemic_in%')
#AND page NOT LIKE '%Timeline%' AND page NOT LIKE '%/Responses_to%']

def verify2(c):
    
    for l in sections:
        #print(l, c.lower)
        if l.lower() in c.lower():
            return False
    return True


#wikiPageTOC = wikiPageTOCDF.values.tolist() # 17034
lenT = len(wikiPageTOC)
tocSel = [wikiPageTOC[c] for c in range(lenT) if verify2(wikiPageTOC[c][5]) and 'Song' not in wikiPageTOC[c][7] and 'Statistics' not in wikiPageTOC[c][7] and 'Template:' not in wikiPageTOC[c][7]]
print(len(tocSel)) # 14878, 11806(sin 2020, 2021, Timeline y Statistics)


#wikiPageTOCDF['line'].str.contains("<i>", case=False)]

14878


In [36]:
wikiPageTOC2[20:30]
print(len(tocSel)) # 14972 a 14675 quitando temprates y statistics. 14661
tocSel[0:1]

14878


[[0,
  '2021-12-02',
  66453368,
  1,
  2,
  'Reactions and measures in Africa',
  '1',
  '1',
  'Responses_to_the_COVID-19_pandemic_in_April_2021',
  'Reactions_and_measures_in_Africa']]

<b> c. Cleaning sections and pages </b>

Identify sections of the pages with textual content:

In [12]:

wikiPageTOC_Filtered = [] # tocSelected
wikiPageTOC_Grouped  = [] # tocGrouped
wikiPageTOC_Aux  = [] # tocAux
wikiTOCByPage = []  # tocCpages *** 
wikiPageSections = []
wIKIPAGESECTIONCONTENT  = [] # tocContent  ****
#wIKIPAGESECTIONLINKS  = [] # tocContentLinks *****
wikiPageSectionLinks = []
wikiPageError = []

## 3.  Get content from each section

In [11]:
#tocSelDF = pd.DataFrame(tocSel, columns= ['i', 'date', 'wikipageID', 'toclevel', 'level', 'line', 'number', 'index', 'fromtitle', 'anchor'])


In [13]:
# Get section:
#from bd import BDdatos
from datetime import date
import wikipedia
from bs4 import BeautifulSoup
import re

#db = BDdatos()
#con = db.conectar()

def getPageID(l1, l2):
    print("Downloading data from Wikipedia:")
    if l1:
        idP = tocSel[l1][2]
    else:
        idP = tocSel[0][2]
    toc = []
    for l in range(l1, l2): #range(len(tocSel)):
        if idP != tocSel[l][2]:
            print(l, '>>', idP)
            #print(toc)
            getSection(idP, toc)
            toc = []
        toc.append(tocSel[l][5])
        idP = tocSel[l][2]
            #print(tocSel[l][1], toc)    

def getSection(pageid, toc):
    try:
        page = wikipedia.WikipediaPage(pageid=str(pageid))
        print(page.url)
    
        # Part 2: extraction of the content by sections:
        
        List_contentByTOC = []  
        t = 0
        head = 0
        c = 0
        if len(toc) > 0:
            contentByTOC = toc[t] # Primera sección
            lenTOC = len(toc)  # menos 1
        else:
            contentByTOC = 'head'
            lenTOC = len(toc) + 1 # sin más
        #print(lenTOC, toc)
        content = page.content
        splitText = []
        a, *l = re.split('==+', content) 
        if len(a) > 1:
            splitText.append(a)
        splitText = splitText + l 
    
        # Clean from empty strings:
        splitText = [x for x in splitText if x.strip() and x != '\n\n\n' and x != '\n\n']
        #print(splitText)
        if contentByTOC != splitText[c].strip().replace('=', ''): # 
            List_contentByTOC.append(['head', 0, splitText[c].strip()])
            head = 1
            c = c + 1
        if len(toc) > 1:
            m = c
        #print(toc)
        #print('---------------------------')

            r = c
        #print(len(splitText))
            while t < lenTOC:
            
            # desde aquí:
                match = False
                contentByTOC = toc[t] # here toc[t+1], toc[t] (cmabio2)
            
                while not(match):
                    #print(m, contentByTOC)
                    #print(splitText[m].strip())
                    if verify2(splitText[m].strip()): # verify2(splitText[m].strip().replace(',', '-')):
                
                        if toc[t].strip().replace(',', '-') == splitText[m].strip().replace(',', '-') and splitText[m+1].strip() not in toc: # here: m + 1, splitText[m] (cmabio2)
                            List_contentByTOC.append([contentByTOC, t, splitText[m+1].strip().replace('=\n', '')]) # here: m + 1, m (cmabio2)
                            t = t + 1
                            match = True
                        else:
                            #print(m, contentByTOC, splitText[m].strip().replace(',', '-')) # .replace(',', '-'))
                            if toc[t].strip().replace(',', '-') == splitText[m].strip().replace(',', '-') and splitText[m+1].strip() in toc: 
                                match = True
                                t = t + 1
                        
                    m = m + 1
                    #print(4*'-', splitText[m].strip())
                #print(List_contentByTOC)
            # hasta aquí:
        
        # Insert into the DB:
        for l in List_contentByTOC:
            wikiPageSections.append([date.today().strftime("%Y-%m-%d"), pageid, l[0], l[1], l[2]])
        
        # Part 3: getting urls by section:
        List_linksByTOC = []
        html = page.html()
        spllitp = html.split('title="Edit section: ')
        ls = len(toc) + 1
        li = 0
        if head:
            soupS = BeautifulSoup(spllitp[0],"html.parser")
            links = soupS.find_all(name='a')
            li = 1
            for i in links:
                try:
                    url = i['href']
                    if 'wiki' in url and 'File:' not in url and 'Wikipedia:' not in url and 'Help:' not in url:
                        List_linksByTOC.append(['head', 0, i.text, url])
                except:
                    continue
        for s in range(li, ls):
            #print('\n\n\n >>>>>', s)
            soupS = BeautifulSoup(spllitp[s],"html.parser")
            links = soupS.find_all(name='a')
            sec = toc[s-1] # *** [0]
            for i in links:
                try:
                    url = i['href']
                    if 'wiki' in url and 'File:' not in url and 'Wikipedia:' not in url and 'Help:' not in url:
                        List_linksByTOC.append([sec, s, i.text, url])
                except:
                    continue
        #print([row for row in List_linksByTOC[220:250]])
        
        unique_linksByTOC = [list(item) for item in set(tuple(row) for row in List_linksByTOC)] #  [list(item) for item in set(tuple(row) for row in List_linksByTOC)]
    
        for l in unique_linksByTOC:
            #print(l[0], l[1], l[2], l[3])
            wikiPageSectionLinks.append([date.today().strftime("%Y-%m-%d"), pageid, l[0], l[1], l[2], l[3]])
    #print(List_contentByTOC)
    except:
        print("Error")
        wikiPageError.append([date.today().strftime("%Y-%m-%d"), pageid, "Page error", page.url])
        #continue


# To RUN:                
#Only for test: getSection(1, "High_Point_Market", "High Point Market", ['History', 'Furniture Cluster/Niche'])



In [20]:
getPageID(14860, 14878) # 53 (problema). 57:101, 437, 490  14675  # 384,393 >> 65225706, https://en.wikipedia.org/wiki/OK_Not_to_Be_OK
#wikiPageSections[0:22]
#wikiPageSections[37:44]
#tocSel[354:384] # 63346138


#329 >> 68389568 >> https://en.wikipedia.org/wiki/Love_is_not_tourism

#329 >> 63346121 >> COVID-19_pandemic_in_Peru

#len(tocSel) # 14972

# 589 >> 63178636, https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Taiwan, 42 sections

# errores:
# 66 >> 63648639 >> https://en.wikipedia.org/wiki/Statistics_of_the_COVID-19_pandemic_in_Brazil
#1056 >> 64008008 >> https://en.wikipedia.org/wiki/2020_PDC_Home_Tour_Play-Offs
# 1148 >> 63300653 >> https://en.wikipedia.org/wiki/COVID-19_pandemic_in_South_Africa
# 1444 >> 62750956 >> https://en.wikipedia.org/wiki/COVID-19_pandemic
# 1521 >> 65682288 >> https://en.wikipedia.org/wiki/2020_PDC_Home_Tour_3
# 1599 >> 48410011 >> https://en.wikipedia.org/wiki/2020_United_States_presidential_election
# 1611 >> 68076170 >> https://en.wikipedia.org/wiki/Proximity_chat

Downloading data from Wikipedia:
14869 >> 63634124
https://en.wikipedia.org/wiki/COVID-19_pandemic_on_Grand_Princess


In [23]:
len(wikiPageError)
wikiPageError[0:5]

[['2021-12-03',
  63648639,
  'Page error',
  'https://en.wikipedia.org/wiki/Statistics_of_the_COVID-19_pandemic_in_Brazil'],
 ['2021-12-03',
  63272917,
  'Page error',
  'https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Sri_Lanka'],
 ['2021-12-03',
  65060446,
  'Page error',
  'https://en.wikipedia.org/wiki/Artificial_crowd_noise'],
 ['2021-12-03',
  64008008,
  'Page error',
  'https://en.wikipedia.org/wiki/2020_PDC_Home_Tour_Play-Offs'],
 ['2021-12-03',
  63300653,
  'Page error',
  'https://en.wikipedia.org/wiki/COVID-19_pandemic_in_South_Africa']]

## 4.  Get information about links included in the section's content (from Wikidata)

<b> a. Identify unique links to get data </b>

In [None]:
from bd import BDdatos

db = BDdatos()
con = db.conectar()


annLinkIDs = 'WikiAnnotationLinksIDs'  # Unique links
annMetadata = 'WikiAnnotationMetadata'

cur = con.cursor()

# Get unique links
cur.execute("""CREATE TABLE """ + annLinkIDs + """  (id INT AUTO_INCREMENT PRIMARY KEY) AS 
SELECT DISTINCT link, count(*) AS freq FROM """ + tocContentLinks + """
WHERE link LIKE '%/wiki/%' GROUP BY link HAVING count(*) > 3; """)


# Create table for metadata:

cur.execute("""CREATE TABLE """ + annMetadata + """ (
  `date` date,
  `link` TEXT CHARACTER SET utf8,
  `url` TEXT CHARACTER SET utf8,
  `title` TEXT CHARACTER SET utf8,  
  `pageid` VARCHAR(20) CHARACTER SET utf8 DEFAULT NULL,
  `revid` VARCHAR(20) CHARACTER SET utf8 DEFAULT NULL,
  lastUpdate date,
  `summary` LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci
) ENGINE=INNODB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;""")

db.saveDB(con)
db.closeDB(con)

In [None]:
annLinkIDs = 'WikiAnnotationLinksIDs'  # Unique links
annMetadata = 'WikiAnnotationMetadata'
URL = "https://en.wikipedia.org/w/api.php"
S = requests.Session()
def getLastUpdate(pageid):
    pageid = str(pageid)
    PARAMS = {'action':'query',
        'pageids': pageid,
        'prop':'revisions',
        'rvprop':'ids|flags|timestamp|userid|user|size|comment|tags',
        'rvlimit':'100',
        "format": "json"
    }
    req = S.get(url=URL, params=PARAMS)
    data = req.json()
    date = data["query"]['pages'][pageid]['revisions'][0]['timestamp'] 
    return date 





<b> b. Gathering the metadata from Wikipedia </b>

In [None]:
from bd import BDdatos
from datetime import date
import wikipedia
from bs4 import BeautifulSoup
import re
import urllib
import requests

db = BDdatos()
con = db.conectar()

def getAnnotationID(l1, l2):
    cur = con.cursor()
    print("Downloading data:")
    cur = con.cursor() # annLinkIDs 
    cur.execute("""SELECT distinct id, link FROM """ + annLinkIDs + """ where id >""" +str(l1)+' and id <= ' + str(l2) + ' order by id')
    rowList = cur.fetchall()
    for row in rowList:
        idP = row[0]
        urlP = 'http://en.wikipedia.org'+row[1]
        title = urllib.parse.unquote(row[1].replace('/wiki/', ''))
        print(idP, urlP, title)
        if title:
            getPageMetadata(idP, urlP, title, 1)


def getPageMetadata(idP, urlP, title, state):
    try:
        page = wikipedia.WikipediaPage(title=title) 

        ################################################################
        # Part 1: Metadata
        summary = page.summary
        revision_id = page.revision_id
        pageid = page.pageid
        lastUpdate = getLastUpdate(pageid)
        lastUpdate = lastUpdate[0:10]
        db.WikiAnnotationMetadata(db, date.today().strftime("%Y-%m-%d"), urlP, page.url, page.title, pageid, revision_id, lastUpdate, summary, annMetadata)
    except: 
        print("Annotation not found")
        db.WikiPageError(db, date.today().strftime("%Y-%m-%d"), urlP, "Annotation not found", 'WikiPageError')


# TO RUN:
getAnnotationID(9637, 11867)



<b> c. Integrating data </b>

In [None]:
from bd import BDdatos

db = BDdatos()
con = db.conectar()


#annLinkIDs = 'WikiAnnotationLinksIDs'  # Unique links
#annMetadata = 'WikiAnnotationMetadata'

cur = con.cursor()

# To create index:
cur.execute("""ALTER TABLE """ + tocContentLinks + """ ADD INDEX (`page` (100));""")
cur.execute("""ALTER TABLE """ + annMetadata + """ ADD INDEX (`link` (100));""")


db.saveDB(con)
db.closeDB(con)

## 5.  Filter data not related to COVID-19

### 9.2  Get content from data repository

### Buscar las menciones dentro de las secciones/párrafos (phrase matching)

Ahora si lo que quieres es buscar las menciones dentro de las secciones/párrafos, podrías hacer phrase matching basandote en un gazetteer (= lista de términos relevantes para covid), por ejemplo usando spacy:


In [None]:
# Crear table:

from bd import BDdatos
db = BDdatos()
con = db.conectar()

tableCovid = 'WikiPageSectionCOVID'
tableContent = "WIKIPAGESECTIONCONTENT"
pageIDs = "WikiPagesIDs"

cur = con.cursor()


cur.execute(u"""ALTER TABLE """ + tableContent + """ ADD `idpage` INT  NULL  DEFAULT NULL  AFTER `page`;""")

cur.execute(u"""UPDATE """ + tableContent + """ c LEFT OUTER JOIN """ + pageIDs + """  p
ON c.page = p.wikipageUrl SET c.idpage = p.id""")

cur.execute(u"""CREATE TABLE """ + tableCovid + """ (`date` date,
     page TEXT, section TEXT, idSection INT, idWikidata VARCHAR(50),
     startPos INT, endPos INT, textMatch TEXT);""")

db.saveDB(con)
db.closeDB(con)

In [None]:
tableCovid = 'WikiPageSectionCOVID'
tableContent = "WIKIPAGESECTIONCONTENT"
pageIDs = "WikiPagesIDs"
mtable = 'WikiPageMetadata'

In [None]:
COVIDGazetteer

In [None]:
<b> Requisite </b>: Hasta este punto ya debió estar creado el Gazetter:

In [None]:
# addingphrase matching

# python -m spacy download en_core_web_sm -> to update  to 2.3.1 version of en_core_web_sm



# Subir archivo de Gazetteer:
import pandas as pd

path = "../QA/data/COVIDGazetter28Feb.csv"
COVIDGazetteer = pd.read_csv(path, sep = "|")
terms = [t for t in COVIDGazetteer["o"]]

#s = COVIDGazetteer[COVIDGazetteer['o'].str.lower() == "coronavirus"].iloc[0]["s"]
#s
#for text in terms:
#    if "COVID-19 pandemic" in text:
#        print(text)


from bd import BDdatos
import codecs
import json
from datetime import date
import spacy
from spacy.matcher import PhraseMatcher

db = BDdatos()
con = db.conectar()

nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab, attr="LOWER") # First, we initialize the Matcher with a vocab. The matcher must always share the same vocab with the documents it will operate on. We can now call matcher.add() with an ID and our custom pattern. The second argument lets you pass in an optional callback function to invoke on a successful match. For now, we set it to None.

# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)

tablecontent = 'WIKIPAGESECTIONCONTENT' # _IRQA_FRAGMENTS_FINAL

def getWikipediaFragments(l1, l2):
    cur = con.cursor()
    print("Descargando datos:")
    cur = con.cursor()
    cur.execute("""SELECT DISTINCT idpage, page, sectionID, section section_ori, 
    REPLACE(section, '/', '_') section, sectionId, secText FROM """ + tablecontent + """ WHERE secText != '' AND idpage >""" +str(l1)+' and idpage <= ' + str(l2))
    rowList = cur.fetchall()
    for row in rowList:
        idP = row[0]
        page = row[1]
        idSection = row[2]
        section_ori = row[3]
        section = row[4]
        idSec = row[5]
        doc = row[6]
        doc = nlp(doc)
        matches = matcher(doc)
        print(idP, page, section_ori)
        for match_id, start, end in matches:
            span = doc[start:end]
            if not(COVIDGazetteer[COVIDGazetteer['o'].str.lower() == span.text.lower()].empty):
                idWikidata = COVIDGazetteer[COVIDGazetteer['o'].str.lower() == span.text.lower()].iloc[0]["s"]
                #print(page, section_ori, idSection, idWikidata, start, end, span.text) # The matcher returns a list of (match_id, start, end) tuples – in this case, [('15578876784678163569', 0, 3)], which maps to the span doc[0:3] of our original document. The match_id is the hash value of the string ID “HelloWorld”. To get the string value, you can look up the ID in the StringStore
                db.WikiPageSectionCOVID(db, date.today().strftime("%Y-%m-%d"), page, section_ori, idSection, idWikidata, start, end, span.text, tableCovid)
            
# TO RUN:
getWikipediaFragments(239, 2875) # 2875



<b> Filter page sections' with covid terms </b>

In [None]:
from bd import BDdatos
db = BDdatos()
con = db.conectar()

tablefilter = 'WikiPageSectionCOVIDFilter'
tableFinal = "IRQA_FRAGMENTS_FINAL"

cur = con.cursor()

cur.execute(u"""CREATE TABLE """ + tablefilter + """ AS
     SELECT PAGE, IDSECTION, COUNT(DISTINCT idwikidata) AS freqcovid FROM 
     """ + tableCovid + """ GROUP BY PAGE, IDSECTION;""")
    

cur.execute(u"""CREATE TABLE """+ tableFinal + """ AS
    SELECT DISTINCT S.*, lastUpdate FROM """ + tablefilter + """ F
    INNER JOIN """ + tableContent + """ s  
    ON f.page = s.page AND f.idsection = s.sectionid
    LEFT OUTER JOIN """ + mtable + """ M
    ON f.page = m.page;""")
            
db.saveDB(con)
db.closeDB(con)

<b> Filter content depending of the classification </b>

In [None]:
from bd import BDdatos
import time, random
db = BDdatos()
con = db.conectar()

from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("/Users/jachicaiza/envs/pyenv/coronabert")
model = AutoModelForSequenceClassification.from_pretrained("/Users/jachicaiza/envs/pyenv/coronabert")

# Retrieve data

# Load Data
import pandas as pd 
from scipy.special import expit     # Use scipy to run a logistic function on the scores (to scale them from 0 to 1)z
from datetime import date

#closeDB(con)
cursor=con.cursor()
cursor.execute(u"""SELECT DISTINCT source, id, page, category, stext FROM QA_DATA_ALL_FINAL 
ORDER BY source, id;""")
areas = cursor.fetchall()
areas = pd.DataFrame(areas)
areas.columns=['source', 'id', 'page', 'category', 'stext']
    
areasT = []

areas.loc[333:334, ["id", "stext"]]
len(areas)

#for i in areas.loc[0:334, ["id", "stext"]].iterrows():
#    print(i[1][0])
#    text = [i[1][1]]
#    print(text)

# Create some input text. This would normally be title plus
# abstract (where available) in a single string
#text = list(areas.loc[0:1, "abstract"])



In [None]:
for i in areas.loc[0:16100, ['source', 'id', 'page', 'category', 'stext']].iterrows():
    print(i[1][0], i[1][1])
    text = [i[1][4]]
    #print(text)
    # Tokenize it with appropriate padding and truncation
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=500)
    # Run the sequence classication model
    outputs = model(**inputs)
    # Pull out the scores for the first article
    #scores = outputs.logits[0,:].tolist() # new version of transformers
    scores = outputs[0][0,:].tolist()
    scores = expit(scores) # outputs
    # Get the associated labels from the model config and sort them to show the
    # highest scoring topic / article type
    scores_with_labels = [ (s,model.config.id2label[i]) for i,s in enumerate(scores) ]
    #sorted(scores_with_labels, reverse=True)
    ranking = sorted(scores_with_labels, reverse=True)
    n = 0
    time.sleep(6+random.uniform(0, 8))
    #print(ranking)
    for r in ranking:
        n = n + 1
        areasT.append([i[1][0], i[1][1], i[1][2], i[1][3], n, r[0], r[1]])



In [None]:
import csv 

len(ranking)

fields = ['source', 'id', 'page', 'category', 'position', 'score', 'subject']

with open('Transformer04abr.csv', 'w') as f:
    write = csv.writer(f) 
    write.writerow(fields) 
    write.writerows(areasT)

db.saveDB(con)
db.closeDB(con)

### 9.3 Convert  Wikipedia fragments to a text corpus

In [None]:
from bd import BDdatos
import codecs
import json
from datetime import date

db = BDdatos()
con = db.conectar()
text_path = '../QA/corporaCOVID'

def convertToDoc(l1, l2):
    cur = con.cursor()
    print("Download data:")
    cur = con.cursor()
    cur.execute("""SELECT DISTINCT idpage, page, REPLACE(REPLACE(section, '/', '_'), '#', '_') section,
    sectionId, concat(REPLACE(REPLACE(section, '/', '_'), '#', '_'), '\n', secText) FROM """ 
    + tableFinal + """ WHERE secText != '' AND idpage > """ +str(l1)+ ' and idpage <= ' + str(l2))
    rowList = cur.fetchall()
    for row in rowList:
        idP = row[0]
        page = row[1]
        section = row[2]
        idSec = row[3]
        doc = row[4]
        name = str(idP)+"_"+section+".txt"
        print(name)
        file = open(text_path+"/"+name, 'w')
        try:
            file.write(doc)
        finally:
            file.close()
            
# To run:
convertToDoc(0, 2875)

### Agregar respuestas de los QA pairs,

Lo nterior hay que actualizar con la nueva data extraída y correr nuevamente: 

In [None]:
from bd import BDdatos
import codecs
import json
from datetime import date

db = BDdatos()
con = db.conectar()
text_path = '../QA/corporaCOVID'

tpairsQA = 'QAPairsFINAL'

def getWikipediaFraments(l1, l2):
    cur = con.cursor()
    print("Descargando datos:")
    cur = con.cursor()
    cur.execute("""SELECT DISTINCT id, url, source, sourceDate, concat(question, '\n', categories, '\n', answer) text
    FROM """ + tpairsQA + """ WHERE id >""" +str(l1)+' and id <= ' + str(l2))
    rowList = cur.fetchall()
    for row in rowList:
        idP = row[0]
        page = row[1]
        source = row[2]
        sourceDate = row[3]
        doc = row[4]
        name = str(idP)+"_"+source+".txt"
        print(name)
        file = open(text_path+"/"+name, 'w')
        try:
            file.write(doc)
        finally:
            file.close()
            
getWikipediaFraments(0, 1500)

<b> Join data from all the sources </b>

In [None]:
from bd import BDdatos
db = BDdatos()
con = db.conectar()

tableUnion = "QA_DATA_ALL_FINAL"
tableQA = "QAPairsFINAL"

cur = con.cursor()

cur.execute(u"""CREATE TABLE """ + tableUnion + """ AS
  SELECT 'WIKIPEDIA' source, idpage id, page, section category, secText stext, lastUpdate, 'section' typeS
  FROM """ + tableFinal + """  UNION
  SELECT DISTINCT source, id, url, categories, concat(question, '|', answer), sourceDate, 'QA' typeS
  FROM """ + tableQA +  """;""")
            
db.saveDB(con)
db.closeDB(con)

In [None]:
### 9.4 Indexing

#### Start the service:

brew services start elasticsearch

In [None]:
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

* Document Store

FAISS is a library for efficient similarity search on a cluster of dense vectors. The FAISSDocumentStore uses a SQL(SQLite in-memory be default) database under-the-hood to store the document text and other meta data. The vector embeddings of the text are indexed on a FAISS Index that later is queried for searching answers.

In [None]:
from haystack.document_store.faiss import FAISSDocumentStore

document_store = FAISSDocumentStore(sql_url= "sqlite:///repoQA.db", faiss_index_factory_str="Flat", return_embedding=True) # The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor. 



In [None]:
* Cleaning documents

Here to convert and index Wikipedia articles to our DocumentStore.

Haystack provides a customizable pipeline for: converting files into texts, cleaning texts, splitting texts, writing them to a Document Store

In [None]:
# Let's first get some files that we want to use

#document_store =  ElasticsearchDocumentStore()
text_path = "../QA/corporaCOVID"

# Convert files to dicts

dicts = convert_files_to_dicts(dir_path= text_path, clean_func=clean_wiki_text, split_paragraphs=True)

print(dicts[0].keys()) # dict_keys(['text', 'meta'])
dicts[0]['meta'] # "name"



* Document Store

Haystack finds answers to queries within the documents stored in a DocumentStore. The current implementations of DocumentStore include ElasticsearchDocumentStore, FAISSDocumentStore, SQLDocumentStore, and InMemoryDocumentStore.

Here: We recommended Elasticsearch as it comes preloaded with features like full-text queries, BM25 retrieval, and vector storage for text embeddings.

Alternatives: If you are unable to setup an Elasticsearch instance, then follow the Tutorial 3 for using SQL/InMemory document stores.

Hint: This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can configure Haystack to work with your existing document stores.

In [None]:
# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(dicts)

* Retriever

Here: We use a DensePassageRetriever
Alternatives:
    
* The ElasticsearchRetriever with custom queries (e.g. boosting) and filters
* Use EmbeddingRetriever to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)
* Use TfidfRetriever in combination with a SQL or InMemory Document store for simple prototyping and debugging


Retrievers help narrowing down the scope for the Reader to smaller units of text where a given question could be answered. They use some simple but fast algorithm.

Here: We use Elasticsearch's default BM25 algorithm

* Use DensePassageRetriever to use different embedding models for passage and query (see Tutorial 6)

Note, that DensePassageRetriever DPR works best when you index short passages < 512 tokens as only those tokens will be used for the embedding.

* Here, for nq_dev_subset_v2.json we have avg. num of tokens = 5220(!).
* DPR still outperforms Elastic's BM25 by a small margin here.
* from haystack.retriever.dense import DensePassageRetriever

In [None]:
from haystack.retriever.dense import DensePassageRetriever

# retriever = ElasticsearchRetriever(document_store=document_store)
# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store.

# from haystack.retriever.sparse import TfidfRetriever
# retriever = TfidfRetriever(document_store=document_store)

max_len_query= 64 # 84
max_len_passage = 256

retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=max_len_query,
                                  max_seq_len_passage=max_len_passage,
                                  batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
                                  use_fast_tokenizers=True)
# Important: 
# Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation. 
# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. 
# At query time, we only need to embed the query and compare it the existing doc embeddings which is very fast.

document_store.update_embeddings(retriever)

In [None]:
# Save to disc:

document_store.save("../QA/data/DocStore")

# Testing with vectors of quora similar but not run

In [None]:
from haystack.document_store.faiss import FAISSDocumentStore

document_storeS = FAISSDocumentStore(sql_url= "sqlite:///repoQASim.db", faiss_index_factory_str="Flat", return_embedding=True) # The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for faster search at the expense of some accuracy. Just set the faiss_index_factor_str argument in the constructor. 

# Let's first get some files that we want to use

#document_store =  ElasticsearchDocumentStore()
text_path = "../QA/corporaCOVID"

# Convert files to dicts

dicts = convert_files_to_dicts(dir_path= text_path, clean_func=clean_wiki_text, split_paragraphs=True)

print(dicts[0].keys()) # dict_keys(['text', 'meta'])
dicts[0]['meta'] # "name"

# Now, let's write the dicts containing documents to our DB.
document_storeS.write_documents(dicts)

In [None]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModel
from transformers import DistilBertTokenizerFast
import os
import csv
import pickle
import time
import torch

model_name = "distilbert-base-nli-stsb-quora-ranking" # 'distilbert-multilingual-nli-stsb-quora-ranking'
#model = SentenceTransformer(model_name)



#tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") #"mrm8488/t5-base-finetuned-squadv2")
#model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-squadv2")

max_corpus_size = 100000
embedding_cache_path = 'covid-embeddings-{}-size-{}.pkl'.format(model_name.replace('/', '_'), max_corpus_size)

embedding_cache_path

#model = AutoModel.from_pretrained(embedding_cache_path)
print("Load pre-computed embeddings from disc")
with open(embedding_cache_path, "rb") as fIn:
    cache_data = pickle.load(fIn)
    corpus_sentences = cache_data['sentences'][0:max_corpus_size]
    corpus_embeddings = cache_data['embeddings'][0:max_corpus_size]
###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))

#model = SentenceTransformer(embedding_cache_path)

#model.load_state_dict(torch.load(embedding_cache_path))

#model.build_vocab(corpus_sentences, tokenize=True)

model = AutoModel.from_pretrained("../../PLNmodels/distilbert-base-nli-stsb-quora-ranking")
              



In [None]:
max_len_query= 64 # 84
max_len_passage = 256

retrieverQ = DensePassageRetriever(document_store=document_storeS,
                                  query_embedding_model= "../../PLNmodels/distilbert-base-nli-stsb-quora-ranking",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=max_len_query,
                                  max_seq_len_passage=max_len_passage,
                                  batch_size=16,
                                  use_gpu=True,
                                  embed_title=True,
                                  use_fast_tokenizers=True)
# Important: 
# Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation. 
# While this can be a time consuming operation (depending on corpus size), it only needs to be done once. 
# At query time, we only need to embed the query and compare it the existing doc embeddings which is very fast.




In [None]:
document_storeS.update_embeddings(retriever)

In [None]:
# Save to disc:

document_storeS.save("../QA/data/DocStoreSim")