# Práctica 9
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

## Relaciones sintagmáticas con BM25
**Instrucciones:**
    Utilizar derivación (stemming) y el modelo BM25, pero esta vez con etiquetado POS, y obtener la similitud entre los vectores del modelo BM25 utilizando las etiquetas POS asociadas a una determinada palabra.

In [1]:
import nltk
import numpy
import math
from nltk.stem import SnowballStemmer

In [2]:
def cleanHtml(txt):
    """Return the visible text of an HTML document, lower-cased.

    The markup is parsed with BeautifulSoup using the 'lxml' parser and
    only the text content is kept.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(txt, 'lxml')
    plain = soup.get_text()
    return plain.lower()

In [3]:
def splitText(txt):
    """Replace every '/', '.' and '-' in ``txt`` with a single space."""
    for separator in ('/', '.', '-'):
        txt = txt.replace(separator, ' ')
    return txt

In [4]:
def deleteTrash(txt):
    """Drop every character that is not a lowercase letter, space or newline.

    Kept characters are a-z, the accented letters in "áéíóúñü", the space
    and '\\n'. Everything else (digits, punctuation, uppercase) is removed.

    Args:
        txt: Text to filter; it is iterated character by character.

    Returns:
        A new string holding only the allowed characters, in order.
    """
    allowed = set("abcdefghijklmnopqrstuvwxyz áéíóúñü\n")
    # join() builds the result in one pass instead of repeated concatenation.
    return "".join(ch for ch in txt if ch in allowed)

In [5]:
def deleteStopwords(txt):
    """Return the tokens of ``txt`` that are not NLTK stopwords.

    ``stopwords.words()`` is called without a language argument, exactly as
    before. NOTE(review): per the NLTK corpus reader this appears to combine
    the lists of every installed language, not only Spanish — confirm that
    this is intended.

    Args:
        txt: Iterable of (hashable) tokens.

    Returns:
        A list with the non-stopword tokens in their original order.
    """
    from nltk.corpus import stopwords

    # Build the lookup once as a set: membership tests become O(1) instead of
    # scanning the whole stopword list for every token.
    stop_set = set(stopwords.words())
    return [token for token in txt if token not in stop_set]

In [6]:
def getVocabulary(tokens):
    """Return the distinct tokens as a list in ascending order."""
    unique_tokens = list(set(tokens))
    unique_tokens.sort()
    return unique_tokens

In [7]:
def getContexts(wordList, windowSize=4):
    """Collect, for every distinct word, the words that surround it.

    For each position, the neighbours lying at most ``windowSize`` positions
    to the left or right (clipped to the list bounds) are appended to the
    context list of the word at that position. The word's own position is
    skipped. Every occurrence of a word contributes to the same list.

    Returns a dict mapping each word to its accumulated list of neighbours.
    """
    contexts = {}
    last_position = len(wordList) - 1
    for position, word in enumerate(wordList):
        first = max(0, position - windowSize)
        final = min(position + windowSize, last_position)
        neighbours = contexts.setdefault(word, [])
        neighbours.extend(
            wordList[i] for i in range(first, final + 1) if i != position
        )
    return contexts

In [8]:
def loadFromJson(name):
    """Parse and return the JSON stored in ``./files/<name>`` (read as UTF-8)."""
    import json

    path = "./files/" + name
    with open(path, 'r', encoding="utf8") as source:
        data = json.load(source)
    return data

In [9]:
def dumpToJson(obj, name):
    """Serialize ``obj`` as JSON (4-space indent) into ``./files/<name>``.

    The file is opened in write mode with UTF-8 encoding, so an existing
    file of the same name is overwritten.
    """
    import json

    path = "./files/" + name
    with open(path, 'w', encoding="utf8") as target:
        json.dump(obj, target, indent=4)

In [10]:
def replaceWithStems(tokens):
    """Return the Spanish Snowball stem of every token, in the same order."""
    stemmer = SnowballStemmer("spanish")
    return [stemmer.stem(token) for token in tokens]

In [11]:
def replacWithLemmas(tokens):
    """Map each token to its lemma, leaving unknown tokens unchanged.

    The lookup table is read with ``loadFromJson`` from the file named
    "lemmatized_tokens_generate".
    NOTE(review): unlike the other data files this name has no ".json"
    suffix — confirm that it matches the file on disk.
    """
    lemma_table = loadFromJson("lemmatized_tokens_generate")
    return [lemma_table.get(token, token) for token in tokens]

In [12]:
def getTagger():
    """Build the POS tagger used for tokens without a precomputed tag.

    The result is a unigram tagger trained on the tagged sentences of the
    ``cess_esp`` corpus. It backs off to a regular-expression tagger that
    labels words ending in o/os/a/as with 'n', which in turn backs off to
    a default tagger that labels everything else 's'.
    """
    noun_suffix_rules = [
        (r'.*o$', 'n'),   # noun, masculine singular
        (r'.*os$', 'n'),  # noun, masculine plural
        (r'.*a$', 'n'),   # noun, feminine singular
        (r'.*as$', 'n')   # noun, feminine plural
    ]
    fallback = nltk.DefaultTagger('s')
    suffix_tagger = nltk.RegexpTagger(noun_suffix_rules, fallback)
    training_sentences = nltk.corpus.cess_esp.tagged_sents()
    return nltk.UnigramTagger(training_sentences, None, suffix_tagger)

In [13]:
def getPosForToken(normalizedTokens):
    """Return one POS tag per token, in token order.

    A tag found in the "generatePOS.json" table takes precedence. Tokens
    missing from that table receive the tag assigned by the tagger from
    ``getTagger()``.
    """
    known_tags = loadFromJson("generatePOS.json")
    tagger = getTagger()

    auto_tagged = tagger.tag(normalizedTokens)
    return [
        known_tags[token] if token in known_tags else auto_tagged[position][1]
        for position, token in enumerate(normalizedTokens)
    ]

In [14]:
def mixTags(tokens, tokenTags):
    """Pair every token with the tag at the same index.

    Returns a list of ``(token, tag)`` tuples with one entry per token.
    Tags beyond ``len(tokens)`` are ignored.
    """
    # Indexing (rather than zip) keeps the IndexError when tags run short.
    return [(token, tokenTags[position]) for position, token in enumerate(tokens)]

In [15]:
# Read the raw HTML article. The context manager closes the file even when
# read() raises, which the previous open()/close() pair did not guarantee.
with open("./files/e961024.htm", "r", encoding="utf8") as f:
    org_txt = f.read()

# Strip the markup, turn separators into spaces and drop unwanted characters;
# then tokenize and remove stopwords.
text = deleteTrash(splitText(cleanHtml(org_txt)))
normalizedTokens = deleteStopwords(nltk.word_tokenize(text))

In [16]:
# Tags are computed on the unstemmed tokens, then attached to their stems so
# that every element of `tokens` is a (stem, tag) pair.
tokenTags = getPosForToken(normalizedTokens)
stematizedTokens = replaceWithStems(normalizedTokens)
tokens = mixTags(stematizedTokens, tokenTags)
# Optional: persist the tagged stems for later runs (disabled).
#dumpToJson(tokens, "stematized_tokens_snowball.json")

In [17]:
# Preview the first 100 (stem, tag) pairs.
print(tokens[:100])

[('emod', 's'), ('htm', 's'), ('http', 's'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('html', 's'), ('excelsior', 's'), ('editorial', 'NCMS000'), ('juev', 'NCMP000'), ('octubr', 'NCMS000'), ('epigram', 'NCMS000'), ('jorg', 'NP00000'), ('mansill', 'n'), ('torr', 'NCFP000'), ('critic', 'V0R02S0'), ('miami', 's'), ('herald', 's'), ('president', 'NCIS000'), ('ecuatorian', 'NCMS000'), ('autoproclam', 's'), ('loc', 'NCMS000'), ('neoliberal', 'n'), ('aplic', 'V0R02S0'), ('encomi', 'NCMS000'), ('hac', 'VIIP3S0'), ('mism', 'P00MP00'), ('pais', 'NCMS000'), ('manicomi', 'NCMS000'), ('editorial', 'NCMS000'), ('not', 'NCFS000'), ('siguient', 'AGIS000'), ('http', 's'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('html', 's'), ('excelsior', 's'), ('editorial', 'NCMS000'), ('juev', 'NCMP000'), ('octubr', 'NCMS000'), ('hungr', 'n'), ('rebelion', 'NCFS000'), ('antiestalin', 'n'), ('oscar', 's'), ('gonzalez', 's'), ('lopez', 's'), ('curs', 'NCMS000'), ('octubr', 'NC

In [18]:
# Map every tagged stem to its neighbours, using the default window size.
contexts = getContexts(tokens)

In [19]:
# Preview the first 100 (tagged stem, context list) entries.
print(list(contexts.items())[:100])

[(('emod', 's'), [('htm', 's'), ('http', 's'), ('www', 's'), ('excelsior', 's')]), (('htm', 's'), [('emod', 's'), ('http', 's'), ('www', 's'), ('excelsior', 's'), ('mx', 's')]), (('http', 's'), [('emod', 's'), ('htm', 's'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('manicomi', 'NCMS000'), ('editorial', 'NCMS000'), ('not', 'NCFS000'), ('siguient', 'AGIS000'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('noviembr', 'NCMS000'), ('editorial', 'NCMS000'), ('not', 'NCFS000'), ('siguient', 'AGIS000'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('huert', 'NCFS000'), ('editorial', 'NCMS000'), ('not', 'NCFS000'), ('siguient', 'AGIS000'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('horripil', 'AGIP000'), ('editorial', 'NCMS000'), ('not', 'NCFS000'), ('siguient', 'AGIS000'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('transform', 'NCFS000'), ('editorial', 'NCMS000'), ('not', 'NCFS000'), ('siguient', 'AGIS

In [20]:
# Sorted list of the distinct (stem, tag) pairs.
vocabulary = getVocabulary(tokens)
# Optional: reuse contexts saved by an earlier run (disabled).
#contexts = loadFromJson("contexts_stematized_snowball.json")

In [21]:
# Preview the first 100 vocabulary entries.
print(vocabulary[:100])

[('a', 's'), ('abad', 'NCMS000'), ('abaj', 'RG000'), ('abander', 'V0P00SF'), ('abandon', 'V0N0000'), ('abandon', 'V0P00SM'), ('abarc', 'V0N0000'), ('abascadocarr', 'n'), ('abast', 'NCMS000'), ('abastec', 'NCMS000'), ('abastec', 'V0IF3S0'), ('abastec', 'V0N0000'), ('abastec', 'V0SI3S0'), ('abat', 'V0N0000'), ('abat', 'V0SI3P0'), ('abiert', 'AGFP000'), ('abiert', 'AGFS000'), ('abiert', 'AGMS000'), ('abiert', 'RG000'), ('aboc', 'V0IF3S0'), ('abog', 'NCMS000'), ('abog', 'V0IS3P0'), ('abog', 'V0R02S0'), ('abord', 'V0IF1P0'), ('abord', 'V0IF3P0'), ('abord', 'V0R02S0'), ('abort', 'NCMS000'), ('abracadabr', 's'), ('abran', 'V0R03P0'), ('abraz', 'V0N0000'), ('abre', 'V0R02S0'), ('abren', 'V0IP3P0'), ('abri', 'V0II3P0'), ('abri', 'V0IS3S0'), ('abriend', 'V0G0000'), ('abrier', 'V0SI3S0'), ('abrieron', 'V0IS3P0'), ('abril', 'NP00000'), ('abrir', 'V0N0000'), ('abrum', 'V0P00SM'), ('abrupt', 'AGFS000'), ('absolut', 'AGMS000'), ('absolut', 'NCMS000'), ('absolut', 'RG000'), ('absolutiz', 'n'), ('absor

### Modelo BM25


In [22]:
def getFrequencyVectors(vocabulary, contexts):
    """Build a raw co-occurrence vector for every vocabulary entry.

    Args:
        vocabulary: Sequence of (hashable) terms; its order fixes the order
            of the vector components.
        contexts: Mapping from each term to the list of terms seen around it.

    Returns:
        A dict mapping each term to a list in which component ``i`` is the
        number of times ``vocabulary[i]`` occurs in that term's context.

    Raises:
        KeyError: If a vocabulary term has no entry in ``contexts``.
    """
    from collections import Counter

    freqVectors = dict()
    for target in vocabulary:
        # Count the context once per target instead of rescanning it with
        # list.count() for every vocabulary word.
        occurrences = Counter(contexts[target])
        freqVectors[target] = [occurrences[word] for word in vocabulary]
    return freqVectors

In [23]:
def getAvg(contexts):
    """Return the mean length of the context lists in ``contexts``.

    Args:
        contexts: Mapping from a word to its list of context words.

    Returns:
        The average list length, or 0.0 when ``contexts`` is empty
        (previously an empty mapping raised ZeroDivisionError).
    """
    if not contexts:
        return 0.0
    return sum(len(context) for context in contexts.values()) / len(contexts)

In [24]:
def getDocs(qi, contexts):
    """Count how many context lists in ``contexts`` contain ``qi``.

    Each context counts at most once, no matter how often ``qi`` appears
    in it.
    """
    return sum(1 for context in contexts.values() if qi in context)

In [25]:
def getIDF(N,q,contexts):
    """Return log((N - n + 0.5) / (n + 0.5)) for the term ``q``.

    ``n`` is the number of contexts that contain ``q``, as reported by
    ``getDocs``. The value is negative when ``n`` exceeds half of ``N``.
    """
    containing = getDocs(q, contexts)
    numerator = N - containing + 0.5
    denominator = containing + 0.5
    return math.log(numerator / denominator)
    

In [26]:
def getIdfMap(vocabulary, contexts):
    """Return a dict with the ``getIDF`` value of every vocabulary term.

    The vocabulary size is used as the collection size ``N``.
    """
    N = len(vocabulary)
    return {target: getIDF(N, target, contexts) for target in vocabulary}

In [27]:
def termFrequency(q,doc):
    """Return how many times ``q`` occurs in ``doc``, via ``doc.count(q)``.

    NOTE(review): no caller of this helper appears in the visible part of
    the file.
    """
    return doc.count(q)

In [28]:
def getScore(tf, mg, idf, avgdl, k , b):
    """Return the saturated term-frequency weight used for the BM25 vectors.

    Computes ``tf*(k+1) / (tf + k*(1 - b + b*mg/avgdl))``, where ``mg`` is
    the length of the current context and ``avgdl`` the average length.

    NOTE(review): ``idf`` is accepted but never used in the result — confirm
    whether the score was meant to be multiplied by it.
    """
    numerator = tf*(k+1)
    length_ratio = (b*mg)/avgdl
    denominator = tf+k*( 1-b+length_ratio )
    return numerator / denominator

In [29]:
def getCos(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Only the first ``len(v1)`` components of ``v2`` are read. When either
    vector has zero squared norm the result is the integer 0.
    """
    dot = 0
    sq1 = 0
    sq2 = 0
    # Plain running totals are used so that the float rounding stays
    # unchanged.
    for i in range(len(v1)):
        a = v1[i]
        c = v2[i]
        dot += (a*c)
        sq1 += (a*a)
        sq2 += (c*c)

    norm1 = math.sqrt(sq1)
    norm2 = math.sqrt(sq2)

    if sq1 == 0 or sq2 == 0:
        return 0
    return dot/(norm1*norm2)

In [30]:
def getBM25(vocabulary, contexts):
    """Turn the raw co-occurrence vectors into BM25-style weight vectors.

    Uses the fixed parameters k = 1.5 and b = 0.75. For each vocabulary
    term, every component of its frequency vector goes through
    ``getScore`` together with the length of that term's context and the
    average context length.

    Returns a dict mapping each vocabulary term to its weighted vector.
    """
    k, b = 1.5, 0.75

    idfMap = getIdfMap(vocabulary, contexts)
    freqVectors = getFrequencyVectors(vocabulary, contexts)
    averageLength = getAvg(contexts)

    weighted = {}
    for term in vocabulary:
        frequencies = freqVectors[term]
        contextLength = len(contexts[term])
        weighted[term] = [
            getScore(frequency, contextLength, idfMap[term], averageLength, k, b)
            for frequency in frequencies
        ]
    return weighted

In [31]:
def getSimilarsTo(target, vectors):
    """Rank the tagged tokens by cosine similarity to ``target``.

    ``target`` is reduced to its Spanish Snowball stem. Every key of
    ``vectors`` whose first element equals that stem is one POS variant of
    the word. Each token's score is the sum of its cosine similarities to
    those variants; only cosines greater than 0 are counted.

    The candidates are now taken from ``vectors`` itself. The previous
    version looped over the module-level ``vocabulary``, a hidden
    dependency on a name that is not an argument.

    Args:
        target: Unstemmed word to look up.
        vectors: Mapping from (stem, tag) keys to numeric vectors.

    Returns:
        A list of ``(score, token)`` tuples sorted in descending order.
        It is empty when no key matches the stem.
    """
    stem = SnowballStemmer("spanish").stem(target)
    # One vector per POS variant of the stem.
    targetVectors = [
        vector for taggedToken, vector in vectors.items() if taggedToken[0] == stem
    ]

    similarity = dict()
    for v1 in targetVectors:
        for word, v2 in vectors.items():
            cos = getCos(v1, v2)
            if cos > 0.0:
                similarity[word] = similarity.get(word, 0) + cos

    return sorted(((cos, token) for token, cos in similarity.items()), reverse=True)

In [32]:
# Weighted context vector for every tagged stem in the vocabulary.
bm25 = getBM25(vocabulary, contexts)

In [34]:
# Rank every tagged stem by its similarity to the word "acero".
result = getSimilarsTo("acero", bm25)

In [35]:
# Print the 100 best matches, one per line, as "score, token".
for r in result[:100]:
    print(str(r[0]) + ", " + str(r[1]))

1.0000000000000002, ('acer', 'NCMS000')
0.7500000000000001, ('traslad', 'V0P00SM')
0.5303300858899107, ('cabl', 'NCMS000')
0.5000000000000001, ('cobr', 'NCMS000')
0.37500000000000006, ('tub', 'NCMP000')
0.37500000000000006, ('microproces', 's')
0.37500000000000006, ('desmaterializ', 'n')
0.3535533905932738, ('vaci', 'NCMS000')
0.25000000000000006, ('sign', 'NCMP000')
0.25000000000000006, ('penetr', 'NCFS000')
0.25000000000000006, ('ilog', 'AGMS000')
0.25000000000000006, ('compromet', 'V0R02S0')
0.25000000000000006, ('cabl', 'NCMP000')
0.20962650400890429, ('depresion', 'NCFS000')
0.17788748670848886, ('cre', 'V0R03P0')
0.1767766952966369, ('visibl', 'AGIP000')
0.1767766952966369, ('finaliz', 'V0N0000')
0.1767766952966369, ('evit', 'V0G0000')
0.17376954165389288, ('cubr', 'V0N0000')
0.17376954165389283, ('desplom', 'NCMS000')
0.16639502721915675, ('paralel', 'RG000')
0.16184608884186205, ('sosten', 'aq0cs0')
0.16184608884186205, ('inapropi', 'n')
0.16184608884186205, ('depreci', 'V0R03S

Como podemos ver, el uso de etiquetado POS ha mejorado muchísimo el modelo BM25, dando resultados mucho más acertados y realistas respecto al uso de la palabra «acero».