# Practica 6
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

## Similaridad con snowball-stem y el modelo de recuperación BM25
**Instrucciones:**
    Con lo visto del modelos BM25, ahora supondremos que una ventana de contexto ahora es un documento, de modo que obtendremos la similaridad de cada palabra con todos los *documentos* y obtenedremos el mejor resultado para cada palabra, de modo que podremos obtener la similaridad entre una palabra y todos los documentos. 

In [1]:
import nltk
import numpy
import math
from nltk.stem import SnowballStemmer

In [2]:
def cleanHtml(txt):
	from bs4 import BeautifulSoup
	return BeautifulSoup(txt,'lxml').get_text().lower()

In [3]:
def splitText(txt):
	return txt.replace('/', ' ').replace('.', ' ').replace('-', ' ')

In [4]:
def deleteTrash(txt):
	import re
	good = {'\n'}
	for i in "abcdefghijklmnopqrstuvwxyz áéíóúñü":
		good.add(i)
	ans = ""
	for c in txt:
		if c in good:
			ans += c
	return ans

In [5]:
def deleteStopwords(txt):
	from nltk.corpus import stopwords
	ans = []
	stp = stopwords.words()
	for w in txt:
		if w not in stp:
			ans.append(w)
	return ans

In [6]:
def getVocabulary(tokens):
	vocabulary = sorted(set(tokens))
	return vocabulary

In [7]:
def getContexts(wordList, windowSize=4):
    contexts = dict()
    index = 0
    for index in range(len(wordList)):
        word = wordList[index]
        if word not in contexts:
            contexts[word] = []
        start = max(0, index - windowSize)
        end = min(index + windowSize, len(wordList) - 1)
        for i in range(start, end + 1):
            if i != index:
                contexts[word].append(wordList[i])
    return contexts

Ahora cambiaremos nuestras funciones de derivación, por la de **snowball**.

In [8]:
def replaceWithStems(tokens):
    ss = SnowballStemmer("spanish")
    stematized_tokens = []
    for word in tokens:
        stematized_tokens.append(ss.stem(word))
    return stematized_tokens

In [9]:
def dumpToJson(obj, name):
    import json
    with open("./files/"+name, 'w', encoding="utf8") as outfile:
        json.dump(obj, outfile, indent=4)

In [10]:
def loadFromJson(name):
    import json
    with open("./files/"+name, 'r', encoding="utf8") as f:
        return json.load(f)

In [11]:
f = open("./files/e961024.htm", "r", encoding="utf8")
org_txt = f.read()
f.close()

text = deleteTrash(splitText(cleanHtml(org_txt)))
normalizedTokens = deleteStopwords(nltk.word_tokenize(text))
stematizedTokens = replaceWithStems(normalizedTokens)
tokens = stematizedTokens
#dumpToJson(tokens, "stematized_tokens_snowball.json")

contexts = getContexts(tokens)
#dumpToJson(contexts, "contexts_stematized_snowball.json")

#tokens = loadFromJson("stematized_tokens_snowball.json")
vocabulary = getVocabulary(tokens)
#contexts = loadFromJson("contexts_stematized_snowball.json")

### Modelo BM25


In [12]:
def getFrequencyVectors(vocabulary, contexts):
    freqVectors = dict()
    for target in vocabulary:
        vector = list()
        for word in vocabulary:
            vector.append(contexts[target].count(word))
        freqVectors[target] = vector
    return freqVectors

In [13]:
def getAvg(contexts):
    return sum(len(context) for centralWord,context in contexts.items()) / len(contexts)

In [14]:
def getDocs(qi, contexts):
    total = 0
    for centralWord,context in contexts.items():
        if qi in context:
            total = total+1
    return total

In [15]:
def getIDF(N,q,contexts):
    docs = getDocs(q,contexts)
    return math.log( (N-docs+0.5)/(docs+0.5) )
    

In [16]:
def getIdfMap(vocabulary, contexts):
    N = len(vocabulary)
    idfMap = dict()
    for target in vocabulary:
        idfMap[target] = getIDF(N,target, contexts)
    return idfMap

In [17]:
def termFrequency(q,doc):
    return doc.count(q)

In [18]:
def getScore(tf, mg, idf, avgdl, k , b):
    return ((tf*(k+1)) / (tf+k*( 1-b+( (b*mg)/avgdl) ) ))    

In [19]:
def getCos(v1, v2):
    n = len(v1)
    
    pp = 0
    for i in range(0,n):
        pp += (v1[i]*v2[i])
    
    mg1 = 0
    mg2 = 0
    for i in range(0,n):
        mg1 += (v1[i]*v1[i])
        mg2 += (v2[i]*v2[i])
    mgV1 = math.sqrt(mg1)
    mgV2 = math.sqrt(mg2)
    
    if mg1 == 0 or mg2 == 0:
        return 0
    return pp/(mgV1*mgV2)  

In [20]:
def getBM25(vocabulary, contexts):
    k = 1.5
    b = 0.75
    
    idfMap = getIdfMap(vocabulary, contexts)
    freqVectors = getFrequencyVectors(vocabulary, contexts)
    avg = getAvg(contexts)
    
    mapOfBM25 = dict()
    for target in vocabulary:
        fV = freqVectors[target]
        mg = len(contexts[target])
        
        vector = list()
        for freq in fV:
            vector.append(getScore(freq,mg,idfMap[target],avg,k,b))
        mapOfBM25[target] = vector
    return mapOfBM25        

In [21]:
def getSimilarsTo(target, vectors):
    w = SnowballStemmer("spanish").stem(target)
    v1 = vectors[w]
    
    similarity = list()
    for word in vocabulary:
        v2 = vectors[word]
        cos = getCos(v1,v2)
        if cos > 0.0 :
            similarity.append((cos,word))
    
    return sorted(similarity, reverse=True)

In [22]:
bm25 = getBM25(vocabulary, contexts)

In [23]:
result = getSimilarsTo("acero", bm25)

In [24]:
for r in result[:100]:
    print(str(r[0]) + ", " + r[1])

1.0000000000000002, acer
0.5303300858899107, traslad
0.4900443295315784, cabl
0.4161812435586644, tub
0.4161812435586644, microproces
0.38253821696366713, vaci
0.375, desmaterializ
0.25828949695182796, ilog
0.25, fibr
0.22633159011550752, cobr
0.20613200813525495, depresion
0.1832111283478356, alrededor
0.1767766952966369, obedec
0.1767766952966369, cremi
0.17305798843973946, buscatel
0.16372083932838835, sin
0.1634990808969134, paralel
0.1634990808969134, grylm
0.1578917466068364, inapropi
0.1537519020460427, tan
0.15322127154477425, millon
0.1506020829576032, pes
0.1505190248013665, penetr
0.1505190248013665, inyeccion
0.14890963438890767, desplom
0.14860915849551184, visibl
0.14860915849551184, cubr
0.14753019676839768, injustific
0.14749352981765051, rentabil
0.14737547814520655, mil
0.1467612835187306, inscinc
0.1467612835187306, corporation
0.14529168961955197, increment
0.14506882256477227, finaliz
0.14506882256477227, concesion
0.14346993603593514, total
0.14330768802894522, co

Como podemos ver, el model BM25 parece ser bastante acertado a pesar de que no estamos trabajando con documentos directamente. 