# Practica 7
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

## Similaridad  con snowball-stem y el modelo de recuperación TF-IDF
**Instrucciones:**
    Con lo visto del modelos TF-IDF, ahora supondremos que una ventana de contexto ahora es un documento, de modo que obtendremos la similaridad de cada palabra con todos los *documentos* y obtenedremos el mejor resultado para cada palabra, de modo que podremos obtener la similaridad entre una palabra y todos los documentos. 

In [1]:
import nltk
import numpy
import math
from nltk.stem import SnowballStemmer

In [2]:
def cleanHtml(txt):
	from bs4 import BeautifulSoup
	return BeautifulSoup(txt,'lxml').get_text().lower()

In [3]:
def splitText(txt):
	return txt.replace('/', ' ').replace('.', ' ').replace('-', ' ')

In [4]:
def deleteTrash(txt):
	import re
	good = {'\n'}
	for i in "abcdefghijklmnopqrstuvwxyz áéíóúñü":
		good.add(i)
	ans = ""
	for c in txt:
		if c in good:
			ans += c
	return ans

In [5]:
def deleteStopwords(txt):
	from nltk.corpus import stopwords
	ans = []
	stp = stopwords.words()
	for w in txt:
		if w not in stp:
			ans.append(w)
	return ans

In [6]:
def getVocabulary(tokens):
	vocabulary = sorted(set(tokens))
	return vocabulary

In [7]:
def getContexts(wordList, windowSize=4):
    contexts = dict()
    index = 0
    for index in range(len(wordList)):
        word = wordList[index]
        if word not in contexts:
            contexts[word] = []
        start = max(0, index - windowSize)
        end = min(index + windowSize, len(wordList) - 1)
        for i in range(start, end + 1):
            if i != index:
                contexts[word].append(wordList[i])
    return contexts

In [8]:
def replaceWithStems(tokens):
    ss = SnowballStemmer("spanish")
    stematized_tokens = []
    for word in tokens:
        stematized_tokens.append(ss.stem(word))
    return stematized_tokens

In [9]:
def dumpToJson(obj, name):
    import json
    with open("./files/"+name, 'w', encoding="utf8") as outfile:
        json.dump(obj, outfile, indent=4)

In [10]:
def loadFromJson(name):
    import json
    with open("./files/"+name, 'r', encoding="utf8") as f:
        return json.load(f)

In [11]:
f = open("./files/e961024.htm", "r", encoding="utf8")
org_txt = f.read()
f.close()

text = deleteTrash(splitText(cleanHtml(org_txt)))
normalizedTokens = deleteStopwords(nltk.word_tokenize(text))
stematizedTokens = replaceWithStems(normalizedTokens)
tokens = stematizedTokens
#dumpToJson(tokens, "stematized_tokens_snowball.json")

contexts = getContexts(tokens)
#dumpToJson(contexts, "contexts_stematized_snowball.json")

#tokens = loadFromJson("stematized_tokens_snowball.json")
vocabulary = getVocabulary(tokens)
#contexts = loadFromJson("contexts_stematized_snowball.json")

In [None]:
def getCos(v1, v2):
    n = len(v1)
    
    pp = 0
    for i in range(0,n):
        pp += (v1[i]*v2[i])
    
    mg1 = 0
    mg2 = 0
    for i in range(0,n):
        mg1 += (v1[i]*v1[i])
        mg2 += (v2[i]*v2[i])
    mgV1 = math.sqrt(mg1)
    mgV2 = math.sqrt(mg2)
    
    if mg1 == 0 or mg2 == 0:
        return 0
    return pp/(mgV1*mgV2) 

In [None]:
def getSimilarsTo(target, vectors):
    w = SnowballStemmer("spanish").stem(target)
    v1 = vectors[w]
    
    similarity = list()
    for word in vocabulary:
        v2 = vectors[word]
        cos = getCos(v1,v2)
        if cos > 0.0 :
            similarity.append((cos,word))
    
    return sorted(similarity, reverse=True)

In [12]:
def getFrequencyVectors(vocabulary, contexts):
    freqVectors = dict()
    for target in vocabulary:
        vector = list()
        for word in vocabulary:
            vector.append(contexts[target].count(word))
        freqVectors[target] = vector
    return freqVectors

In [13]:
def getAvg(contexts):
    return sum(len(context) for centralWord,context in contexts.items()) / len(contexts)

In [14]:
def getDocs(qi, contexts):
    total = 0
    for centralWord,context in contexts.items():
        if qi in context:
            total = total+1
    return total

In [15]:
def getIDF(N,q,contexts):
    docs = getDocs(q,contexts)
    return math.log( (N-docs+0.5)/(docs+0.5) )
    

In [16]:
def getIdfMap(vocabulary, contexts):
    N = len(vocabulary)
    idfMap = dict()
    for target in vocabulary:
        idfMap[target] = getIDF(N,target, contexts)
    return idfMap

In [17]:
def termFrequency(q,doc):
    return doc.count(q)

In [18]:
def getScore(tf, mg, idf, avgdl, k , b):
    return ((tf*(k+1)) / (tf+k*( 1-b+( (b*mg)/avgdl) ) ))    

### TF-IDF
Utilizaremos las mismas funciones de BM25, pero en esta ocasión con la fórmula de $tf*idf$

In [20]:
def getTfIdf(vocabulary, contexts):    
    idfMap = getIdfMap(vocabulary, contexts)
    freqVectors = getFrequencyVectors(vocabulary, contexts)
    
    vectors = dict()
    for target in vocabulary:
        fV = freqVectors[target]
        mg = len(contexts[target])
        
        vector = list()
        for freq in fV:
            vector.append(freq*idfMap[target])
        vectors[target] = vector
    return vectors        

In [22]:
tf_idf = getTfIdf(vocabulary, contexts)

In [23]:
result = getSimilarsTo("acero", tf_idf)

In [24]:
for r in result[:100]:
    print(str(r[0]) + ", " + r[1])

1.0000000000000002, acer
0.5303300858899106, traslad
0.4999999999999999, cabl
0.44721359549995804, tub
0.44721359549995804, microproces
0.4166666666666667, vaci
0.3750000000000001, desmaterializ
0.2558831578595797, sin
0.25000000000000006, fibr
0.25000000000000006, cobr
0.23717082451262853, depresion
0.22360679774997905, inapropi
0.22360679774997902, ilog
0.21889599547077093, millon
0.2165063509461096, devalu
0.21501831715537362, dol
0.20412414523193156, grylm
0.2041241452319315, paralel
0.20044593143431844, tan
0.19999999999999998, rentabil
0.19867985355975665, alrededor
0.1952833664712358, depreci
0.18569533817705183, chicag
0.17677669529663695, moned
0.17677669529663692, buscatel
0.1767766952966369, obedec
0.1767766952966369, cremi
0.17677669529663687, visibl
0.17677669529663687, cubr
0.17446914130588104, increment
0.17206180040292138, desplom
0.17160161824591105, cos
0.1698415551216895, diciembr
0.1666666666666667, ponent
0.16666666666666666, subvalu
0.16666666666666666, estabiliz


Como podemos ver, el modelo TF-IDF nos devuelve resultados muy similares a los obtenidos con BM25