# Práctica 10
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

## Relaciones sintagmáticas con Entropy
**Instrucciones:**
    Ahora se leerá el texto como párrafos a los que llamaremos *sentence*, y para cada uno deberemos realizar nuestro proceso de normalización.
    Utilizar derivación y el modelo BM25, pero esta vez con etiquetado POS, y obtener la similitud con los vectores del modelo BM25 pero utilizando la relación de tagging para una determinada palabra.

In [1]:
import nltk
import numpy
import math
from nltk.stem import SnowballStemmer

In [2]:
def cleanHtml(txt):
    """Strip the markup from *txt* and return its text content in lower case.

    The input is parsed by BeautifulSoup with the ``lxml`` parser; the text
    extracted from the parsed document is lower-cased before it is returned.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(txt, 'lxml')
    plain_text = soup.get_text()
    return plain_text.lower()

In [3]:
def splitText(txt):
    """Return *txt* with every '/', '.' and '-' replaced by a single space."""
    # One translation table handles the three separators in a single pass.
    separators_to_space = str.maketrans({'/': ' ', '.': ' ', '-': ' '})
    return txt.translate(separators_to_space)

In [4]:
def deleteTrash(txt):
    """Return *txt* with every character outside the allowed alphabet removed.

    The allowed alphabet is the lower-case letters a-z, the Spanish accented
    letters ``áéíóúñü``, the space and the newline. Anything else (digits,
    punctuation, upper-case letters, ...) is dropped, so the caller is
    expected to lower-case the text beforehand if letters must be kept.

    Args:
        txt: The string to filter.

    Returns:
        A new string made only of the allowed characters, in their original
        order.
    """
    allowed = frozenset("abcdefghijklmnopqrstuvwxyz áéíóúñü\n")
    # join() builds the result in one pass instead of repeated concatenation.
    return "".join(c for c in txt if c in allowed)

In [5]:
def splitToSentences(txt, language="english"):
    """Split *txt* into sentences with a pre-trained NLTK Punkt tokenizer.

    Args:
        txt: The text to split.
        language: Name of the Punkt model to load; it is inserted into the
            resource path ``nltk:tokenizers/punkt/<language>.pickle``. The
            default, ``"english"``, keeps the behavior of earlier callers.

    Returns:
        Whatever the loaded tokenizer's ``tokenize`` method returns for *txt*
        (presumably a list of sentence strings).
    """
    # NOTE(review): the rest of this pipeline uses the Spanish stemmer and a
    # Spanish tagged corpus, so passing language="spanish" may segment the
    # corpus better -- confirm before changing the default.
    resource = 'nltk:tokenizers/punkt/' + language + '.pickle'
    tokenizer = nltk.data.load(resource)
    return tokenizer.tokenize(txt)

In [6]:
def deleteStopwords(txt):
    """Return the tokens of *txt* that are not NLTK stopwords.

    Args:
        txt: An iterable of word tokens.

    Returns:
        A list with the tokens that are not in the stopword collection, in
        their original order.
    """
    from nltk.corpus import stopwords

    # NOTE(review): words() is called without a language, so presumably the
    # stopword lists of every installed language are combined -- confirm that
    # this is intended for a Spanish corpus.
    # The list is turned into a set once so each membership test is O(1)
    # instead of a linear scan per token; the filtered result is unchanged.
    stop_set = set(stopwords.words())
    return [w for w in txt if w not in stop_set]

In [7]:
def getVocabulary(sentences):
    """Return the distinct tokens found in *sentences* as a sorted list.

    *sentences* is an iterable of token sequences; tokens must be hashable
    and mutually comparable so they can be de-duplicated and sorted.
    """
    distinct = {tok for sentence in sentences for tok in sentence}
    return sorted(distinct)

In [8]:
def getFrequencies(sentences, vocabulary):
    """Count how many times each vocabulary token occurs in *sentences*.

    Every token of *vocabulary* starts at zero and is incremented once per
    occurrence, so a token repeated inside one sentence is counted each time.
    A token that appears in *sentences* but not in *vocabulary* raises
    ``KeyError``.

    Returns:
        A dict mapping each vocabulary token to its occurrence count.
    """
    counts = dict.fromkeys(vocabulary, 0)
    for sentence in sentences:
        for tok in sentence:
            counts[tok] = counts[tok] + 1
    return counts

In [9]:
def getContexts(wordList, windowSize=4):
    """Collect, for every word, the words that surround each of its occurrences.

    For each position in *wordList* the neighbours lying at most *windowSize*
    positions to the left and to the right (clipped to the list bounds) are
    appended to that word's context list; the word at the position itself is
    skipped. Contexts of repeated words accumulate in order of appearance.

    Returns:
        A dict mapping each word to the list of its context words.
    """
    contexts = {}
    last = len(wordList) - 1
    for position, current in enumerate(wordList):
        neighbours = contexts.setdefault(current, [])
        left = max(0, position - windowSize)
        right = min(position + windowSize, last)
        # Left part of the window first, then the right part, skipping
        # the current position itself.
        neighbours.extend(wordList[left:position])
        neighbours.extend(wordList[position + 1:right + 1])
    return contexts

In [10]:
def loadFromJson(name):
    """Read the UTF-8 JSON file ``./files/<name>`` and return the decoded object."""
    import json

    path = "./files/" + name
    with open(path, 'r', encoding="utf8") as source:
        raw = source.read()
    return json.loads(raw)

In [11]:
def dumpToJson(obj, name):
    """Serialize *obj* as JSON (4-space indent) into the UTF-8 file ``./files/<name>``.

    The file is opened in write mode, so any previous content is replaced.
    """
    import json

    target_path = "./files/" + name
    with open(target_path, 'w', encoding="utf8") as sink:
        json.dump(obj, sink, indent=4)

In [12]:
def replaceWithStems(tokens):
    """Return a list with the Spanish Snowball stem of every word in *tokens*."""
    stem = SnowballStemmer("spanish").stem
    return [stem(word) for word in tokens]

In [13]:
def replacWithLemmas(tokens):
    """Return *tokens* with each word replaced by its lemma when one is known.

    The word-to-lemma mapping is loaded from the JSON file
    ``lemmatized_tokens_generate``; words missing from it are kept unchanged.
    """
    lemma_of = loadFromJson("lemmatized_tokens_generate")
    return [lemma_of.get(word, word) for word in tokens]

In [14]:
def getTagger():
    """Build the POS tagger used by the normalization step.

    The result is a unigram tagger trained on the tagged sentences of the
    ``cess_esp`` corpus. It backs off to a regular-expression tagger that
    labels words ending in ``o``, ``os``, ``a`` or ``as`` as ``'n'``, which
    in turn backs off to a default tagger that assigns ``'s'``.
    """
    # Each suffix pattern maps to the noun tag 'n'.
    noun_suffix_rules = [
        (r'.*o$', 'n'),   # masculine singular
        (r'.*os$', 'n'),  # masculine plural
        (r'.*a$', 'n'),   # feminine singular
        (r'.*as$', 'n'),  # feminine plural
    ]
    fallback = nltk.DefaultTagger('s')
    suffix_tagger = nltk.RegexpTagger(noun_suffix_rules, backoff=fallback)
    training_sents = nltk.corpus.cess_esp.tagged_sents()
    return nltk.UnigramTagger(training_sents, backoff=suffix_tagger)

In [15]:
def mixTags(tokens, tokenTags):
    """Pair each token with the tag found at the same position in *tokenTags*.

    *tokenTags* holds ``(word, tag)`` pairs; only the tag (index 1) is used.
    An ``IndexError`` is raised when *tokenTags* is shorter than *tokens*.

    Returns:
        A list of ``(token, tag)`` tuples, one per element of *tokens*.
    """
    return [(tok, tokenTags[pos][1]) for pos, tok in enumerate(tokens)]

In [16]:
def normalizeSencences(sent):
    """Normalize every raw sentence of *sent* into a list of ``(stem, tag)`` pairs.

    Each sentence is cleaned (separators replaced by spaces, disallowed
    characters removed), tokenized and stripped of stopwords. The surviving
    words are POS-tagged before stemming, and each stem is then paired with
    the tag of the word it came from.

    Returns:
        A list with one list of ``(stem, tag)`` tuples per input sentence.
    """
    pos_tagger = getTagger()  # built once and shared by all sentences

    def normalize(raw_sentence):
        cleaned = deleteTrash(splitText(raw_sentence))
        kept_words = deleteStopwords(nltk.word_tokenize(cleaned))
        # Tag the full words first; the stems replace them only afterwards.
        word_tags = pos_tagger.tag(kept_words)
        return mixTags(replaceWithStems(kept_words), word_tags)

    return [normalize(raw_sentence) for raw_sentence in sent]

In [17]:
# Read the raw HTML article. The context manager closes the file even when
# read() raises, which the manual open()/close() pair did not guarantee.
with open("./files/e961024.htm", "r", encoding="utf8") as f:
    org_txt = f.read()

# Strip the markup, split the text into sentences and normalize each one
# into a list of (stem, tag) tokens.
sentences = splitToSentences(cleanHtml(org_txt))
sentences = normalizeSencences(sentences)

In [18]:
# Sorted list of the distinct (stem, tag) tokens found in the normalized sentences.
vocabulary = getVocabulary(sentences)
# Number of occurrences of every vocabulary token across all sentences.
frequencies = getFrequencies(sentences, vocabulary)

### Usando la Entropía


In [19]:
def getEntropy(tgP, wordP, target, word, n, sentences):
    """Return the entropy of *target*'s presence given *word*'s presence, in bits.

    The joint probability of both tokens appearing together is estimated as
    the fraction of *sentences* (out of *n*) that contain both. The other
    three joint probabilities are derived from it and from the marginals
    *tgP* and *wordP*. The result is
    ``-sum(p(t, w) * log2(p(t, w) / p(w)))`` over the four presence/absence
    combinations, i.e. the conditional-entropy form H(target | word). Terms
    whose ratio is not strictly positive, or whose marginal is zero,
    contribute nothing.

    Args:
        tgP: Probability that a sentence contains *target*.
        wordP: Probability that a sentence contains *word*.
        target: The target token.
        word: The token compared against *target*.
        n: Number of sentences used as the denominator of the joint estimate.
        sentences: Iterable of token containers supporting ``in``.

    Raises:
        ZeroDivisionError: If *n* is zero.
    """
    together = sum(1 for sent in sentences if (target in sent) and (word in sent))
    prob_11 = together / n

    no_target = 1 - tgP
    no_word = 1 - wordP

    # First digit: target present (1) / absent (0); second digit: same for word.
    prob_01 = wordP - prob_11
    prob_10 = tgP - prob_11
    prob_00 = no_target - prob_01

    def term(joint, marginal):
        # Only strictly positive ratios have a defined logarithm.
        if joint / marginal > 0:
            return joint * math.log2(joint / marginal)
        return 0

    a = b = c = d = 0
    if no_word != 0:
        a = term(prob_00, no_word)
        b = term(prob_10, no_word)
    if wordP != 0:
        c = term(prob_01, wordP)
        d = term(prob_11, wordP)

    return -(a + b + c + d)

In [20]:
def getSimilarsTo(target, frequency, vocabulary, sentences):
    """Rank every vocabulary token by its accumulated entropy against *target*.

    *target* is stemmed with the Spanish Snowball stemmer and every
    vocabulary token whose first element equals that stem is treated as a
    variant of the target (one per POS tag). For each vocabulary token the
    entropies against all variants are summed. The variants found are
    printed before the computation starts.

    Args:
        target: The word to look for (unstemmed).
        frequency: Mapping from token to its count.
        vocabulary: Iterable of ``(stem, tag)`` tokens.
        sentences: List of normalized sentences (lists of tokens).

    Returns:
        A list of ``(entropy, token)`` tuples sorted in ascending order.
    """
    # NOTE(review): frequency[...] / n is used as a per-sentence probability;
    # if frequency holds raw occurrence counts, a token repeated within one
    # sentence can push the ratio above what the sentence count supports --
    # confirm against how the frequencies are computed.
    stem = SnowballStemmer("spanish").stem(target)
    n = len(sentences)

    variants = {token for token in vocabulary if token[0] == stem}

    print("Checking for " + str(len(variants)) + " tokens of word " + target)
    for variant in variants:
        print("\t" + str(variant))

    accumulated = {token: 0.0 for token in vocabulary}
    for variant in variants:
        variant_prob = frequency[variant] / n
        for token in vocabulary:
            token_prob = frequency[token] / n
            accumulated[token] += getEntropy(
                variant_prob, token_prob, variant, token, n, sentences
            )

    return sorted((entropy, token) for token, entropy in accumulated.items())

In [21]:
# Rank every vocabulary token by its accumulated entropy with respect to "economía".
result = getSimilarsTo("economía", frequencies, vocabulary, sentences)

Checking for 8 tokens of word economía
	('econom', 'aq0ms0')
	('econom', 'aq0mp0')
	('econom', 'ncfp000')
	('econom', 'ncfs000')
	('econom', 'nccp000')
	('econom', 'aq0fs0')
	('econom', 'nccs000')
	('econom', 'aq0fp0')


In [23]:
# Show the 100 entries with the lowest accumulated entropy (the list is sorted ascending).
for entropy, token in result[:100]:
    print(entropy, token, sep=", ")

0.3641814132108909, ('econom', 'ncfs000')
0.3675207047273351, ('econom', 'aq0fs0')
0.39793888255221943, ('econom', 'aq0ms0')
0.45103438653804856, ('econom', 'aq0mp0')
0.4524651492150109, ('econom', 'aq0fp0')
0.45884011876521114, ('econom', 'nccs000')
0.4696252930143442, ('econom', 'nccp000')
0.47187853733975615, ('ven', 'vmip3p0')
0.47709654375915966, ('recuper', 'ncfs000')
0.4773778056725599, ('econom', 'ncfp000')
0.4780838194149634, ('crecimient', 'ncms000')
0.4807410909660985, ('perversion', 's')
0.4809802425618901, ('conoc', 'ncms000')
0.4812349214112581, ('funcionari', 'ncmp000')
0.4814087199213799, ('globaliz', 'ncfs000')
0.48160438161419983, ('bm', 's')
0.4817505012620616, ('product', 'ncmp000')
0.48190187955134867, ('educ', 'ncfs000')
0.48215963140215146, ('mal', 'aq0mp0')
0.48215963140215146, ('mit', 'ncmp000')
0.48215963140215146, ('paz', 'n')
0.48215963140215146, ('valor', 'vmip3p0')
0.4821903602259055, ('pais', 'ncms000')
0.48225256299422237, ('mundial', 'aq0cs0')
0.4823480

En los resultados podemos observar cómo el modelo de *Entropía* igualmente arroja resultados similares a una palabra sin requerir los cosenos de todos los vectores.
Se arrojan valores similares a economía, como lo son crecimiento, valor, país, macroeconomía y finanzas.

In [24]:
# Rank every vocabulary token by its accumulated entropy with respect to "acero".
result2 = getSimilarsTo("acero", frequencies, vocabulary, sentences)

Checking for 1 tokens of word acero
	('acer', 'ncms000')


In [25]:
# Show the 100 entries with the lowest accumulated entropy (the list is sorted ascending).
for entropy, token in result2[:100]:
    print(entropy, token, sep=", ")

0.0, ('acer', 'ncms000')
0.0, ('barat', 'aq0fs0')
0.0, ('cabl', 's')
0.0, ('cobr', 'ncms000')
0.0, ('codif', 's')
0.0, ('desmaterializ', 'n')
0.0, ('distanci', 'ncfp000')
0.0, ('fibr', 'n')
0.0, ('impuls', 'vmp00sm')
0.0, ('larg', 'aq0fp0')
0.0, ('microproces', 's')
0.0, ('mov', 'vmn0000')
0.0, ('noved', 's')
0.0, ('optic', 'n')
0.0, ('pes', 'vmp00sm')
0.0, ('rasg', 'n')
0.0, ('transistor', 's')
0.0, ('traslad', 'vmp00sm')
0.0, ('tub', 'ncmp000')
0.0010887316276537834, ('aceler', 'n')
0.0010887316276537834, ('cabl', 'ncms000')
0.0010887316276537834, ('digital', 'aq0cs0')
0.0010887316276537834, ('vaci', 'ncms000')
0.0014996665771167495, ('transmision', 'ncfs000')
0.0019649648744892827, ('tecnolog', 'aq0ms0')
0.002123099907398, ('bas', 'aq0fsp')
0.002123099907398, ('intens', 'aq0fs0')
0.0022546050354253088, ('diferent', 'ncfs000')
0.0022546050354253088, ('facil', 'aq0cs0')
0.0023671832039176763, ('industrial', 'aq0cs0')
0.0025530516798542343, ('epoc', 'ncfs000')
0.002703212957889823, ('e

Como podemos ver, en esta ocasión, a diferencia de los vectores, en la entropía la palabra traslado no aparece tan cercana a la palabra *acero*, pero sí lo son cable, cobre, distancia y traslado, que tienen una relación con el proceso que vive el acero.