# Práctica 12
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

## Información Mutua con KL-Divergence
**Instrucciones:**
    Utilizar la incertidumbre de información mutua combinada con entropía, utilizando la fórmula de divergencia de Kullback-Leibler.

In [1]:
import nltk
import numpy
import math
from nltk.stem import SnowballStemmer

In [2]:
def cleanHtml(txt):
    """Strip the markup from an HTML string and return its text in lowercase.

    Args:
        txt: HTML source, parsed by BeautifulSoup with the 'lxml' parser.

    Returns:
        The text extracted by ``get_text()``, lowercased.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(txt, 'lxml')
    plainText = soup.get_text()
    return plainText.lower()

In [3]:
def splitText(txt):
    """Replace every '/', '.' and '-' in *txt* with a single space.

    Args:
        txt: Input string.

    Returns:
        A copy of *txt* with those three characters turned into spaces.
    """
    for separator in ('/', '.', '-'):
        txt = txt.replace(separator, ' ')
    return txt

In [4]:
def deleteTrash(txt):
    """Drop every character that is not a lowercase letter, space or newline.

    The allowed set is the ASCII lowercase alphabet, the space, the Spanish
    accented letters ``áéíóúñü`` and the newline. Anything else (digits,
    punctuation, uppercase letters, ...) is removed.

    Args:
        txt: Input string.

    Returns:
        A new string made of the allowed characters of *txt*, in order.
    """
    allowed = set("abcdefghijklmnopqrstuvwxyz áéíóúñü") | {'\n'}
    # join() builds the result in one pass instead of repeated concatenation.
    return "".join(c for c in txt if c in allowed)

In [5]:
def splitToSentences(txt, language='english'):
    """Split *txt* into sentences with an NLTK Punkt tokenizer.

    Args:
        txt: Text to segment.
        language: Name of the pre-trained Punkt model to load from
            ``tokenizers/punkt/<language>.pickle``. Defaults to 'english',
            which is the model this function always used before.

    Returns:
        The list of sentences returned by the tokenizer.
    """
    # NOTE(review): the rest of this notebook stems and tags Spanish text;
    # consider calling this with language='spanish'. The default is kept so
    # existing results do not change.
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/' + language + '.pickle')
    return tokenizer.tokenize(txt)

In [6]:
def deleteStopwords(txt):
    """Return the tokens of *txt* that are not NLTK stopwords.

    Args:
        txt: Iterable of tokens (strings).

    Returns:
        A list with the tokens that are not stopwords, in their original
        order.
    """
    from nltk.corpus import stopwords

    # words() is called without a language.
    # NOTE(review): per the NLTK corpus reader this should yield the stopword
    # lists of every available language combined -- confirm that is intended
    # for a Spanish corpus.
    # A set makes each membership test O(1) instead of scanning a long list.
    stopwordSet = set(stopwords.words())
    return [w for w in txt if w not in stopwordSet]

In [7]:
def getVocabulary(sentences):
    """Collect the distinct tokens of all sentences as a sorted list.

    Args:
        sentences: Iterable of sentences, each an iterable of hashable,
            mutually comparable tokens.

    Returns:
        A sorted list holding every distinct token exactly once.
    """
    unique = set()
    for sentence in sentences:
        unique.update(sentence)
    return sorted(unique)

In [8]:
def getFrequencies(sentences, vocabulary):
    """Count how many times each vocabulary entry occurs in the sentences.

    Args:
        sentences: Iterable of sentences, each an iterable of tokens.
        vocabulary: Iterable of the tokens to count; each starts at zero.

    Returns:
        A dict mapping every vocabulary token to its number of occurrences.

    Raises:
        KeyError: If a sentence contains a token missing from *vocabulary*.
    """
    counts = dict.fromkeys(vocabulary, 0)
    for sentence in sentences:
        for tok in sentence:
            counts[tok] = counts[tok] + 1
    return counts

In [9]:
def getContexts(wordList, windowSize=4):
    """Gather, for each word, the words that appear around its occurrences.

    For every position the neighbours within *windowSize* positions to the
    left and to the right (clipped to the list bounds) are appended to that
    word's context list; the word at the position itself is skipped.

    Args:
        wordList: Sequence of words.
        windowSize: Number of positions inspected on each side.

    Returns:
        A dict mapping each word to the list of its neighbours, accumulated
        over all of its occurrences in order.
    """
    contexts = dict()
    lastIndex = len(wordList) - 1
    for position, word in enumerate(wordList):
        first = max(0, position - windowSize)
        last = min(position + windowSize, lastIndex)
        neighbours = [wordList[i] for i in range(first, last + 1) if i != position]
        contexts.setdefault(word, []).extend(neighbours)
    return contexts

In [10]:
def loadFromJson(name):
    """Read and parse a JSON file stored under ``./files/``.

    Args:
        name: File name, appended to the ``./files/`` prefix.

    Returns:
        The object decoded from the file's JSON content.
    """
    import json

    path = "./files/" + name
    with open(path, 'r', encoding="utf8") as source:
        parsed = json.load(source)
    return parsed

In [11]:
def dumpToJson(obj, name):
    """Serialize *obj* as indented JSON into a file under ``./files/``.

    Args:
        obj: JSON-serializable object.
        name: File name, appended to the ``./files/`` prefix. The file is
            opened in write mode, so existing content is replaced.
    """
    import json

    destination = "./files/" + name
    with open(destination, 'w', encoding="utf8") as sink:
        json.dump(obj, sink, indent=4)

In [12]:
def replaceWithStems(tokens):
    """Map every token to its stem using the Spanish Snowball stemmer.

    Args:
        tokens: Iterable of words.

    Returns:
        A list with the stem of each word, in the same order.
    """
    stemmer = SnowballStemmer("spanish")
    return list(map(stemmer.stem, tokens))

In [13]:
def replacWithLemmas(tokens):
    """Replace each token by its lemma when the lemma table knows it.

    The table is loaded from the JSON file ``lemmatized_tokens_generate`` via
    ``loadFromJson``; tokens that are not keys of the table are kept as-is.

    Args:
        tokens: Iterable of words.

    Returns:
        A list with one entry per input token, in the same order.
    """
    lemmaTable = loadFromJson("lemmatized_tokens_generate")
    return [lemmaTable.get(word, word) for word in tokens]

In [14]:
def getTagger():
    """Build the part-of-speech tagger used by this notebook.

    A unigram tagger is trained on the tagged sentences of the ``cess_esp``
    corpus. It backs off to a regular-expression tagger that labels words
    ending in o/os/a/as as 'n', which in turn backs off to a default tagger
    that labels everything 's'.

    Returns:
        The trained ``nltk.UnigramTagger``.
    """
    nounSuffixRules = [
        (r'.*o$', 'n'),   # noun, masculine singular
        (r'.*os$', 'n'),  # noun, masculine plural
        (r'.*a$', 'n'),   # noun, feminine singular
        (r'.*as$', 'n'),  # noun, feminine plural
    ]
    fallback = nltk.RegexpTagger(nounSuffixRules, backoff=nltk.DefaultTagger('s'))
    return nltk.UnigramTagger(
        train=nltk.corpus.cess_esp.tagged_sents(),
        model=None,
        backoff=fallback,
    )

In [15]:
def mixTags(tokens, tokenTags):
    """Pair each token with the tag found at the same position.

    Args:
        tokens: Sequence of tokens.
        tokenTags: Sequence of indexable items whose element 1 is the tag
            (e.g. ``(word, tag)`` pairs); must be at least as long as
            *tokens*.

    Returns:
        A list of ``(token, tag)`` tuples, one per token.

    Raises:
        IndexError: If *tokenTags* is shorter than *tokens*.
    """
    return [(token, tokenTags[position][1]) for position, token in enumerate(tokens)]

In [16]:
def normalizeSencences(sent):
    """Turn raw sentences into lists of ``(stem, tag)`` tokens.

    Each sentence is cleaned with ``splitText`` and ``deleteTrash``,
    tokenized, stripped of stopwords, tagged, and finally every surviving
    word is replaced by its stem while keeping the tag that was assigned to
    the unstemmed word.

    Args:
        sent: Iterable of raw sentence strings.

    Returns:
        A list with one list of ``(stem, tag)`` tuples per input sentence.
    """
    tagger = getTagger()

    def normalizeOne(rawSentence):
        words = nltk.word_tokenize(deleteTrash(splitText(rawSentence)))
        contentWords = deleteStopwords(words)
        # Tag before stemming: the tagger sees the full word forms.
        tags = tagger.tag(contentWords)
        return mixTags(replaceWithStems(contentWords), tags)

    return [normalizeOne(rawSentence) for rawSentence in sent]

In [17]:
# Read the raw HTML document; the context manager closes the file even if
# reading fails.
with open("./files/e961024.htm", "r", encoding="utf8") as f:
    org_txt = f.read()

# Strip the markup, split into sentences, then normalize each sentence into
# a list of (stem, tag) tokens.
sentences = splitToSentences(cleanHtml(org_txt))
sentences = normalizeSencences(sentences)

In [18]:
# Sorted list of the distinct tokens found across all normalized sentences.
vocabulary = getVocabulary(sentences)
# Number of occurrences of each vocabulary token over the whole corpus.
frequencies = getFrequencies(sentences, vocabulary)

### Usando la Entropía


In [19]:
def getEntropy(tgP, wordP, target, word, n, sentences):
    """Return the negated mutual-information score of *target* and *word*.

    The joint probability of both tokens is estimated as the fraction of
    sentences containing the two of them. Together with the supplied
    marginals it yields the four cells of the presence/absence table, and
    the score is the negated sum of ``p * log2(p / (p_target * p_word))``
    over those cells (a KL divergence between the joint distribution and the
    product of the marginals).

    Args:
        tgP: Marginal probability of the target token.
        wordP: Marginal probability of the other token.
        target: Target token, looked up with ``in`` in each sentence.
        word: Other token, looked up with ``in`` in each sentence.
        n: Number of sentences used as the denominator of the joint estimate.
        sentences: Iterable of sentences (containers of tokens).

    Returns:
        The negated sum of the four terms. Cells whose ratio is not positive,
        or whose denominator is zero, contribute nothing.

    Raises:
        ZeroDivisionError: If *n* is zero.
    """
    joint_count = sum(1 for sent in sentences if (target in sent) and (word in sent))
    prob_11 = joint_count / n

    tgQ = 1 - tgP
    wordQ = 1 - wordP

    # Remaining cells of the 2x2 table, derived from the marginals.
    prob_01 = wordP - prob_11  # word present, target absent
    prob_10 = tgP - prob_11    # target present, word absent
    prob_00 = tgQ - prob_01    # both absent

    def term(joint, independent):
        # One KL-divergence summand. A zero denominator (any marginal equal
        # to 0 or 1) or a non-positive ratio contributes nothing, instead of
        # raising ZeroDivisionError or a math domain error.
        if independent == 0:
            return 0
        ratio = joint / independent
        if ratio > 0:
            return joint * math.log2(ratio)
        return 0

    total = (term(prob_00, tgQ * wordQ)
             + term(prob_10, tgP * wordQ)
             + term(prob_01, tgQ * wordP)
             + term(prob_11, tgP * wordP))
    return total * -1

In [20]:
def getSimilarsTo(target, frequency, vocabulary, sentences):
    """Rank every vocabulary entry by its score against a target word.

    The target is stemmed with the Spanish Snowball stemmer and every
    vocabulary entry whose first element equals that stem is treated as a
    form of the target. For each such form the ``getEntropy`` score against
    every vocabulary entry is accumulated. The matched forms are printed.

    Args:
        target: Word to look up (unstemmed).
        frequency: Mapping from vocabulary entry to its occurrence count.
        vocabulary: Iterable of entries indexable at position 0 (the stem).
        sentences: Sequence of normalized sentences.

    Returns:
        A list of ``(score, entry)`` tuples sorted in ascending order.
    """
    stem = SnowballStemmer("spanish").stem(target)
    n = len(sentences)
    k = 2  # Laplace smoothing variable

    targetTokens = {token for token in vocabulary if token[0] == stem}

    print("Checking for " + str(len(targetTokens)) + " tokens of word " + target)
    for tg in targetTokens:
        print("\t" + str(tg))

    similarity = {word: 0.0 for word in vocabulary}
    for tg in targetTokens:
        smoothedTotal = n + frequency[tg]*k
        tgProb = (frequency[tg]+k)/smoothedTotal

        for word in vocabulary:
            # NOTE(review): the denominator is built from frequency[tg], not
            # frequency[word] -- possibly a copy-paste slip; confirm before
            # changing, since the recorded results depend on it.
            wProb = (frequency[word]+k)/smoothedTotal
            similarity[word] += getEntropy(tgProb, wProb, tg, word, n, sentences)

    return sorted((entropy, token) for token, entropy in similarity.items())

In [21]:
# Score every vocabulary entry against the stem of "economía"; the returned
# list of (score, entry) tuples is sorted ascending by score.
result = getSimilarsTo("economía", frequencies, vocabulary, sentences)

Checking for 8 tokens of word economía
	('econom', 'aq0fp0')
	('econom', 'ncfp000')
	('econom', 'nccp000')
	('econom', 'aq0fs0')
	('econom', 'aq0mp0')
	('econom', 'aq0ms0')
	('econom', 'nccs000')
	('econom', 'ncfs000')


In [22]:
# Show the 100 entries with the lowest scores as "score, entry".
for score, entry in result[:100]:
    print(str(score) + ", " + str(entry))

-0.12215123286222171, ('econom', 'ncfs000')
-0.1179139460041253, ('econom', 'aq0fs0')
-0.08817665887884213, ('econom', 'aq0ms0')
-0.033279448289898345, ('econom', 'aq0mp0')
-0.031486875130163836, ('econom', 'aq0fp0')
-0.025920987920871157, ('econom', 'nccs000')
-0.01593333095700152, ('econom', 'nccp000')
-0.015043063137908553, ('ven', 'vmip3p0')
-0.013711532139178298, ('recuper', 'ncfs000')
-0.013313680359125337, ('crecimient', 'ncms000')
-0.009548332426989405, ('econom', 'ncfp000')
-0.009123454957945268, ('globaliz', 'ncfs000')
-0.009037127138095263, ('polit', 'ncfs000')
-0.008850955015992245, ('pais', 'ncms000')
-0.008819658373953331, ('funcionari', 'ncmp000')
-0.008696320680427673, ('mundial', 'aq0cs0')
-0.008687966293415988, ('financier', 'aq0mp0')
-0.008473166257941662, ('perversion', 's')
-0.008275545766920224, ('conoc', 'ncms000')
-0.008052956408488663, ('crisis', 'ncfn000')
-0.008003689688639775, ('bm', 's')
-0.007997081386527978, ('mexican', 'aq0fs0')
-0.007896544587901339, ('

In [23]:
# Same ranking for the stem of "acero"; the returned list of (score, entry)
# tuples is sorted ascending by score.
result2 = getSimilarsTo("acero", frequencies, vocabulary, sentences)

Checking for 1 tokens of word acero
	('acer', 'ncms000')


In [25]:
# Show the 50 entries with the lowest scores as "score, entry".
for score, entry in result2[:50]:
    print(str(score) + ", " + str(entry))

-0.0036922259662763987, ('acer', 'ncms000')
-0.0036922259662763987, ('barat', 'aq0fs0')
-0.0036922259662763987, ('cabl', 's')
-0.0036922259662763987, ('cobr', 'ncms000')
-0.0036922259662763987, ('codif', 's')
-0.0036922259662763987, ('desmaterializ', 'n')
-0.0036922259662763987, ('distanci', 'ncfp000')
-0.0036922259662763987, ('fibr', 'n')
-0.0036922259662763987, ('impuls', 'vmp00sm')
-0.0036922259662763987, ('larg', 'aq0fp0')
-0.0036922259662763987, ('microproces', 's')
-0.0036922259662763987, ('mov', 'vmn0000')
-0.0036922259662763987, ('noved', 's')
-0.0036922259662763987, ('optic', 'n')
-0.0036922259662763987, ('pes', 'vmp00sm')
-0.0036922259662763987, ('rasg', 'n')
-0.0036922259662763987, ('transistor', 's')
-0.0036922259662763987, ('traslad', 'vmp00sm')
-0.0036922259662763987, ('tub', 'ncmp000')
-0.00342616288109798, ('aceler', 'n')
-0.00342616288109798, ('cabl', 'ncms000')
-0.00342616288109798, ('digital', 'aq0cs0')
-0.00342616288109798, ('vaci', 'ncms000')
-0.0032285516449949386