# Practica 14
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

## Obtener palabras frencuentes con modelo TF-IDF
**Instrucciones:**
    Obtener una lista de las palabras más frecuentes en el documento usando el modelo TF-IDF y ordenarlas en forma descendente.

In [1]:
import nltk
import numpy
import math
from nltk.stem import SnowballStemmer

In [2]:
def cleanHtml(txt):
	from bs4 import BeautifulSoup
	return BeautifulSoup(txt,'lxml').get_text().lower()

In [3]:
def splitText(txt):
	return txt.replace('/', ' ').replace('.', ' ').replace('-', ' ')

In [4]:
def deleteTrash(txt):
	import re
	good = {'\n'}
	for i in "abcdefghijklmnopqrstuvwxyz áéíóúñü":
		good.add(i)
	ans = ""
	for c in txt:
		if c in good:
			ans += c
	return ans

In [5]:
def splitToSentences(txt):
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    return tokenizer.tokenize(txt)

In [6]:
def deleteStopwords(txt):
	from nltk.corpus import stopwords
	ans = []
	stp = stopwords.words()
	for w in txt:
		if w not in stp:
			ans.append(w)
	return ans

In [7]:
def getVocabulary(sentences):
    tokens = set()
    for sent in sentences:
        for token in sent:
            tokens.add(token)
    return sorted(tokens)

In [8]:
def getFrequencies(sentences, vocabulary):
    tokens = dict()
    
    for token in vocabulary:
        tokens[token] = 0
    
    for sent in sentences:
        for token in sent:
            tokens[token] += 1
            
    return tokens

In [9]:
def getContexts(wordList, windowSize=4):
    contexts = dict()
    index = 0
    for index in range(len(wordList)):
        word = wordList[index]
        if word not in contexts:
            contexts[word] = []
        start = max(0, index - windowSize)
        end = min(index + windowSize, len(wordList) - 1)
        for i in range(start, end + 1):
            if i != index:
                contexts[word].append(wordList[i])
    return contexts

In [10]:
def loadFromJson(name):
    import json
    with open("./files/"+name, 'r', encoding="utf8") as f:
        return json.load(f)

In [11]:
def dumpToJson(obj, name):
    import json
    with open("./files/"+name, 'w', encoding="utf8") as outfile:
        json.dump(obj, outfile, indent=4)

In [12]:
def replaceWithStems(tokens):
    ss = SnowballStemmer("spanish")
    stematized_tokens = []
    for word in tokens:
        stematized_tokens.append(ss.stem(word))
    return stematized_tokens

In [13]:
def replacWithLemmas(tokens):
    lemmas = loadFromJson("lemmatized_tokens_generate")
    lemmatized_tokens = []
    for word in tokens:
        if word in lemmas.keys():
            lemmatized_tokens.append(lemmas[word])
        else:
            lemmatized_tokens.append(word)
    return lemmatized_tokens

In [14]:
def getTagger():
    patterns = [
        (r'.*o$', 'n'),  # noun masculine singular
        (r'.*os$', 'n'), # noun masculine plural
        (r'.*a$', 'n'),  # noun femenine singular
        (r'.*as$', 'n')  # noun femenine plural
    ]
    regexTagger = nltk.RegexpTagger(patterns, nltk.DefaultTagger('s'))
    unigramTagger = nltk.UnigramTagger(nltk.corpus.cess_esp.tagged_sents(), None, regexTagger)
    return unigramTagger

In [15]:
def mixTags(tokens, tokenTags):
    taggedTokens = list()
    for i in range(0,len(tokens)):
        taggedTokens.append((tokens[i], tokenTags[i][1]))
    return taggedTokens

In [16]:
def normalizeSencences(sent):
    tagger = getTagger()
    sentences = []
    for s in sent:
        cleanSentence = deleteTrash(splitText(s))
        normalizedTokens = deleteStopwords(nltk.word_tokenize(cleanSentence))
        tokenTags = tagger.tag(normalizedTokens)
        stematizedTokens = replaceWithStems(normalizedTokens)
        tokens = mixTags(stematizedTokens, tokenTags)
        sentences.append(tokens)
    return sentences

In [17]:
f = open("./files/e961024.htm", "r", encoding="utf8")
org_txt = f.read()
f.close()

sentences = splitToSentences(cleanHtml(org_txt))
sentences = normalizeSencences(sentences)

In [18]:
vocabulary = getVocabulary(sentences)
frequencies = getFrequencies(sentences, vocabulary)

Modelo TF-IDF

In [19]:
def getDocs(qi, sentences):
    total = 0
    for doc in sentences:
        if qi in doc:
            total += 1
    return total

In [20]:
def getIDF(N,q,sentences):
    docs = getDocs(q,sentences)
    return math.log( (N-docs+0.5)/(docs+0.5) )

In [21]:
def getIdfMap(vocabulary, sentences):
    N = len(vocabulary)
    idfMap = dict()
    for target in vocabulary:
        idfMap[target] = getIDF(N,target, sentences)
    return idfMap

In [22]:
def getTfIdf(vocabulary, sentences, frequencies):    
    idfMap = getIdfMap(vocabulary, sentences)
    
    commonWords = list()
    for target in vocabulary:
        freq = frequencies[target]
        commonWords.append((freq*idfMap[target], target))

    return sorted(commonWords, reverse=True)

In [23]:
commonWords = getTfIdf(vocabulary, sentences, frequencies)
for freq,token in commonWords:
    print(str(freq), str(token))

730.5460114146454 ('excelsior', 's')
726.36195383828 ('mexic', 'np0000l')
691.9164864728178 ('millon', 'ncmp000')
507.2602414041703 ('octubr', 'W')
467.5608657456637 ('merc', 'ncms000')
455.5819878001192 ('pes', 'Zm')
444.00550824103186 ('pais', 'ncms000')
432.3270937012682 ('año', 'ncms000')
423.26782293850914 ('cient', 'n')
403.6735134645968 ('gobiern', 'ncms000')
398.05229003977115 ('part', 'ncfs000')
376.5524593041801 ('sol', 'rg')
373.0024056573115 ('dolar', 'Zm')
368.2643934112788 ('juev', 'W')
352.5676282414717 ('www', 's')
352.5676282414717 ('mx', 's')
352.5676282414717 ('http', 's')
352.5676282414717 ('html', 's')
348.8623138214727 ('siguient', 'aq0cs0')
344.24444858398294 ('financ', 'aq0fs0')
341.4102346882717 ('not', 'ncfs000')
339.0930960521014 ('ser', 'vsn0000')
337.5087562006253 ('nacional', 'aq0cs0')
300.6633375698864 ('president', 'ncms000')
297.02279893496956 ('mil', 'dn0cp0')
295.6522819437216 ('años', 'ncmp000')
289.08646291247095 ('agu', 'ncfs000')
265.1712391694438

61.03489369330827 ('podr', 'vmic3p0')
61.03489369330827 ('plen', 'aq0ms0')
61.03489369330827 ('pes', 's')
61.03489369330827 ('perd', 'ncfs000')
61.03489369330827 ('papel', 'ncms000')
61.03489369330827 ('panoram', 'ncms000')
61.03489369330827 ('nuev', 'dn0cp0')
61.03489369330827 ('neces', 'ncfs000')
61.03489369330827 ('mit', 'ncfs000')
61.03489369330827 ('manifest', 'vmis3s0')
61.03489369330827 ('mandatari', 'ncms000')
61.03489369330827 ('juli', 'n')
61.03489369330827 ('juli', 'W')
61.03489369330827 ('inic', 'vmis3s0')
61.03489369330827 ('increment', 'vmn0000')
61.03489369330827 ('import', 'ncfp000')
61.03489369330827 ('grav', 'aq0cs0')
61.03489369330827 ('gobi', 'ncmp000')
61.03489369330827 ('extern', 's')
61.03489369330827 ('expert', 'ncmp000')
61.03489369330827 ('etap', 'ncfs000')
61.03489369330827 ('estatal', 'aq0cs0')
61.03489369330827 ('energ', 'ncfs000')
61.03489369330827 ('dificil', 'aq0cs0')
61.03489369330827 ('dich', 'vmp00sm')
61.03489369330827 ('declar', 'ncfp000')
61.034893

36.64338043801177 ('negoci', 'ncfs000')
36.64338043801177 ('necesari', 'aq0mp0')
36.64338043801177 ('naturalez', 'ncfs000')
36.64338043801177 ('municipi', 'ncmp000')
36.64338043801177 ('multivision', 's')
36.64338043801177 ('muer', 'vmip3s0')
36.64338043801177 ('much', 'di0fp0')
36.64338043801177 ('miguel', 's')
36.64338043801177 ('mejor', 'vmn0000')
36.64338043801177 ('mejor', 'n')
36.64338043801177 ('medi', 'dn0fs0')
36.64338043801177 ('maxim', 'aq0ms0')
36.64338043801177 ('marg', 'nccs000')
36.64338043801177 ('mantien', 'vmip3p0')
36.64338043801177 ('magnitud', 'ncfs000')
36.64338043801177 ('lugar', 'ncmp000')
36.64338043801177 ('lleg', 'vmp00sm')
36.64338043801177 ('lleg', 'vmn0000')
36.64338043801177 ('llam', 'aq0fsp')
36.64338043801177 ('liger', 'rg')
36.64338043801177 ('legisl', 'aq0ms0')
36.64338043801177 ('legal', 'aq0cp0')
36.64338043801177 ('leer', 'vmn0000')
36.64338043801177 ('just', 'rg')
36.64338043801177 ('jimenez', 's')
36.64338043801177 ('interbancari', 'n')
36.643380

23.342699667215836 ('referent', 'ncms000')
23.342699667215836 ('refer', 'vmp00sm')
23.342699667215836 ('reestructur', 's')
23.342699667215836 ('reduj', 'vmis3s0')
23.342699667215836 ('reduccion', 'ncfs000')
23.342699667215836 ('reduc', 'vmp00sm')
23.342699667215836 ('recorr', 's')
23.342699667215836 ('reclus', 'n')
23.342699667215836 ('recient', 'aq0cp0')
23.342699667215836 ('recesion', 'ncfs000')
23.342699667215836 ('rebeld', 's')
23.342699667215836 ('rebas', 'vmip3s0')
23.342699667215836 ('realiz', 'vmip3s0')
23.342699667215836 ('reafirm', 'n')
23.342699667215836 ('reactiv', 'ncfs000')
23.342699667215836 ('razon', 'aq0cs0')
23.342699667215836 ('ratif', 'ncfs000')
23.342699667215836 ('rap', 'aq0ms0')
23.342699667215836 ('ramon', 's')
23.342699667215836 ('ramirez', 's')
23.342699667215836 ('ram', 'ncms000')
23.342699667215836 ('ram', 'n')
23.342699667215836 ('quit', 'vmip3s0')
23.342699667215836 ('quis', 'n')
23.342699667215836 ('quier', 'vmsp3p0')
23.342699667215836 ('quiebr', 's')
23

23.342699667215836 ('bas', 'aq0msp')
23.342699667215836 ('banxic', 'n')
23.342699667215836 ('band', 'ncmp000')
23.342699667215836 ('bancari', 'aq0mp0')
23.342699667215836 ('añad', 'vmn0000')
23.342699667215836 ('avanz', 'aq0fsp')
23.342699667215836 ('automotriz', 's')
23.342699667215836 ('aureli', 'n')
23.342699667215836 ('aument', 'vmn0000')
23.342699667215836 ('aument', 'vmip3s0')
23.342699667215836 ('aument', 's')
23.342699667215836 ('atun', 's')
23.342699667215836 ('atlant', 'n')
23.342699667215836 ('atlant', 'aq0ms0')
23.342699667215836 ('atend', 'vmn0000')
23.342699667215836 ('atend', 'n')
23.342699667215836 ('aspect', 'ncms000')
23.342699667215836 ('asever', 'vmis3s0')
23.342699667215836 ('asent', 's')
23.342699667215836 ('asegur', 'vmn0000')
23.342699667215836 ('asegur', 'vmip3p0')
23.342699667215836 ('asegur', 'ncfs000')
23.342699667215836 ('ascend', 'vmis3s0')
23.342699667215836 ('argentin', 'aq0fs0')
23.342699667215836 ('are', 'ncfp000')
23.342699667215836 ('aquell', 'dd0mp0

16.234982886394356 ('logr', 'vmis3p0')
16.234982886394356 ('logr', 'ncms000')
16.234982886394356 ('logic', 'rg')
16.234982886394356 ('lluvi', 'ncfs000')
16.234982886394356 ('lloyd', 's')
16.234982886394356 ('llev', 'vmis3p0')
16.234982886394356 ('llev', 'vmif3s0')
16.234982886394356 ('llev', 's')
16.234982886394356 ('llen', 'aq0ms0')
16.234982886394356 ('llen', 'aq0fs0')
16.234982886394356 ('lleg', 'vmis3p0')
16.234982886394356 ('lleg', 'vmip3p0')
16.234982886394356 ('llav', 'n')
16.234982886394356 ('llam', 'ncfp000')
16.234982886394356 ('litigi', 'n')
16.234982886394356 ('list', 'n')
16.234982886394356 ('liqu', 'ncmp000')
16.234982886394356 ('limpi', 'aq0fp0')
16.234982886394356 ('limit', 's')
16.234982886394356 ('limit', 'n')
16.234982886394356 ('limit', 'aq0msp')
16.234982886394356 ('lig', 'aq0msp')
16.234982886394356 ('liebenow', 's')
16.234982886394356 ('licenci', 'n')
16.234982886394356 ('lic', 's')
16.234982886394356 ('liberal', 'aq0cs0')
16.234982886394356 ('lf', 's')
16.234982

16.234982886394356 ('blanquiazul', 'aq0cs0')
16.234982886394356 ('bilba', 'n')
16.234982886394356 ('bezanill', 'n')
16.234982886394356 ('bes', 'ncms000')
16.234982886394356 ('benefici', 's')
16.234982886394356 ('benefic', 's')
16.234982886394356 ('bateador', 'ncms000')
16.234982886394356 ('batall', 'ncfs000')
16.234982886394356 ('batall', 'ncfp000')
16.234982886394356 ('bastant', 'rg')
16.234982886394356 ('bast', 'vmis3s0')
16.234982886394356 ('bas', 's')
16.234982886394356 ('barat', 'aq0ms0')
16.234982886394356 ('banort', 's')
16.234982886394356 ('banor', 'n')
16.234982886394356 ('bander', 'ncfs000')
16.234982886394356 ('banc', 's')
16.234982886394356 ('banc', 'n')
16.234982886394356 ('banacci', 's')
16.234982886394356 ('baluart', 's')
16.234982886394356 ('balanz', 'ncfs000')
16.234982886394356 ('bak', 's')
16.234982886394356 ('baj', 'ncfp000')
16.234982886394356 ('añad', 'vmip3s0')
16.234982886394356 ('ayunt', 'ncms000')
16.234982886394356 ('ayunt', 'ncmp000')
16.234982886394356 ('ay

8.628436370232219 ('temor', 'ncms000')
8.628436370232219 ('teler', 's')
8.628436370232219 ('telefon', 'aq0fp0')
8.628436370232219 ('tel', 's')
8.628436370232219 ('tej', 's')
8.628436370232219 ('teilhard', 's')
8.628436370232219 ('tecnolog', 's')
8.628436370232219 ('tecnolog', 'aq0fs0')
8.628436370232219 ('tecnolog', 'aq0fp0')
8.628436370232219 ('teclaz', 'n')
8.628436370232219 ('technologi', 's')
8.628436370232219 ('technical', 's')
8.628436370232219 ('teatral', 's')
8.628436370232219 ('teatr', 'ncms000')
8.628436370232219 ('tatahuicap', 's')
8.628436370232219 ('tarjet', 'ncfp000')
8.628436370232219 ('tapaboc', 'n')
8.628436370232219 ('tanqu', 'ncmp000')
8.628436370232219 ('tangibl', 'aq0cp0')
8.628436370232219 ('tangencial', 's')
8.628436370232219 ('tandr', 'n')
8.628436370232219 ('tambi', 's')
8.628436370232219 ('tall', 'ncfs000')
8.628436370232219 ('talent', 'ncms000')
8.628436370232219 ('talant', 'ncms000')
8.628436370232219 ('tajant', 's')
8.628436370232219 ('tajant', 'aq0cs0')
8.

8.628436370232219 ('peg', 'vmip3s0')
8.628436370232219 ('ped', 'vmn0000')
8.628436370232219 ('ped', 'vmii1s0')
8.628436370232219 ('ped', 'n')
8.628436370232219 ('pec', 's')
8.628436370232219 ('pcus', 's')
8.628436370232219 ('pc', 's')
8.628436370232219 ('paz', 'n')
8.628436370232219 ('pavor', 'ncms000')
8.628436370232219 ('paut', 'n')
8.628436370232219 ('paull', 'n')
8.628436370232219 ('paul', 's')
8.628436370232219 ('patrull', 'vmip3s0')
8.628436370232219 ('patron', 'ncmp000')
8.628436370232219 ('patrocin', 'n')
8.628436370232219 ('patriot', 'ncms000')
8.628436370232219 ('patriot', 'nccp000')
8.628436370232219 ('patrimonioseacional', 's')
8.628436370232219 ('patrimonial', 's')
8.628436370232219 ('patrac', 'n')
8.628436370232219 ('pati', 'n')
8.628436370232219 ('passel', 's')
8.628436370232219 ('pasquin', 's')
8.628436370232219 ('pasiv', 's')
8.628436370232219 ('pasion', 'ncfp000')
8.628436370232219 ('pasill', 'ncmp000')
8.628436370232219 ('pas', 'vmip3p0')
8.628436370232219 ('pas', 'v

8.628436370232219 ('insalv', 's')
8.628436370232219 ('inquisidor', 's')
8.628436370232219 ('inquisicion', 's')
8.628436370232219 ('inquietud', 'ncfs000')
8.628436370232219 ('inquiet', 'n')
8.628436370232219 ('inorgan', 'n')
8.628436370232219 ('inofens', 'n')
8.628436370232219 ('innecesari', 's')
8.628436370232219 ('inncanci', 'n')
8.628436370232219 ('inmunosupresor', 's')
8.628436370232219 ('inmunoestimul', 's')
8.628436370232219 ('inmun', 's')
8.628436370232219 ('inmortal', 'aq0cs0')
8.628436370232219 ('inmoder', 'n')
8.628436370232219 ('inmobiliari', 'aq0mp0')
8.628436370232219 ('inminent', 'aq0cs0')
8.628436370232219 ('injust', 'n')
8.628436370232219 ('injert', 'n')
8.628436370232219 ('inici', 'vmip3s0')
8.628436370232219 ('inici', 'vmif3s0')
8.628436370232219 ('inici', 'ncfp000')
8.628436370232219 ('inic', 'n')
8.628436370232219 ('inhibi', 'n')
8.628436370232219 ('inhib', 'vmn0000')
8.628436370232219 ('inhib', 's')
8.628436370232219 ('ingres', 'vmn0000')
8.628436370232219 ('ingrat'

8.628436370232219 ('estelar', 's')
8.628436370232219 ('estatu', 'n')
8.628436370232219 ('estandar', 's')
8.628436370232219 ('estand', 'aq0cs0')
8.628436370232219 ('estanc', 'n')
8.628436370232219 ('estall', 'vmis3s0')
8.628436370232219 ('estall', 's')
8.628436370232219 ('estall', 'ncms000')
8.628436370232219 ('estall', 'n')
8.628436370232219 ('estadi', 'ncmp000')
8.628436370232219 ('estacional', 's')
8.628436370232219 ('estacion', 'ncfp000')
8.628436370232219 ('establec', 'aq0fpp')
8.628436370232219 ('establ', 'aq0cp0')
8.628436370232219 ('estabiliz', 'ncfs000')
8.628436370232219 ('estabiliz', 'n')
8.628436370232219 ('esquem', 'ncmp000')
8.628436370232219 ('espos', 'ncfp000')
8.628436370232219 ('esplendidez', 's')
8.628436370232219 ('espiritus', 's')
8.628436370232219 ('esper', 'vmsp3s0')
8.628436370232219 ('esper', 'vmis3s0')
8.628436370232219 ('esper', 'vmip3p0')
8.628436370232219 ('esper', 'vmip1s0')
8.628436370232219 ('esper', 'ncfs000')
8.628436370232219 ('espej', 'ncms000')
8.628

8.628436370232219 ('coster', 'n')
8.628436370232219 ('cost', 'vmis3s0')
8.628436370232219 ('cost', 'ncfp000')
8.628436370232219 ('cosmovision', 's')
8.628436370232219 ('cortin', 's')
8.628436370232219 ('cort', 'vmn0000')
8.628436370232219 ('cort', 'ncfp000')
8.628436370232219 ('cort', 'n')
8.628436370232219 ('corruptel', 'n')
8.628436370232219 ('corroid', 'n')
8.628436370232219 ('corrig', 'n')
8.628436370232219 ('corrient', 'ncfs000')
8.628436370232219 ('corresponsal', 's')
8.628436370232219 ('correspondient', 'aq0cp0')
8.628436370232219 ('correspond', 'vmsp3s0')
8.628436370232219 ('correspond', 'vmip3s0')
8.628436370232219 ('correligionari', 'n')
8.628436370232219 ('correg', 's')
8.628436370232219 ('correg', 'n')
8.628436370232219 ('corredur', 'n')
8.628436370232219 ('correct', 'n')
8.628436370232219 ('corr', 'vmis3s0')
8.628436370232219 ('corr', 'vmip3p0')
8.628436370232219 ('corr', 'n')
8.628436370232219 ('corporal', 'aq0cs0')
8.628436370232219 ('corpor', 'aq0fs0')
8.628436370232219

8.628436370232219 ('antigu', 'aq0fp0')
8.628436370232219 ('antievasion', 's')
8.628436370232219 ('antiestalin', 'n')
8.628436370232219 ('anticorrupcion', 'aq0cs0')
8.628436370232219 ('anti', 'nc00000')
8.628436370232219 ('antepondr', 's')
8.628436370232219 ('antepas', 'n')
8.628436370232219 ('antecendent', 's')
8.628436370232219 ('antañ', 'n')
8.628436370232219 ('antal', 's')
8.628436370232219 ('antagon', 'n')
8.628436370232219 ('anonim', 'ncms000')
8.628436370232219 ('anomal', 'n')
8.628436370232219 ('aniquil', 'n')
8.628436370232219 ('anhel', 'n')
8.628436370232219 ('angusti', 'ncfs000')
8.628436370232219 ('angusti', 'n')
8.628436370232219 ('angul', 'n')
8.628436370232219 ('angloamerican', 'n')
8.628436370232219 ('anglican', 'n')
8.628436370232219 ('anex', 's')
8.628436370232219 ('anex', 'n')
8.628436370232219 ('andre', 's')
8.628436370232219 ('andrad', 's')
8.628436370232219 ('andant', 's')
8.628436370232219 ('andad', 'n')
8.628436370232219 ('anarqu', 'ncfs000')
8.628436370232219 ('

Es interesante ver como es que la frecuencia bruta se ve ligeramente cambiada con el modelo TF-IDF