# Práctica 16
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

## Selección de Artículos
**Instrucciones:**
   ...

In [1]:
import nltk
import numpy
import math
from nltk.stem import SnowballStemmer

In [2]:
def cleanHtml(txt):
    """Strip HTML markup from ``txt`` and return the remaining text lowercased.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(txt, 'lxml')
    plain_text = soup.get_text()
    return plain_text.lower()

In [3]:
def splitText(txt):
    """Return ``txt`` with every '/', '.' and '-' replaced by a single space."""
    # One translation pass covers the three separator characters.
    separators = str.maketrans({'/': ' ', '.': ' ', '-': ' '})
    return txt.translate(separators)

In [6]:
def deleteTrash(txt):
    """Return ``txt`` keeping only lowercase letters, spaces and newlines.

    The characters kept are ``a``-``z``, the accented vowels ``áéíóú``,
    ``ñ``, ``ü``, the space and ``'\\n'``; every other character (digits,
    punctuation, uppercase letters, ...) is dropped.

    Args:
        txt: Text to filter (any iterable of single-character strings).

    Returns:
        A new string with the disallowed characters removed.
    """
    allowed = frozenset("abcdefghijklmnopqrstuvwxyz áéíóúñü\n")
    # join() builds the result in one pass instead of repeated `+=`.
    return "".join(c for c in txt if c in allowed)

In [7]:
def splitToSentences(txt, language="spanish"):
    """Split ``txt`` into sentences with NLTK's pre-trained Punkt tokenizer.

    The original code always loaded the English Punkt model, although the
    rest of this pipeline (Spanish Snowball stemmer, ``cess_esp`` tagger
    training data) processes Spanish text. The model is now selected by
    ``language`` and defaults to Spanish.

    Args:
        txt: Text to split.
        language: Name of the Punkt model to load from
            ``tokenizers/punkt/<language>.pickle`` (e.g. ``"spanish"``,
            ``"english"``).

    Returns:
        The list of sentence strings produced by the tokenizer.
    """
    tokenizer = nltk.data.load('nltk:tokenizers/punkt/' + language + '.pickle')
    return tokenizer.tokenize(txt)

In [8]:
def deleteStopwords(txt, language=None):
    """Return the tokens of ``txt`` that are not NLTK stopwords, in order.

    Args:
        txt: Iterable of word tokens.
        language: Optional fileid(s) forwarded to ``stopwords.words`` (for
            example ``'spanish'``). The default ``None`` keeps the original
            behavior of loading the stopword lists of every language
            shipped with the NLTK stopwords corpus.

    Returns:
        A list with the tokens that are not stopwords.
    """
    from nltk.corpus import stopwords

    # NOTE(review): the rest of the pipeline is Spanish-specific, so callers
    # may want language='spanish' here -- confirm before changing the default.
    # A set makes each membership test O(1) instead of scanning a long list.
    stop_set = set(stopwords.words(language))
    return [w for w in txt if w not in stop_set]

In [9]:
def getVocabulary(articles):
    """Return the sorted list of distinct tokens found in ``articles``.

    ``articles`` is iterated as articles -> sentences -> tokens; tokens must
    be hashable and mutually orderable.
    """
    distinct = {
        token
        for sentences in articles
        for sent in sentences
        for token in sent
    }
    return sorted(distinct)

In [10]:
def getFrequencies(sentences, vocabulary):
    """Count how often each vocabulary token occurs in ``sentences``.

    Every token of ``vocabulary`` starts at 0, so the result has an entry
    for tokens that never occur. A token in ``sentences`` that is missing
    from ``vocabulary`` raises ``KeyError``.
    """
    counts = dict.fromkeys(vocabulary, 0)
    for sentence in sentences:
        for entry in sentence:
            counts[entry] += 1
    return counts

In [11]:
def getGlobalFrequencies(articles, vocabulary):
    """Count how often each vocabulary token occurs across all ``articles``.

    ``articles`` is iterated as articles -> sentences -> tokens. Every
    vocabulary token starts at 0; a token missing from ``vocabulary``
    raises ``KeyError``.
    """
    counts = {entry: 0 for entry in vocabulary}
    every_token = (
        token
        for article in articles
        for sentence in article
        for token in sentence
    )
    for token in every_token:
        counts[token] += 1
    return counts

In [12]:
def getContexts(wordList, windowSize=4):
    """Collect, for each word, the words that appear around its occurrences.

    For every position the neighbours are the items up to ``windowSize``
    positions to the left and right (clamped to the list bounds), excluding
    the position itself. Neighbours of repeated words accumulate in the
    same list, in order of occurrence.

    Returns:
        A dict mapping each distinct word to its list of context words.
    """
    contexts = {}
    last_index = len(wordList) - 1
    for position, word in enumerate(wordList):
        lo = max(0, position - windowSize)
        hi = min(position + windowSize, last_index)
        neighbours = contexts.setdefault(word, [])
        neighbours.extend(
            wordList[i] for i in range(lo, hi + 1) if i != position
        )
    return contexts

In [13]:
def loadFromJson(name):
    """Parse and return the JSON document stored at ``./files/<name>``.

    The file is read as UTF-8.
    """
    import json

    path = "./files/" + name
    with open(path, 'r', encoding="utf8") as handle:
        data = json.load(handle)
    return data

In [14]:
def dumpToJson(obj, name):
    """Serialize ``obj`` as JSON (4-space indent) into ``./files/<name>``.

    The file is opened for writing with UTF-8 encoding, replacing any
    previous content.
    """
    import json

    target = "./files/" + name
    with open(target, 'w', encoding="utf8") as handle:
        json.dump(obj, handle, indent=4)

In [15]:
def replaceWithStems(tokens):
    """Return the Spanish Snowball stem of every word in ``tokens``, in order."""
    stemmer = SnowballStemmer("spanish")
    return [stemmer.stem(word) for word in tokens]

In [16]:
def replacWithLemmas(tokens):
    """Replace each word in ``tokens`` by its lemma when one is known.

    The word -> lemma mapping is loaded through ``loadFromJson`` from the
    file ``lemmatized_tokens_generate``; words without an entry are kept
    unchanged.
    """
    lemma_by_word = loadFromJson("lemmatized_tokens_generate")
    return [lemma_by_word.get(word, word) for word in tokens]

In [17]:
def getTagger():
    """Build the part-of-speech tagger used by the pipeline.

    A unigram tagger is trained on the ``cess_esp`` tagged sentences. It
    backs off to a regular-expression tagger that labels words ending in
    ``o``/``os``/``a``/``as`` as ``'n'``, which in turn backs off to a
    default tagger that assigns ``'s'``.
    """
    noun_suffix_patterns = [
        (r'.*o$', 'n'),   # noun, masculine singular
        (r'.*os$', 'n'),  # noun, masculine plural
        (r'.*a$', 'n'),   # noun, feminine singular
        (r'.*as$', 'n'),  # noun, feminine plural
    ]
    fallback = nltk.DefaultTagger('s')
    suffix_tagger = nltk.RegexpTagger(noun_suffix_patterns, fallback)
    training_sents = nltk.corpus.cess_esp.tagged_sents()
    # Second positional argument (a pre-built model) is left unset.
    return nltk.UnigramTagger(training_sents, None, suffix_tagger)

In [18]:
def mixTags(tokens, tokenTags):
    """Pair each token with the tag found at the same index of ``tokenTags``.

    ``tokenTags`` holds ``(word, tag)`` pairs and only the tag (index 1) is
    used. It must be at least as long as ``tokens``, otherwise
    ``IndexError`` is raised; extra entries are ignored.
    """
    # Indexing (rather than zip) keeps the IndexError on a too-short tag list.
    return [(token, tokenTags[i][1]) for i, token in enumerate(tokens)]

In [19]:
def normalizeSencences(sent, tagger):
    """Turn raw sentences into lists of ``(stem, tag)`` pairs.

    For each sentence: separators and disallowed characters are removed,
    the text is word-tokenized, stopwords are dropped, the remaining words
    are tagged with ``tagger`` and stemmed, and each stem is paired with
    the tag of the word it came from.
    """
    normalized = []
    for raw_sentence in sent:
        words = nltk.word_tokenize(deleteTrash(splitText(raw_sentence)))
        content_words = deleteStopwords(words)
        # Tag the unstemmed words; stems are only used as the stored token.
        tags = tagger.tag(content_words)
        stems = replaceWithStems(content_words)
        normalized.append(mixTags(stems, tags))
    return normalized

In [20]:
def loadArticles(org_txt):
    """Split the HTML source into articles and normalize each one.

    The text is cut at every ``<h3>`` and the chunk before the first one is
    discarded. Each remaining chunk is stripped of HTML, split into
    sentences and normalized into lists of ``(stem, tag)`` pairs.
    """
    tagger = getTagger()
    raw_articles = org_txt.split("<h3>")[1:]
    return [
        normalizeSencences(splitToSentences(cleanHtml(raw)), tagger)
        for raw in raw_articles
    ]

In [21]:
# Read the whole HTML source; the context manager closes the file even if
# reading fails.
with open("./files/e961024.htm", "r", encoding="utf8") as f:
    org_txt = f.read()

# Split the source into articles and normalize them into (stem, tag) tokens.
articles = loadArticles(org_txt)

In [22]:
# Dump the normalized corpus (articles -> sentences -> (stem, tag) pairs).
print(articles)

[[[('epigram', 'n'), ('jorg', 's'), ('mansill', 'n'), ('torr', 's'), ('critic', 'vmip3s0'), ('miami', 's'), ('herald', 's'), ('president', 'ncms000'), ('ecuatorian', 'aq0ms0'), ('autoproclam', 's'), ('loc', 'aq0ms0')], [('neoliberal', 'n'), ('aplic', 'vmip3s0'), ('encomi', 'n'), ('hac', 'vmip3s0'), ('mism', 'di0ms0'), ('pais', 'ncms000'), ('manicomi', 'n')], [('editorial', 'ncms000'), ('not', 'ncfs000'), ('siguient', 'aq0cs0'), ('http', 's'), ('www', 's'), ('excelsior', 's'), ('mx', 's'), ('art', 's'), ('html', 's'), ('excelsior', 's'), ('editorial', 'ncms000'), ('juev', 'W'), ('octubr', 'W')]], [[('hungr', 'n'), ('rebelion', 'ncfs000'), ('antiestalin', 'n'), ('oscar', 's'), ('gonzalez', 's'), ('lopez', 's'), ('curs', 'ncms000'), ('octubr', 'W'), ('insurgent', 'ncfs000'), ('popul', 'aq0cs0'), ('comand', 'n'), ('estudi', 'nccp000'), ('intelectual', 'nccp000'), ('obrer', 'ncmp000'), ('partidari', 'ncmp000'), ('establec', 'vmn0000'), ('suel', 'ncms000'), ('hungar', 'ncms000'), ('regim', '

In [20]:
# Sorted list of every distinct (stem, tag) token found across all articles.
vocabulary = getVocabulary(articles)

In [21]:
# Token counts per article (keyed by article index) and for the whole corpus.
frequencies = {}
for i, article_sentences in enumerate(articles):
    frequencies[i] = getFrequencies(article_sentences, vocabulary)
globalFrequencies = getGlobalFrequencies(articles, vocabulary)

### Obteniendo la tabla

In [22]:
def getWordFreq(target, frequency, vocabulary):
    """Return the total frequency of ``target`` regardless of its tag.

    ``target`` is reduced to its Spanish Snowball stem; the counts in
    ``frequency`` of every vocabulary token whose first element equals that
    stem are added up. Returns 0 when no token matches.
    """
    stem = SnowballStemmer("spanish").stem(target)
    matching = {entry for entry in vocabulary if entry[0] == stem}
    return sum(frequency[entry] for entry in matching)

In [23]:
def getTable(topics, articles, frequencies, globalFrequencies, vocabulary):
    """Build the article-by-topic relative frequency table.

    Cell ``[i][j]`` is the frequency of topic ``j`` in article ``i`` divided
    by the frequency of that topic in the whole corpus.

    Args:
        topics: Topic words, looked up through ``getWordFreq``.
        articles: Sequence of articles; only its length is used.
        frequencies: Mapping from article index to that article's token
            counts.
        globalFrequencies: Token counts for the whole corpus.
        vocabulary: Tokens used to match each topic against.

    Returns:
        A list with one row per article and one float per topic. A topic
        that never occurs in the corpus gets ``0.0`` in every row instead
        of raising ``ZeroDivisionError``.
    """
    totals = [
        getWordFreq(topic, globalFrequencies, vocabulary) for topic in topics
    ]

    table = []
    for i in range(len(articles)):
        row = []
        for topic, total in zip(topics, totals):
            inDoc = getWordFreq(topic, frequencies[i], vocabulary)
            # Guard against topics that are absent from the whole corpus.
            row.append(inDoc / total if total else 0.0)
        table.append(row)
    return table

In [24]:
def printTable(topics, articles, table):
    """Write ``table`` as fixed-width text to ``./p16_docs/table.txt``.

    The header line is 5 spaces followed by each topic padded to 21
    characters. Each following line is the article index padded to 5
    characters, then every cell padded to 20 characters plus one space.
    ``articles`` is only used for the number of rows.
    """
    column_count = len(topics)
    with open("./p16_docs/table.txt", "w", encoding="utf8") as out:
        header = "".join(f"{str(topic):21}" for topic in topics)
        out.write(f"{' ':5}" + header + "\n")
        for row_index in range(len(articles)):
            cells = "".join(
                f"{str(table[row_index][col]):20} "
                for col in range(column_count)
            )
            out.write(f"{str(row_index):5}" + cells + "\n")

In [25]:
# Topic words whose per-article share of the corpus-wide frequency is tabulated.
topics = ["gobierno", "política", "méxico", "tecnología", "crisis"]
# One row per article, one column per topic.
table = getTable(topics,articles, frequencies, globalFrequencies, vocabulary)
# Persist the table as fixed-width text in ./p16_docs/table.txt.
printTable(topics, articles, table)

In [26]:
# Read the generated table back and display it; the context manager closes
# the file even if reading fails.
with open("./p16_docs/table.txt", "r", encoding="utf8") as tb:
    tb_txt = tb.read()
print(tb_txt)

     gobierno             política             méxico               tecnología           crisis               
0    0.0                  0.0                  0.0                  0.0                  0.0                  
1    0.044444444444444446 0.027777777777777776 0.0                  0.0                  0.0                  
2    0.0                  0.0                  0.0                  0.0                  0.0                  
3    0.0                  0.0                  0.0                  0.0                  0.0                  
4    0.0                  0.009259259259259259 0.0                  0.0                  0.2                  
5    0.011111111111111112 0.046296296296296294 0.0053475935828877   0.04                 0.0                  
6    0.011111111111111112 0.037037037037037035 0.026737967914438502 0.0                  0.0                  
7    0.06666666666666667  0.06481481481481481  0.0                  0.0                  0.06666666666666667  
8