In [1]:
import gensim
import math
import nltk
import numpy
import spacy

from gensim import corpora
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Extraordinario
- **Alumna:** Enya Quetzalli Gómez Rodríguez *(Eduardo Gómez Rodríguez)*
- **Profesora:** Olga Kolesnikova
- **Escuela:** Escuela Superior de Cómputo del IPN
- **Grupo:** 3CV9
- **Semestre:** 2020/2

**Instrucciones:**
   - Obtener un texto normalizado, lematizado, sin stopwords.
   - Obtener 5 tópicos de los textos.

# Normalizar Texto (para 6)

In [2]:
def cleanHtml(txt):
    """Strip the HTML markup from ``txt`` and return the remaining text in lower case.

    The markup is parsed by BeautifulSoup with the ``lxml`` parser; the text
    returned by ``get_text()`` is then lower-cased.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(txt, 'lxml')
    plain_text = soup.get_text()
    return plain_text.lower()

In [3]:
def cleanText(txt):
    return txt.replace('/', ' ').replace('.', ' ').replace('-', ' ').replace('\n', ' ')

In [4]:
def deleteTrash(txt):
    """Drop every character of ``txt`` that is not on the allow-list.

    Kept characters: the lowercase ASCII letters, the space, the newline and
    the lowercase Spanish letters á, é, í, ó, ú, ñ and ü. Everything else
    (digits, punctuation, uppercase letters, ...) is removed, so lower-case
    the text first if uppercase letters should survive.

    Args:
        txt: String to filter.

    Returns:
        A new string holding only the kept characters, in their original order.
    """
    allowed = frozenset("abcdefghijklmnopqrstuvwxyz áéíóúñü\n")
    # join() builds the result in one pass instead of repeated string concatenation.
    return "".join(ch for ch in txt if ch in allowed)

In [5]:
def deleteStopwords(txt, language=None):
    """Remove NLTK stopwords from a sequence of tokens.

    Args:
        txt: Iterable of tokens (the callers in this notebook pass lists of
            strings); each element is compared as a whole against the
            stopword list.
        language: Optional name of an NLTK stopword list (for example
            ``'spanish'``), forwarded to ``stopwords.words``. The default
            ``None`` keeps the original behaviour of using the stopwords of
            every language available in the corpus.

    Returns:
        A list with the tokens that are not stopwords, in their original order.
    """
    from nltk.corpus import stopwords

    # A set gives constant-time membership tests; the raw list would be
    # scanned from start to end for every single token.
    stop_set = set(stopwords.words(language))
    return [token for token in txt if token not in stop_set]

In [6]:
def replaceWithLemmasEnglish(tokens):
    """Return a list with the WordNet lemma of every token, in the same order."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [7]:
def loadDocumentsEnglish(org_txt):
    """Turn a raw HTML dump into one list of lemmatized tokens per article.

    The text is cut at every ``<h3>`` tag and the chunk before the first tag
    is discarded. Each remaining chunk goes through cleanHtml, cleanText and
    deleteTrash, is tokenized with ``nltk.word_tokenize``, filtered with
    deleteStopwords and finally lemmatized with replaceWithLemmasEnglish.
    """
    documents = []
    for article in org_txt.split("<h3>")[1:]:
        plain = cleanHtml(article)
        plain = cleanText(plain)
        plain = deleteTrash(plain)
        tokens = nltk.word_tokenize(plain)
        tokens = deleteStopwords(tokens)
        documents.append(replaceWithLemmasEnglish(tokens))
    return documents

In [8]:
def loadDocumentsSpanish(org_txt):
    """Turn a raw HTML dump into one list of lemmas per article using spaCy.

    The text is cut at every ``<h3>`` tag and the chunk before the first tag
    is discarded. Each remaining chunk is cleaned (cleanHtml, cleanText,
    deleteTrash), has its whitespace collapsed to single spaces, is run
    through the ``es_core_news_sm`` spaCy pipeline to obtain lemmas, and is
    finally filtered with deleteStopwords.
    """
    chunks = org_txt.split("<h3>")[1:]
    nlp = spacy.load("es_core_news_sm")

    def _process(chunk):
        # Clean the markup, then squeeze every whitespace run into one space.
        stripped = deleteTrash(cleanText(cleanHtml(chunk)))
        collapsed = ' '.join(stripped.split())
        # Keep the lemma of each token, skipping whitespace-only lemmas.
        lemmas = [tok.lemma_ for tok in nlp(collapsed) if not tok.lemma_.isspace()]
        return deleteStopwords(lemmas)

    return [_process(chunk) for chunk in chunks]

In [39]:
# Read the whole corpus file. The context manager closes the handle even if
# read() raises, which the manual open()/close() pair did not guarantee.
with open("./files/e971124.htm", "r", encoding="utf8") as f:
    org_txt = f.read()

# One token list per article found in the file.
documents = loadDocumentsEnglish(org_txt)

In [40]:
# Show every processed document followed by an empty line.
for doc in documents:
    print(doc, end="\n\n")

['epigrama', 'jorge', 'mansilla', 'torres', 'desvincula', 'gobierno', 'chiapas', 'grupo', 'paramilitar', 'paz', 'justicia', 'nadie', 'sabe', 'batos', 'santos', 'cristianos', 'inventó', 'pilatos', 'lavado', 'manos', 'editorial', 'nota', 'siguiente', 'http', 'www', 'excelsior', 'mx', 'art', 'html']

['cuentos', 'políticos', 'festejar', 'francisco', 'martin', 'moreno', 'mexicano', 'vemos', 'frente', 'espejo', 'debemos', 'contemplar', 'imagen', 'debe', 'estimular', 'absoluto', 'bien', 'puede', 'ser', 'interese', 'maquillarla', 'desdibujarla', 'decorarla', 'afeites', 'escapar', 'realidad', 'fondo', 'sabemos', 'menos', 'debemos', 'saber', 'hecho', 'haciendo', 'ahora', 'mismo', 'país', 'así', 'llamado', 'cuerno', 'abundancia', 'anterior', 'viene', 'cuento', 'después', 'haber', 'leído', 'crónicas', 'reportajes', 'historias', 'relativas', 'suerte', 'siguió', 'méxico', 'después', 'devastador', 'conflicto', 'armado', 'vimos', 'involucrados', 'principios', 'siglo', 'curso', 'llamar', 'atención', '

# Extraer 5 Tópicos (para 7)

In [41]:
# Create the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(documents=documents)

In [42]:
# Convert the list of documents (corpus) into a document-term matrix
# (one bag-of-words per document) using the dictionary prepared above.
doc_term_matrix = list(map(dictionary.doc2bow, documents))

In [43]:
# Alias for the gensim LDA model class that is instantiated below.
Lda = gensim.models.LdaModel

In [44]:
# Train the LDA model on the document-term matrix: 5 topics, 50 passes.
# LDA training is stochastic; a fixed random_state makes the topics
# reproducible across "Restart & Run All".
ldamodel = Lda(
    doc_term_matrix,
    num_topics=5,
    id2word=dictionary,
    passes=50,
    random_state=42,
)

In [46]:
# Show every topic returned by print_topics(), each followed by an empty line.
for top in ldamodel.print_topics():
    print(top, end="\n\n")

(0, '0.004*"millones" + 0.003*"mil" + 0.003*"país" + 0.003*"méxico" + 0.003*"nacional" + 0.003*"java" + 0.003*"sistema" + 0.002*"sun" + 0.002*"desarrollo" + 0.002*"años"')

(1, '0.005*"méxico" + 0.004*"año" + 0.003*"mx" + 0.003*"empresas" + 0.003*"ciento" + 0.003*"mismo" + 0.003*"ser" + 0.003*"www" + 0.003*"excelsior" + 0.002*"grandes"')

(2, '0.003*"debe" + 0.003*"partido" + 0.003*"nacional" + 0.003*"ser" + 0.003*"iglesias" + 0.003*"gobierno" + 0.002*"chihuahua" + 0.002*"artemio" + 0.002*"política" + 0.002*"año"')

(3, '0.004*"debe" + 0.003*"gobierno" + 0.003*"ser" + 0.003*"clinton" + 0.003*"siguiente" + 0.003*"excelsior" + 0.002*"pública" + 0.002*"nota" + 0.002*"http" + 0.002*"html"')

(4, '0.004*"mercado" + 0.003*"crecimiento" + 0.003*"internet" + 0.003*"redes" + 0.003*"armas" + 0.003*"unidos" + 0.003*"méxico" + 0.003*"puede" + 0.003*"país" + 0.003*"gran"')

