In [1]:
# Standard library
import re

# Third-party
import gensim
import nltk
import numpy as np  # matrix and vector operations
import pandas as pd  # data handling
import spacy
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back to the vendored copy only
# on old environments where it is the sole one available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NLTK Spanish stop words (the download is skipped when already up to date)
nltk.download('stopwords')
stop_words = stopwords.words('spanish')

# spaCy Spanish pipeline used later for POS tagging and lemmatisation
nlp = spacy.load('es_core_news_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/garzuzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importamos los modelos generados anteriormente

In [2]:
# Load the previously generated topic model from disk.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
lda_model=joblib.load('modelo_entrenado.pkl')

In [3]:
# Load the previously generated vectorizer from disk; it is reused below to encode new text.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
vectorizer = joblib.load('modelo_vectorizer.pkl')

In [12]:
# Show top n keywords for each topic
# Legacy placeholder kept so existing references keep resolving; show_topics never populates it.
listKeywords=None
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (scikit-learn >= 1.0) or the
        legacy ``get_feature_names()``.
    lda_model : fitted topic model
        Must expose ``components_``; each row is read as the term weights of
        one topic.
    n_words : int, default 20
        Number of top terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per row of ``lda_model.components_``, holding the terms
        ordered from highest to lowest weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new API
    # and fall back so the notebook still runs on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.asarray(feature_names)

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes argsort return indices in descending order.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Top 20 keywords of every topic, taken from the loaded vectorizer and model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)

# Topic - Keywords DataFrame: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_keywords = df_topic_keywords.shape
df_topic_keywords.columns = ['Word ' + str(rank) for rank in range(n_keywords)]
df_topic_keywords.index = ['Topic ' + str(topic_id) for topic_id in range(n_topics)]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,familia,compartir,casa,dios,bien,comunidad,amigos,vivir,disfrutar,barrio,empezar,personas,solo,salir,mejor,miedo,ciudad,salud,cuidar,librar
Topic 1,tranquilidad,naturaleza,sentir,representar,problemas,musica,generar,solidaridad,animales,conflicto,personal,conciencia,territorio,nuevo,aire,campo,personas,silencio,agua,encontrar
Topic 2,respeto,vida,no_violencia,personas,tolerancia,interior,no_guerra,entender,existir,mejor,seguridad,creer,querer,persona,demas_personas,siempre,significar,vivir,sociedad,saber
Topic 3,amor,armonia,vivir,mundo,respeto,bueno,union,construir,felicidad,personas,corazon,perdonar,amar,hogar,comprender,conflictos,sana_convivencia,lograr,padres,comprension
Topic 4,respeto,tranquilo,convivencia,libertad,social,educacion,igualdad,pensar,entorno,vivir,comunidad,diferencias,valores,sociedad,oportunidades,importar,convivir,construir,oportunidad,contar


In [31]:
# Map each topic index to the `components_` weights of its 20 highest-weighted terms.
# NOTE: despite the name, these are the raw weights stored in the model (the printed
# values are not normalised to sum to 1 or 100).
dictPercentageTopic = {}
for topic_id, weights in enumerate(lda_model.components_):
    # Negating the weights makes argsort return indices in descending order.
    top_positions = (-weights).argsort()[:20]
    dictPercentageTopic[topic_id] = [weights[position] for position in top_positions]
print(dictPercentageTopic)

{0: [461.81088176525583, 165.5753567191865, 143.8364704224571, 136.20966458986905, 104.42437997845856, 90.04598482437581, 85.26235899746699, 79.66186477826284, 76.85211653846984, 62.500142293398135, 55.13918423980476, 54.863704238480516, 53.903704811865154, 45.99585253000119, 43.212546047651216, 40.32816233623312, 40.00627203224864, 37.78967046872543, 37.468401522791076, 37.24640748911386], 1: [732.1302512709294, 153.35964062172954, 85.83112883720949, 63.43301741242175, 55.702275769940805, 55.59358868062559, 49.9463284454919, 38.61353867542728, 33.73669029769277, 33.638770010641416, 33.51469720172925, 30.5858260628885, 28.914575322432018, 28.64054263678656, 26.45939138449189, 25.52813334476734, 25.237296785812735, 23.763294901722375, 23.556976567675612, 23.471540887776996], 2: [169.26679222678888, 163.2005829489928, 142.3505213919794, 114.8496763265771, 103.91916070677229, 79.0500880501144, 62.984046635920265, 60.27655360247092, 58.11532344843668, 57.22979858948762, 57.071660207788206,

## Procesamiento de un nuevo texto

Evaluaremos un nuevo texto, pero antes tenemos que aplicarle la misma limpieza que se usó para preparar los datos del modelo.

In [5]:
# Sample sentence to classify; it is cleaned and tokenised by the following cells.
text = (
    "Para mi la paz es tener tranquilidad en mi hogar "
    "con mi familia, mis amigas y mi perro"
)

In [6]:
# Remove commas, periods, quotes, exclamation/question marks, parentheses and
# digits, then lowercase and trim surrounding whitespace.
# The pattern is a raw string: the previous non-raw literal contained the invalid
# escape sequences `\.` and `\)`, which trigger a DeprecationWarning/SyntaxWarning
# on recent Python versions. The character class matches exactly the same characters.
text = re.sub(r"[,.'\"!()?0-9]", '', text).lower().strip()

In [7]:
def sent_to_words(sentences):
    """Lazily tokenise each sentence with ``gensim.utils.simple_preprocess``.

    Every item of ``sentences`` is converted with ``str`` and tokenised with
    ``deacc=True``. Per the gensim documentation, ``deacc=True`` strips accent
    marks from the tokens; punctuation is discarded by the tokenizer itself.

    Yields
    ------
    list of str
        The tokens of one sentence, in input order.
    """
    for sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(sentence), deacc=True)
        yield tokens

# Wrap the single cleaned sentence in a list so it is handled like a corpus
data = [text]
data_tokenized = [tokens for tokens in sent_to_words(data)]

In [8]:
# Stop words that are kept even though they appear in `stop_words`
# (presumably so negations such as "no_violencia" can still be formed -- confirm).
stop_words_exceptions=["no"]
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is passed through ``simple_preprocess(str(doc))`` again before
    filtering. A word is kept when it is listed in ``stop_words_exceptions`` or
    is absent from ``stop_words``.

    Returns
    -------
    list of list of str
        One filtered token list per input document.
    """
    def _is_kept(word):
        return word in stop_words_exceptions or word not in stop_words

    filtered_docs = []
    for doc in texts:
        filtered_docs.append([word for word in simple_preprocess(str(doc)) if _is_kept(word)])
    return filtered_docs


data_tokenized_nostops= remove_stopwords(data_tokenized)

In [9]:
# Build the bigram and trigram models
# NOTE(review): both Phrases models are fitted on `data_tokenized`, which here holds
# only the single new sentence. With min_count=7 it is unlikely that any phrase is
# detected (the output of the next cell shows no joined tokens), so vocabulary terms
# such as "no_violencia" or "sana_convivencia" can presumably never be produced for
# new text. Consider persisting and loading the Phraser fitted at training time -- confirm.
bigram = gensim.models.Phrases(data_tokenized, min_count=7, threshold=5) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_tokenized], threshold=5)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
# NOTE(review): trigram_mod is not used by any cell visible in this notebook.
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    """Apply ``bigram_mod`` to every document in ``texts`` and return the results as a list."""
    return [bigram_mod[doc] for doc in texts]

# The bigram model is applied to the stop-word-filtered tokens.
data_bigrams_nonstops=make_bigrams(data_tokenized_nostops)

In [10]:
data_bigrams_nonstops

[['paz', 'tener', 'tranquilidad', 'hogar', 'familia', 'amigas', 'perro']]

In [11]:
# Alias kept for backward compatibility with code that reads `data`.
data = data_bigrams_nonstops

# Tokens shorter than this many characters are dropped (same rule as `len(word) > 3`).
MIN_WORD_LENGTH = 4

# Re-join the surviving tokens of every document into one space-separated string.
# Built with a comprehension so the notebook-level `text` variable is no longer
# overwritten (re-running earlier cells after this one used to pick up the
# half-processed string) and no quadratic string concatenation is needed.
data_prepared = [
    " ".join(word for word in row if len(word) >= MIN_WORD_LENGTH)
    for row in data
]

print(data_prepared)

['tener tranquilidad hogar familia amigas perro']


In [12]:
# POS tags whose tokens are replaced by their lemma
allowed_postags=['ADJ', 'VERB', 'ADV']
# Extra tokens/lemmas that are always discarded
stopwordsToken=["demas","tener","mismo", "poder","cada","tambien", "hacer"]
# Terms that are rewritten to a single canonical vocabulary entry
unifiedWord={"respetar":"respeto", "violencia":"no_violencia","musicar":"musica","guerra":"no_guerra"}


def _normalize_document(document):
    """Return the space-joined normalised terms of one prepared document.

    For every spaCy token of ``document``:

    * it is skipped when its lemma or its text is listed in ``stopwordsToken``;
    * otherwise, if its POS tag is in ``allowed_postags`` the lemma is kept;
    * otherwise, if spaCy does not flag it as a stop word the surface text is kept;
    * any other token is skipped.

    Kept terms are mapped through ``unifiedWord`` when they have an entry there.
    """
    kept_terms = []
    for token in nlp(document):
        if token.lemma_ in stopwordsToken or token.text in stopwordsToken:
            continue
        if token.pos_ in allowed_postags:
            term = token.lemma_
        elif not token.is_stop:
            term = token.text
        else:
            continue
        kept_terms.append(unifiedWord.get(term, term))
    return " ".join(kept_terms)


# Process every prepared document instead of only `data_prepared[0]`. With a
# single input the result is identical, and an empty input now yields an empty
# list instead of raising IndexError. The notebook-level `text` is not rebound.
data_list = [_normalize_document(document) for document in data_prepared]

In [13]:
data_list

['tranquilidad hogar familia amigas perro']

Ahora transformamos el texto procesado en una matriz dispersa (sparse matrix) de conteos usando el vectorizador ya entrenado.

In [14]:
# Encode the cleaned text with the already-fitted vectorizer (transform only, no refit).
# The printed result lists 3 non-zero entries for the 5 input words, so presumably
# words outside the fitted vocabulary are ignored -- confirm.
text_processed=vectorizer.transform(data_list)
print(text_processed)

  (0, 181)	1
  (0, 214)	1
  (0, 426)	1


In [15]:
# Score the encoded text against every topic; the printed result below has one row
# for the single document and one column per topic.
topic_probability_scores = lda_model.transform(text_processed)

Estas son las probabilidades de que el texto quede dentro de cada tópico. 

In [16]:
print(topic_probability_scores)

[[0.54585021 0.30004432 0.05073834 0.05212619 0.05124093]]


Escogemos el de mayor probabilidad, lo que nos da el tópico 0 (probabilidad ≈ 0.55), seguido del tópico 1 (≈ 0.30).

In [17]:
# Topic indices of the first (and only) document, ordered from lowest to highest score
first_document_scores = topic_probability_scores[0]
listAct = np.argsort(first_document_scores).tolist()



In [18]:
# `listAct` is sorted by ascending score, so the two best topics sit at the end.
# Index instead of pop(): popping consumed `listAct`, so re-running this cell
# silently returned the third and fourth topics instead.
firstTopic = listAct[-1]
secondTopic = listAct[-2]

In [19]:
# Keyword lists of the primary and secondary topics, read from the topic/keyword table
topic1, topic2 = (
    df_topic_keywords.iloc[topic_index, :].values.tolist()
    for topic_index in (firstTopic, secondTopic)
)

In [20]:
print(firstTopic)
print(secondTopic)

0
1


El texto entraría en el tópico 0 como primario y en el tópico 1 como secundario, los cuales tienen asociadas las siguientes palabras:

In [21]:
print("El tópico primario es topic",firstTopic,":", ', '.join(topic1))

El tópico primario es topic 0 : familia, compartir, casa, dios, bien, comunidad, amigos, vivir, disfrutar, barrio, empezar, personas, solo, salir, mejor, miedo, ciudad, salud, cuidar, librar


In [22]:
print("El tópico secundario es topic",secondTopic,":", ', '.join(topic2))

El tópico secundario es topic 1 : tranquilidad, naturaleza, sentir, representar, problemas, musica, generar, solidaridad, animales, conflicto, personal, conciencia, territorio, nuevo, aire, campo, personas, silencio, agua, encontrar


In [31]:
# Print every topic with its keywords. The topic count comes from the table
# itself instead of a hard-coded 5, so a model with a different number of
# topics is handled too.
for i in range(len(df_topic_keywords)):
    print(i, ", ".join(df_topic_keywords.iloc[i, :].values.tolist()))

0 familia, compartir, casa, dios, bien, comunidad, amigos, vivir, disfrutar, barrio, empezar, personas, solo, salir, mejor, miedo, ciudad, salud, cuidar, librar
1 tranquilidad, naturaleza, sentir, representar, problemas, musica, generar, solidaridad, animales, conflicto, personal, conciencia, territorio, nuevo, aire, campo, personas, silencio, agua, encontrar
2 respeto, vida, no_violencia, personas, tolerancia, interior, no_guerra, entender, existir, mejor, seguridad, creer, querer, persona, demas_personas, siempre, significar, vivir, sociedad, saber
3 amor, armonia, vivir, mundo, respeto, bueno, union, construir, felicidad, personas, corazon, perdonar, amar, hogar, comprender, conflictos, sana_convivencia, lograr, padres, comprension
4 respeto, tranquilo, convivencia, libertad, social, educacion, igualdad, pensar, entorno, vivir, comunidad, diferencias, valores, sociedad, oportunidades, importar, convivir, construir, oportunidad, contar
