In [51]:
import pandas as pd
import numpy as np

In [46]:
%%writefile documents.txt
The sky is blue and beautiful.
Love this blue and beautiful sky!
The quick brown fox jumps over the lazy dog.
A king's breakfast has sausages, ham, bacon, eggs, toast and beans
I love green eggs, ham, sausages and bacon!
The brown fox is quick and the blue dog is lazy!
The sky is very blue and the sky is very beautiful today
The dog is lazy but the brown fox is quick!

Writing documents.txt


In [47]:
# Load the raw corpus: one document per line of documents.txt, with the
# newline characters removed.
with open('documents.txt', 'r') as doc_file:
    documents = [raw_line.replace('\n', '') for raw_line in doc_file]

documents

['The sky is blue and beautiful.',
 'Love this blue and beautiful sky!',
 'The quick brown fox jumps over the lazy dog.',
 "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
 'I love green eggs, ham, sausages and bacon!',
 'The brown fox is quick and the blue dog is lazy!',
 'The sky is very blue and the sky is very beautiful today',
 'The dog is lazy but the brown fox is quick!']

In [48]:
%%writefile labels.txt
weather
weather
animals
food
food
animals
weather
animals

Writing labels.txt


In [49]:
# Load the topic labels: one label per line of labels.txt, with the newline
# characters removed.
with open('labels.txt', 'r') as label_file:
    labels = [raw_line.replace('\n', '') for raw_line in label_file]

labels

['weather',
 'weather',
 'animals',
 'food',
 'food',
 'animals',
 'weather',
 'animals']

In [52]:
# Put each document next to its label in a single two-column frame.
corpus = pd.DataFrame(dict(Document=documents, Labels=labels))
corpus

Unnamed: 0,Document,Labels
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,"A king's breakfast has sausages, ham, bacon, e...",food
4,"I love green eggs, ham, sausages and bacon!",food
5,The brown fox is quick and the blue dog is lazy!,animals
6,The sky is very blue and the sky is very beaut...,weather
7,The dog is lazy but the brown fox is quick!,animals


In [2]:
## TF-IDF
## Technique that looks not only at how many times a word is present in a
## text, but also takes into account its proportion with respect to all the
## texts held in a DataFrame.
##
## Note that this technique is based on Bag of Words (BoW).
##
import sklearn
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): this loads the large Spanish spaCy model, yet the documents in
# this notebook section are English and `nlp` is not referenced anywhere in the
# visible cells -- confirm it is still needed.
nlp = spacy.load('es_core_news_lg')

In [53]:
## Define a function to normalize the texts.
##
import nltk
import re

# Tokenizer instance shared by every call to normalize_document.
tokenizer = nltk.WordPunctTokenizer()

# English stop words; normalize_document drops every token found in this
# collection.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to have been
# downloaded beforehand -- verify on a fresh environment.
STOPWORDS = nltk.corpus.stopwords.words('english')

def normalize_document(document):
    """Reduce a raw document to its lower-cased, stop-word-free tokens.

    Every character that is not an ASCII letter or whitespace is removed, the
    text is lower-cased and stripped, split with the module-level
    ``tokenizer``, and tokens present in ``STOPWORDS`` are discarded.

    Args:
        document: Raw text of a single document.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing `re.I|re.A` (== 258) positionally applied
    # no flags at all and silently stopped after 258 substitutions.
    # re.I -> ignore case, re.A -> ASCII-only matching.
    document = re.sub(r'[^a-zA-Z\s]', '', document, flags=re.I|re.A)
    document = document.lower().strip()
    tokens = tokenizer.tokenize(document)
    tokens = [token for token in tokens if token not in STOPWORDS]
    return ' '.join(tokens)

In [56]:
# Quick check of the character filter on one sample sentence. The flags are
# passed by keyword because the fourth positional parameter of re.sub is
# `count`, not `flags`.
re.sub(r'[^a-zA-Z\s]', '', 'Love this blue and beautiful sky!', flags=re.I|re.A)

'Love this blue and beautiful sky'

In [54]:
corpus['Normalized_Documents'] = corpus['Document'].map(normalize_document)
corpus

Unnamed: 0,Document,Labels,Normalized_Documents
0,The sky is blue and beautiful.,weather,sky blue beautiful
1,Love this blue and beautiful sky!,weather,love blue beautiful sky
2,The quick brown fox jumps over the lazy dog.,animals,quick brown fox jumps lazy dog
3,"A king's breakfast has sausages, ham, bacon, e...",food,kings breakfast sausages ham bacon eggs toast ...
4,"I love green eggs, ham, sausages and bacon!",food,love green eggs ham sausages bacon
5,The brown fox is quick and the blue dog is lazy!,animals,brown fox quick blue dog lazy
6,The sky is very blue and the sky is very beaut...,weather,sky blue sky beautiful today
7,The dog is lazy but the brown fox is quick!,animals,dog lazy brown fox quick


In [58]:
## Import the TF-IDF transformer.
##
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0.0, max_df=1.0, norm='l2', use_idf=True, smooth_idf = True)
tv_matrix = tv.fit_transform(corpus['Normalized_Documents'].tolist())
# Convert to a dense array so the weights can be rounded and shown as a table.
tv_matrix = tv_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement,
# get_feature_names_out(), and fall back only on releases that predate it.
if hasattr(tv, 'get_feature_names_out'):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns = vocab)

Unnamed: 0,bacon,beans,beautiful,blue,breakfast,brown,dog,eggs,fox,green,ham,jumps,kings,lazy,love,quick,sausages,sky,toast,today
0,0.0,0.0,0.6,0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0
1,0.0,0.0,0.49,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57,0.0,0.0,0.49,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.38,0.38,0.0,0.38,0.0,0.0,0.53,0.0,0.38,0.0,0.38,0.0,0.0,0.0,0.0
3,0.32,0.38,0.0,0.0,0.38,0.0,0.0,0.32,0.0,0.0,0.32,0.0,0.38,0.0,0.0,0.0,0.32,0.0,0.38,0.0
4,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.39,0.0,0.47,0.39,0.0,0.0,0.0,0.39,0.0,0.39,0.0,0.0,0.0
5,0.0,0.0,0.0,0.37,0.0,0.42,0.42,0.0,0.42,0.0,0.0,0.0,0.0,0.42,0.0,0.42,0.0,0.0,0.0,0.0
6,0.0,0.0,0.36,0.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.0,0.5
7,0.0,0.0,0.0,0.0,0.0,0.45,0.45,0.0,0.45,0.0,0.0,0.0,0.0,0.45,0.0,0.45,0.0,0.0,0.0,0.0


In [62]:
## Non-negative Matrix Factorization (NMF).
##
from sklearn.decomposition import NMF

# Rebuild the TF-IDF matrix; here the output of fit_transform is handed to
# NMF as returned, without converting it to a dense array first.
tv = TfidfVectorizer(
    min_df=0.0,
    max_df=1.0,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
)
tv_matrix = tv.fit_transform(corpus['Normalized_Documents'].tolist())

# Three components, with a fixed random_state so reruns give the same result.
nmf_model = NMF(n_components=3, max_iter=1000, random_state=0).fit(tv_matrix)
nmf_model

NMF(max_iter=1000, n_components=3, random_state=0)

In [64]:
## Show the words with the largest coefficients for each topic.
N_TOP_WORDS = 3  # how many words to list per component

# Resolve the vocabulary once instead of once per topic.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that predate it.
# Entries are cast to plain str so the printed list looks the same either way.
if hasattr(tv, 'get_feature_names_out'):
    feature_names = [str(name) for name in tv.get_feature_names_out()]
else:
    feature_names = tv.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    # The header now reports the number of words actually listed (it used to
    # say "ten" while only three were shown).
    print(f"The top {N_TOP_WORDS} words for the component # {index}")
    # argsort() is ascending, so the strongest word is printed last.
    print([feature_names[i] for i in topic.argsort()[-N_TOP_WORDS:]])
    print('\n')

The top ten words for the component # 0
['brown', 'lazy', 'quick']


The top ten words for the component # 1
['blue', 'beautiful', 'sky']


The top ten words for the component # 2
['eggs', 'ham', 'bacon']


