<a href="https://colab.research.google.com/github/gu1lleom/Colab-Clases/blob/main/DM_Clase_07_Text_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Mining

## Lectura de datos

In [None]:
import pandas as pd

In [None]:
# Load the reviews CSV; the preview below shows the rating columns and the
# free-text "comentario" column that the rest of the notebook works on.
df = pd.read_csv('/content/comentarios_parrilas.csv')
# Replace missing comments with "" before casting: astype(str) on its own turns
# NaN into the literal text "nan", which would later be tokenized and counted
# as if it were a real word.
df["comentario"] = df["comentario"].fillna("").astype(str)
df.head()

Unnamed: 0,comida,servicio,ambiente,comentario
0,0.0,1.0,1.0,"no fue todo malo, pedi 1 chori 1 morci bien ..."
1,3.0,3.0,3.0,"Buenas tardes, alguien podría decirme si en la..."
2,3.0,3.0,3.0,"Hiper recomendable. La atención es lenta, pero..."
3,3.0,3.0,2.0,Fuimos un viernes por la noche. Repleto de gen...
4,3.0,2.0,2.0,"Cómo siempre el Mejor Lechón, se Cortaba con s..."


## Minúsculas

In [None]:
# Case-fold every review so that differently capitalised spellings of a word
# end up as the same token in the later steps.
lowercased_comments = df['comentario'].str.lower()
df['comentario'] = lowercased_comments
df.head()

Unnamed: 0,comida,servicio,ambiente,comentario
0,0.0,1.0,1.0,"no fue todo malo, pedi 1 chori 1 morci bien ..."
1,3.0,3.0,3.0,"buenas tardes, alguien podría decirme si en la..."
2,3.0,3.0,3.0,"hiper recomendable. la atención es lenta, pero..."
3,3.0,3.0,2.0,fuimos un viernes por la noche. repleto de gen...
4,3.0,2.0,2.0,"cómo siempre el mejor lechón, se cortaba con s..."


## Tokenizador

In [None]:
import nltk
# Fetch the NLTK data packages this notebook relies on. The calls are safe to
# repeat: the captured output reports each package as already up to date.
nltk.download('punkt')      # presumably for word_tokenize, which is only referenced in commented-out code
nltk.download('stopwords')  # stopword lists; the Spanish one is used in the "Stopwords" section
nltk.download('wordnet')    # presumably the data behind WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Turn each review (a string) into a list of tokens.
# The pattern \w+ matches runs of word characters, so punctuation and
# whitespace act as separators and are dropped. (nltk's word_tokenize is the
# alternative that was tried here before.)
#
# The pattern is a raw string: in a normal string literal '\w' is an invalid
# escape sequence, which newer Python versions warn about.
regexp = RegexpTokenizer(r'\w+')
df["comentario"] = df["comentario"].apply(regexp.tokenize)
df["comentario"]

0        [no, fue, todo, malo, pedi, 1, chori, 1, morci...
1        [buenas, tardes, alguien, podría, decirme, si,...
2        [hiper, recomendable, la, atención, es, lenta,...
3        [fuimos, un, viernes, por, la, noche, repleto,...
4        [cómo, siempre, el, mejor, lechón, se, cortaba...
                               ...                        
21539    [el, lugar, no, es, lindo, se, quedo, con, la,...
21540    [la, mejor, parrilla, kosher, es, muy, caro, o...
21541    [espetacular, 10, puntos, uno, de, lo, mejor, ...
21542    [parrillada, bien, servida, carne, en, su, pun...
21543    [atendido, por, los, dueños, mucho, gusto, en,...
Name: comentario, Length: 21544, dtype: object

## Normalización

In [None]:
# Colloquial abbreviations found in the reviews, mapped to their full form.
m = {'morci': 'morcilla', 'chori': 'chorizo'}


def expand_abbreviations(tokens):
    """Return a new list where every token that is a key of ``m`` is replaced
    by its mapped value; all other tokens are kept unchanged and in order.
    """
    expanded = []
    for token in tokens:
        expanded.append(m.get(token, token))
    return expanded


df['comentario'] = df['comentario'].apply(expand_abbreviations)
df['comentario']

0        [no, fue, todo, malo, pedi, 1, chorizo, 1, mor...
1        [buenas, tardes, alguien, podría, decirme, si,...
2        [hiper, recomendable, la, atención, es, lenta,...
3        [fuimos, un, viernes, por, la, noche, repleto,...
4        [cómo, siempre, el, mejor, lechón, se, cortaba...
                               ...                        
21539    [el, lugar, no, es, lindo, se, quedo, con, la,...
21540    [la, mejor, parrilla, kosher, es, muy, caro, o...
21541    [espetacular, 10, puntos, uno, de, lo, mejor, ...
21542    [parrillada, bien, servida, carne, en, su, pun...
21543    [atendido, por, los, dueños, mucho, gusto, en,...
Name: comentario, Length: 21544, dtype: object

## Stopwords

In [None]:
# NLTK's Spanish stopword list, extended with a few extra words to drop.
# NOTE: this rebinds the name `stopwords` (imported earlier from nltk.corpus)
# to a plain list. The binding is kept as-is so anything relying on it still
# works; the lookup goes through the fully qualified nltk.corpus.stopwords,
# so re-running this cell is still safe.
stopwords = nltk.corpus.stopwords.words("spanish")
stopwords.extend(['es', 'osea', 'sólo'])

# Testing membership against a list scans it for every single token of every
# review. A frozenset gives constant-time lookups and removes exactly the same
# tokens.
stopword_set = frozenset(stopwords)

df['comentario'] = df['comentario'].apply(
    lambda texto: [token for token in texto if token not in stopword_set]
)
df['comentario']

0        [malo, pedi, 1, chorizo, 1, morcilla, bien, pa...
1        [buenas, tardes, alguien, podría, decirme, si,...
2        [hiper, recomendable, atención, lenta, cuidada...
3        [viernes, noche, repleto, gente, excelente, at...
4        [cómo, siempre, mejor, lechón, cortaba, verlo,...
                               ...                        
21539    [lugar, lindo, quedo, decoracion, comida, senc...
21540    [mejor, parrilla, kosher, caro, carne, casher,...
21541    [espetacular, 10, puntos, mejor, kosher, mundo...
21542    [parrillada, bien, servida, carne, punto, tier...
21543                   [atendido, dueños, gusto, comidas]
Name: comentario, Length: 21544, dtype: object

## Stemming

In [None]:
stemmer = SnowballStemmer('spanish')


def stem_tokens(tokens):
    """Return the list of Snowball stems of ``tokens``, in the same order."""
    return list(map(stemmer.stem, tokens))


# Shown for illustration only: the stemmed result is displayed but not stored
# back into df, so the column keeps the unstemmed tokens.
df['comentario'].apply(stem_tokens)

0        [mal, pedi, 1, choriz, 1, morcill, bien, pap, ...
1        [buen, tard, algui, podr, dec, si, parrill, ta...
2        [hip, recomend, atencion, lent, cuid, atent, m...
3        [viern, noch, replet, gent, excelent, atencion...
4        [com, siempr, mejor, lechon, cort, verl, com, ...
                               ...                        
21539    [lug, lind, qued, decoracion, com, sencill, bu...
21540    [mejor, parrill, kosh, car, carn, cash, mas, c...
21541    [espetacul, 10, punt, mejor, kosh, mund, franc...
21542    [parrill, bien, serv, carn, punt, tiern, ensal...
21543                             [atend, dueñ, gust, com]
Name: comentario, Length: 21544, dtype: object

## Lematizador (en realidad no: NLTK no tiene lematizador para español)

In [None]:
# no hay lematizador en español en nltk. ver spacy
# NLTK has no Spanish lemmatizer (see spaCy for that), so this demo runs the
# WordNet lemmatizer on English words instead.
lemmatizer = WordNetLemmatizer()

# lemmatize() is called with the token only; in the recorded output the plural
# nouns are reduced ("kites" -> "kite") while the verb forms come back as-is.
english_words = ['kites', 'babies', 'dogs', 'flying', 'smiling', 'driving', 'drived', 'died', 'tried', 'feet']
for token in english_words:
    print(token, lemmatizer.lemmatize(token), sep=" ---> ")

kites ---> kite
babies ---> baby
dogs ---> dog
flying ---> flying
smiling ---> smiling
driving ---> driving
drived ---> drived
died ---> died
tried ---> tried
feet ---> foot


In [None]:
# Display the column again: it still holds the unstemmed tokens, because the
# stemming cell only displayed its result and never assigned it back to df.
df["comentario"]

0        [malo, pedi, 1, chorizo, 1, morcilla, bien, pa...
1        [buenas, tardes, alguien, podría, decirme, si,...
2        [hiper, recomendable, atención, lenta, cuidada...
3        [viernes, noche, repleto, gente, excelente, at...
4        [cómo, siempre, mejor, lechón, cortaba, verlo,...
                               ...                        
21539    [lugar, lindo, quedo, decoracion, comida, senc...
21540    [mejor, parrilla, kosher, caro, carne, casher,...
21541    [espetacular, 10, puntos, mejor, kosher, mundo...
21542    [parrillada, bien, servida, carne, punto, tier...
21543                   [atendido, dueños, gusto, comidas]
Name: comentario, Length: 21544, dtype: object

## Preparación para clasificación

In [None]:
# Collapse each token list back into one space-separated string, which is the
# input format passed to the vectorizers below.
joined_comments = df['comentario'].str.join(sep=' ')
df['comentario'] = joined_comments
df['comentario']

0        malo pedi 1 chorizo 1 morcilla bien papas frit...
1        buenas tardes alguien podría decirme si parril...
2        hiper recomendable atención lenta cuidada aten...
3        viernes noche repleto gente excelente atención...
4        cómo siempre mejor lechón cortaba verlo cómo f...
                               ...                        
21539    lugar lindo quedo decoracion comida sencilla b...
21540    mejor parrilla kosher caro carne casher mas ca...
21541    espetacular 10 puntos mejor kosher mundo franc...
21542    parrillada bien servida carne punto tierna ens...
21543                        atendido dueños gusto comidas
Name: comentario, Length: 21544, dtype: object

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings, spelled out one per line so they are easy to tune.
count_vectorizer_options = dict(
    analyzer='word',
    ngram_range=(1, 3),      # n-grams of 1 to 3 words
    min_df=0.005,            # float thresholds: presumably document-frequency
    max_df=0.7,              # proportions -- confirm against the scikit-learn docs
    strip_accents='ascii',   # the vocabulary shown below has no accents ("atencion")
    max_features=2000,       # upper bound on the vocabulary size
)
vectorizer = CountVectorizer(**count_vectorizer_options)
vectorized_data = vectorizer.fit_transform(df['comentario'])

# Dense view of the fitted matrix, for inspection only.
vectorized_data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# List the vocabulary kept by the fitted vectorizer: single words as well as
# multi-word n-grams such as "aire acondicionado".
vectorizer.get_feature_names_out()

array(['10', '100', '12', '15', '20', '21', '25', '30', '40', '45', '50',
       'abundante', 'abundantes', 'aca', 'accesible', 'accesibles',
       'aceite', 'aceptable', 'aceptan', 'achuras', 'acondicionado',
       'acorde', 'acordes', 'ademas', 'adentro', 'afuera', 'agradable',
       'agua', 'aguas', 'ahi', 'ahora', 'aire', 'aire acondicionado',
       'aire libre', 'aires', 'algun', 'alguna', 'alla', 'alli',
       'almorzar', 'almuerzo', 'altamente', 'altamente recomendable',
       'alto', 'altos', 'amable', 'amables', 'ambientacion', 'ambientado',
       'ambiente', 'ambiente agradable', 'ambiente bueno',
       'ambiente calido', 'ambiente familiar', 'ambiente lindo',
       'ambiente ruidoso', 'ambos', 'amigas', 'amigo', 'amigos',
       'amigos familia', 'amplia', 'amplio', 'ano', 'anoche', 'anos',
       'aparte', 'apenas', 'argentina', 'arriba', 'as', 'asado',
       'asado tira', 'asador', 'asi', 'atencion', 'atencion amable',
       'atencion ambiente', 'atencion buena'

In [None]:
# Wrap the count matrix in a DataFrame: one row per review, one column per
# vocabulary term, labelled with the term itself.
dense_counts = vectorized_data.toarray()
feature_names = vectorizer.get_feature_names_out()
count_df = pd.DataFrame(dense_counts, columns=feature_names)
count_df

Unnamed: 0,10,100,12,15,20,21,25,30,40,45,...,volver,volvere,volveremos,volveria,volvi,voy,voy volver,vuelva,vuelvo,zona
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21539,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21540,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21541,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21542,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# count_df has one column per vocabulary term (up to max_features=2000), and
# DataFrame.hist() draws one subplot per column. The unrestricted call never
# finished and had to be interrupted, so only the most frequent terms are
# plotted here.
TOP_N_TERMS = 12  # number of terms to plot; raise with care, each adds a subplot
top_terms = count_df.sum().nlargest(TOP_N_TERMS).index

# The trailing ';' suppresses the array-of-Axes repr so only the figure shows.
count_df[top_terms].hist(figsize=(12, 8));

KeyboardInterrupt: ignored

Error in callback <function flush_figures at 0x7f7793780950> (for post_execute):


KeyboardInterrupt: ignored

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting of the same corpus. Note that this rebinds `vectorizer` and
# `vectorized_data`, replacing the count-based objects created earlier.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0.001,
    max_df=0.9,
    strip_accents='ascii',
    max_features=2000,
)
tfidf_corpus = df['comentario']
vectorized_data = vectorizer.fit_transform(tfidf_corpus)

In [None]:
# Wrap the TF-IDF matrix in a DataFrame: one row per review, one column per
# term or n-gram kept by the vectorizer.
tfidf_df = pd.DataFrame(
    data=vectorized_data.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
tfidf_df

Unnamed: 0,00,10,10 minutos,10 puntos,100,100 recomendable,11,12,13,14,...,voy siempre,voy volver,vuelta,vuelva,vuelve,vuelvo,vuelvo mas,vuelvo nunca,yendo,zona
0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21539,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21540,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21541,0.0,0.381684,0.0,0.524001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21542,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
