# Text Mining
`Autor: Erwing FC 
~erwingforerocastro@gmail.com`

In [None]:
import pandas as pd
import numpy as np
from google.colab import files
# Upload the dataset only when it is not already in the working directory.
# Calling files.upload() again while the file exists forces another manual
# upload, and Colab typically stores the duplicate under a different name
# (e.g. "Amazon_Unlocked_Mobile (1).csv"), so read_csv below would keep
# reading the old copy anyway.
import os

if not os.path.exists('Amazon_Unlocked_Mobile.csv'):
    files.upload()

df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

# Keep a 10% random sample of the rows; the fixed random_state makes the
# sample reproducible across runs.
df = df.sample(frac=0.1, random_state=10)

df.head()

Saving Amazon_Unlocked_Mobile.csv to Amazon_Unlocked_Mobile.csv


Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [None]:
# Drop every row that has a missing value in any column.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
df = df.dropna()

# Discard neutral (3-star) reviews. The explicit .copy() makes the filtered
# result an independent frame, so adding a column below cannot trigger
# pandas' SettingWithCopyWarning.
df = df[df['Rating'] != 3].copy()

# New binary target column:
#   4 and 5 stars -> positive (1)
#   1 and 2 stars -> negative (0)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0,1
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0,1
406017,Verizon HTC Rezound 4G Android Smarphone - 8MP...,HTC,74.99,4,Great phone for the price...,0.0,1
302567,"RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came...",RCA,159.99,5,My mom is not good with new technoloy but this...,4.0,1


In [None]:
# Mean of the 0/1 target column, i.e. the fraction of reviews labelled positive.
positive_share = df['Positively Rated'].mean()
positive_share

0.7471776686078667

In [None]:
from sklearn.model_selection import train_test_split

# Split the review texts and their sentiment labels into train and test sets.
# No test_size is given, so train_test_split's default proportion applies;
# random_state=0 makes the shuffle reproducible.
reviews = df['Reviews']
labels = df['Positively Rated']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    random_state=0,
)

In [None]:
# Show the first raw training document and the shape of the training split.
first_review = X_train.iloc[0]
train_shape = X_train.shape
print('La primera revision:\n\n', first_review)
print('\n\nTamaño del grupo de entrenamiento: ', train_shape)

La primera revision:

 Everything about it is awesome!


Tamaño del grupo de entrenamiento:  (23052,)


### CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Tokenize the training reviews and learn the vocabulary from them.
vect = CountVectorizer()
vect.fit(X_train);  # fit() returns the vectorizer itself; ';' keeps the cell output empty

In [None]:
# Every 2000th term of the fitted vocabulary, in feature-index order.
# vocabulary_ maps term -> column index, so sorting the terms by that index
# yields the same list get_feature_names() returned; that method was removed
# in scikit-learn 1.2, while vocabulary_ works on old and new releases alike.
sorted(vect.vocabulary_, key=vect.vocabulary_.get)[::2000]

['00',
 'arroja',
 'comapañias',
 'dvds',
 'golden',
 'lands',
 'oil',
 'razonable',
 'smallsliver',
 'tweak']

In [None]:
# Vocabulary size: number of distinct terms learned by the vectorizer.
# len(vocabulary_) avoids get_feature_names(), removed in scikit-learn 1.2.
len(vect.vocabulary_)

19601

In [None]:
# Encode the training reviews with the fitted vectorizer and display the
# resulting matrix summary.
X_train_vectorized = vect.transform(raw_documents=X_train)
X_train_vectorized

<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [None]:
# Train a simple linear model on the vectorized training reviews.
from sklearn.linear_model import LogisticRegression
# max_iter is set explicitly to 10000 iterations for the solver.
model = LogisticRegression(max_iter=10000).fit(X_train_vectorized, y_train)
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from sklearn.metrics import roc_auc_score
# Evaluate on the held-out reviews.
# ROC AUC is a ranking metric and expects continuous scores. Feeding it the
# hard 0/1 labels from predict() reduces the ROC curve to a single threshold,
# so the reported number would not reflect how well the model ranks reviews.
X_test_vectorized = vect.transform(X_test)
predictions = model.predict(X_test_vectorized)        # hard class labels (kept for inspection)
scores = model.decision_function(X_test_vectorized)   # non-thresholded decision values
print('AUC: ', roc_auc_score(y_test, scores))

AUC:  0.8974332776669326


In [None]:
# Vocabulary terms as an array, ordered by their column index so that
# position i lines up with coefficient i. Built from vocabulary_ because
# get_feature_names() was removed in scikit-learn 1.2.
feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

# Feature indices ordered from the smallest to the largest model coefficient.
sorted_coef_index = model.coef_[0].argsort()

# The 10 smallest and the 10 largest coefficients.
# The 10 largest are taken with [:-11:-1] so that the returned list runs
# from largest to smallest.
print('Coefs más pequeños:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Coefs más grandes: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Coefs más pequeños:
['worst' 'terrible' 'slow' 'junk' 'poor' 'sucks' 'horrible' 'useless'
 'waste' 'disappointed']

Coefs más grandes: 
['excelent' 'excelente' 'excellent' 'perfectly' 'love' 'perfect' 'exactly'
 'great' 'best' 'awesome']


(1, 19601)

### Tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer on the training data with a minimum document
# frequency of 5: terms that appear in fewer than 5 documents are left out
# of the vocabulary.
vect = TfidfVectorizer(min_df=5).fit(X_train)
# Vocabulary size; len(vocabulary_) avoids get_feature_names(), which was
# removed in scikit-learn 1.2.
len(vect.vocabulary_)

5442

In [None]:
# Re-encode the training reviews with the current vectorizer and retrain.
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Evaluate on the held-out reviews. roc_auc_score needs continuous scores;
# the hard 0/1 labels from predict() would reduce the ROC curve to a single
# threshold, so the decision values are used for the AUC instead.
X_test_vectorized = vect.transform(X_test)
predictions = model.predict(X_test_vectorized)        # hard class labels (kept for inspection)
scores = model.decision_function(X_test_vectorized)   # non-thresholded decision values

print('AUC: ', roc_auc_score(y_test, scores))

AUC:  0.889951006492175


In [None]:
# Vocabulary terms ordered by column index (position i <-> matrix column i).
# Built from vocabulary_ because get_feature_names() was removed in
# scikit-learn 1.2.
feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

# For each term take the maximum value of its column over all training
# documents, then order the term indices by that maximum, ascending.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

# First 10 indices -> smallest maxima; [:-11:-1] -> the 10 largest, largest first.
print('Pequeños tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Grandes tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Pequeños tfidf:
['61' 'printer' 'approach' 'adjustment' 'consequences' 'length' 'emailing'
 'degrees' 'handsfree' 'chipset']

Grandes tfidf: 
['unlocked' 'handy' 'useless' 'cheat' 'up' 'original' 'exelent' 'exelente'
 'exellent' 'satisfied']


In [None]:
# Order the feature indices by the current model's coefficients, ascending.
sorted_coef_index = np.argsort(model.coef_[0])

# Ten terms with the smallest coefficients, and ten with the largest
# (the reversed slice lists the largest first).
smallest_terms = feature_names[sorted_coef_index[:10]]
largest_terms = feature_names[sorted_coef_index[:-11:-1]]

print('Pequeños Coefs:\n{}\n'.format(smallest_terms))
print('Grandes Coefs: \n{}'.format(largest_terms))

Pequeños Coefs:
['not' 'slow' 'disappointed' 'worst' 'terrible' 'never' 'return' 'doesn'
 'horrible' 'waste']

Grandes Coefs: 
['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome'
 'far' 'perfectly']


In [None]:
# Two sentences built from the same words but with opposite meaning; the
# first says there is no problem, the second says the phone does not work.
sample_reviews = [
    'not an issue, phone is working',
    'an issue, phone is not working',
]
# Label convention: 0 = negative, 1 = positive.
print(model.predict(vect.transform(sample_reviews)))

[0 0]


### N-grams

In [None]:
# Fit the CountVectorizer to the training data, specifying a minimum
# document frequency of 5 and extracting 1-grams and 2-grams.
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

# Vocabulary size (unigrams + bigrams); len(vocabulary_) avoids
# get_feature_names(), which was removed in scikit-learn 1.2.
len(vect.vocabulary_)

29072

In [None]:
# Retrain the linear model on the current vectorized training matrix.
model = LogisticRegression(max_iter=10000)
model.fit(X_train_vectorized, y_train)

# Evaluate on the held-out reviews. roc_auc_score needs continuous scores;
# the hard 0/1 labels from predict() would reduce the ROC curve to a single
# threshold, so the decision values are used for the AUC instead.
X_test_vectorized = vect.transform(X_test)
predictions = model.predict(X_test_vectorized)        # hard class labels (kept for inspection)
scores = model.decision_function(X_test_vectorized)   # non-thresholded decision values

print('AUC: ', roc_auc_score(y_test, scores))

AUC:  0.9110661794597458


In [None]:
# Vocabulary terms ordered by column index so that position i lines up with
# coefficient i. Built from vocabulary_ because get_feature_names() was
# removed in scikit-learn 1.2.
feature_names = np.array(sorted(vect.vocabulary_, key=vect.vocabulary_.get))

# Feature indices ordered from the smallest to the largest model coefficient.
sorted_coef_index = model.coef_[0].argsort()

# First 10 indices -> smallest coefficients; [:-11:-1] -> the 10 largest,
# listed from largest to smallest.
print('Pequeños Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Grandes Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Pequeños Coefs:
['no good' 'junk' 'poor' 'slow' 'worst' 'broken' 'not good' 'terrible'
 'defective' 'horrible']

Grandes Coefs: 
['excellent' 'excelente' 'excelent' 'perfect' 'great' 'love' 'awesome'
 'no problems' 'good' 'best']


In [None]:
# Same pair of opposite-meaning sentences, now scored with the current
# vectorizer and model.
sample_reviews = [
    'not an issue, phone is working',
    'an issue, phone is not working',
]
# Label convention: 0 = negative, 1 = positive.
print(model.predict(vect.transform(sample_reviews)))

[1 0]
