# Sentiment Analysis on Spanish reviews using ntlk's `LogisticClassifier`

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.linear_model import LogisticRegression

## Prepare the Data

In [2]:
train_data = pd.read_json('./data/amazon-reviews-ml/dataset_es_train.json', lines=True)

In [3]:
test_data = pd.read_json('./data/amazon-reviews-ml/dataset_es_test.json', lines=True)

In [5]:
len(test_data)

5000

In [7]:
train_data

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,es_0491108,product_es_0296024,reviewer_es_0999081,1,Nada bueno se me fue ka pantalla en menos de 8...,television Nevir,es,electronics
1,es_0869872,product_es_0922286,reviewer_es_0216771,1,"Horrible, nos tuvimos que comprar otro porque ...",Dinero tirado a la basura con esta compra,es,electronics
2,es_0811721,product_es_0474543,reviewer_es_0929213,1,Te obligan a comprar dos unidades y te llega s...,solo llega una unidad cuando te obligan a comp...,es,drugstore
3,es_0359921,product_es_0656090,reviewer_es_0224702,1,"No entro en descalificar al vendedor, solo pue...",PRODUCTO NO RECIBIDO.,es,wireless
4,es_0068940,product_es_0662544,reviewer_es_0224827,1,Llega tarde y co la talla equivocada,Devuelto,es,shoes
...,...,...,...,...,...,...,...,...
199995,es_0715276,product_es_0317036,reviewer_es_0643604,5,Mando funciona perfectamente y cumple con toda...,Tal y como se describe,es,electronics
199996,es_0085190,product_es_0622919,reviewer_es_0466173,5,"Compré la batería con cierta reticencia, pero ...",Funciona perfectamente,es,electronics
199997,es_0484496,product_es_0358101,reviewer_es_0330744,5,Buena calidad. Satisfecha con la compra.,Buena calidad.,es,apparel
199998,es_0930141,product_es_0788855,reviewer_es_0694290,5,Perfecto para el cumple de mi hijo,Recomendado,es,toy


In [8]:
# Concatenate title and body texts
train_data['review'] = train_data['review_title'] + ' ' + train_data['review_body']
test_data['review'] = test_data['review_title'] + ' ' + test_data['review_body']

In [9]:
# Create a binary sentiment column based on rating Heuristic
train_data = train_data[train_data['stars'] != 3]
train_data['sentiment'] = np.where(train_data['stars']>3, 1, 0)


test_data = test_data[test_data['stars'] != 3]
test_data['sentiment'] = np.where(test_data['stars']>3, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['sentiment'] = np.where(train_data['stars']>3, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['sentiment'] = np.where(test_data['stars']>3, 1, 0)


In [10]:
train_data[['review', 'stars', 'sentiment']]

Unnamed: 0,review,stars,sentiment
0,television Nevir Nada bueno se me fue ka panta...,1,0
1,Dinero tirado a la basura con esta compra Horr...,1,0
2,solo llega una unidad cuando te obligan a comp...,1,0
3,PRODUCTO NO RECIBIDO. No entro en descalificar...,1,0
4,Devuelto Llega tarde y co la talla equivocada,1,0
...,...,...,...
199995,Tal y como se describe Mando funciona perfecta...,5,1
199996,Funciona perfectamente Compré la batería con c...,5,1
199997,Buena calidad. Buena calidad. Satisfecha con l...,5,1
199998,Recomendado Perfecto para el cumple de mi hijo,5,1


In [7]:
x_train = train_data['review']
x_test = test_data['review']
y_train = train_data['sentiment']
y_test = test_data['sentiment']

# Word Counts with Countvectorizer

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
counts = CountVectorizer().fit(x_train)
counts

CountVectorizer()

In [39]:
len(counts.get_feature_names_out()) # 54716

54716

In [40]:
x_train_counts = counts.transform(x_train)

#### Train a Logistic Regression Model

In [41]:
from sklearn.linear_model import LogisticRegression

# Train the model with training set and training labels
model = LogisticRegression()
model.fit(x_train_counts, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [42]:
# Predict the transformed test documents
x_test_counts = counts.transform(x_test)

predictions = model.predict(x_test_counts)

### Performance with Word Counts Vector

In [44]:
from sklearn.metrics import roc_auc_score, precision_score


print('AUC: ', roc_auc_score(y_test, predictions))

print('Precision: ', precision_score(y_test, predictions))

AUC:  0.9185
Precision:  0.9172482552342971


In [45]:
feature_names = np.array(counts.get_feature_names_out())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['decepcionada' 'decepcionante' 'decepcion' 'decepción' 'engaño' 'timo'
 'malísima' 'pésima' 'incompleto' 'desilusión']

Largest Coefs: 
['encantada' 'geniales' 'acierto' 'impresionante' 'lujo' 'chulas'
 'estupenda' 'fantastico' 'ideales' 'encanta']


# Tf-idf vector with TfidfVectorizer

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Minimum document frequency 5
vect = TfidfVectorizer(min_df=5).fit(x_train)


19959

In [47]:
len(vect.get_feature_names_out()) # 19,959 features

19959

In [48]:
#Xtrain_vectorized = vect.transform(x_train)
x_train_counts = vect.transform(x_train)

model = LogisticRegression()
model.fit(x_train_counts, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [49]:
x_test_counts = vect.transform(x_test)
predictions = model.predict(x_test_counts)

#### Performance of Tf-idf vector

In [51]:
print('AUC: ', roc_auc_score(y_test, predictions))
print('Precision: ', precision_score(y_test, predictions))

AUC:  0.9200000000000002
Precision:  0.9251012145748988


In [53]:
feature_names = np.array(vect.get_feature_names_out())

sorted_tfidf_index = x_train_counts.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['siri' 'olvidaba' 'asegúrese' 'enviarle' 'virtudes' 'obviando' 'nuclear'
 'desactivarla' 'asegurarte' 'bastaría']

Largest tfidf: 
['incomodas' 'árboles' 'comodisimos' 'xx' 'demasiado' 'descascarillado'
 'impresionante' 'ok' 'cómodos' 'comodisimo']


# N-gram counts with CountVectorizer

In [54]:
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(x_train)

x_train_counts = vect.transform(x_train)

114215

In [55]:
len(vect.get_feature_names_out()) # 114,215

114215

In [56]:
model = LogisticRegression()
model.fit(x_train_counts, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

#### Performance of N gram counts

In [61]:
x_test_counts = vect.transform(x_test)

predictions = model.predict(x_test_counts)

In [62]:
print('AUC: ', roc_auc_score(y_test, predictions))
print('Precision: ', precision_score(y_test, predictions))

AUC:  0.9347500000000001
Precision:  0.9362769693928751


In [63]:
feature_names = np.array(vect.get_feature_names_out())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['dos estrellas' 'no cumple' 'no recomendable' 'decepcion' 'decepcionante'
 'decepcionada' 'decepción' 'nada recomendable' 'engaño' 'desilusión']

Largest Coefs: 
['nada mal' 'las mejores' 'genial' 'perfecto' 'está mal' 'encantada'
 'los mejores' 'geniales' 'fenomenal' 'estupendo']


# Turi Create

In [76]:
import turicreate
import math
import string

In [71]:
train_data = turicreate.SFrame.read_json('./data/amazon-reviews-ml/dataset_es_train.json', orient='lines')

test_data = turicreate.SFrame.read_json('./data/amazon-reviews-ml/dataset_es_test.json', orient='lines')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[dict]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [72]:
test_data

language,product_category,product_id,review_body,review_id
es,wireless,product_es_0113523,no me llego el articulo me lo mando por correos ...,es_0038754
es,home,product_es_0017036,"la mensajería horrible, no compro mas ...",es_0748979
es,toy,product_es_0138642,Estoy muy decepcionado con el vendedor ya qu ...,es_0411746
es,home,product_es_0170887,Mi valoración no es sobre el producto sino sobre ...,es_0786686
es,digital_ebook_purchase,product_es_0710642,Pues tenía interés en este libro y probé la ...,es_0429700
es,pc,product_es_0813312,Compre este teclado al ver sus buenos ...,es_0370652
es,lawn_and_garden,product_es_0260888,Sigue sin llegar después de meses ...,es_0838239
es,wireless,product_es_0234796,"No sirve para nada, es malo y se rompe y se ...",es_0233338
es,sports,product_es_0690174,Todavía espero que me llegue despues dw una ...,es_0470247
es,camera,product_es_0624641,La peor cámara que he tenido en mis manos. Dos ...,es_0454942

review_title,reviewer_id,stars
no me llego,reviewer_es_0580071,1
amazon sigue sin cumplir en las entregas ...,reviewer_es_0819733,1
ESTAFA EN EL ENVÍO,reviewer_es_0508607,1
Estafa de Amazon,reviewer_es_0491157,1
No conseguí pasar de la portada en Kindle ...,reviewer_es_0008745,1
Una verdadera pena,reviewer_es_0789216,1
No compréis es un engaño,reviewer_es_0022974,1
"No sirve para nada, pésimo producto ...",reviewer_es_0942055,1
Pésimo trato del vendedor,reviewer_es_0969485,1
Pésima camara,reviewer_es_0681717,1


## Word Counts vector

In [74]:
test_data = test_data[test_data['stars'] != 3]
train_data = train_data[train_data['stars'] != 3]

train_data['review'] = train_data['review_title'] + ' ' + train_data['review_body']
test_data['review'] = test_data['review_title'] + ' ' + test_data['review_body']

In [6]:
impo
punctuation = string.punctuation + '¿¡'
punctuation

NameError: name 'string' is not defined

In [83]:
def remove_punctuation(text):
    translator = text.maketrans('', '', punctuation)
    text = text.translate(translator)
    
    return text

In [87]:
# Create array of punctuation-less reviews
train_data['review_clean'] = train_data['review'].apply(remove_punctuation)
test_data['review_clean'] = test_data['review'].apply(remove_punctuation)

In [88]:
train_data['word_count'] = turicreate.text_analytics.count_words(train_data['review_clean'])
test_data['word_count'] = turicreate.text_analytics.count_words(test_data['review_clean'])

In [96]:
train_data['stars'] = train_data['stars'].apply(lambda x: int(x))
test_data['stars'] = test_data['stars'].apply(lambda x: int(x))

In [97]:
train_data['sentiment'] = train_data['stars'].apply(lambda r: +1 if r>3 else -1)
test_data['sentiment'] = test_data['stars'].apply(lambda r: +1 if r>3 else -1)

### Train the model

In [98]:
turi_model = turicreate.logistic_classifier.create(train_data,
                                                        target = 'sentiment',
                                                        features=['word_count'],
                                                        validation_set=None)

In [100]:
len(turi_model.coefficients) # 79,356

79356

In [112]:
weights = turi_model.coefficients

weights.sort('value', ascending=False).print_rows(num_rows=10)
weights.sort('value', ascending=True).print_rows(num_rows=10)

+------------+---------------+-------+--------------------+--------+
|    name    |     index     | class |       value        | stderr |
+------------+---------------+-------+--------------------+--------+
| word_count |     mesno     |   1   | 42.59737825579724  |  None  |
| word_count |  perplejidad  |   1   | 42.582525960998225 |  None  |
| word_count |     musca     |   1   | 42.39526752827434  |  None  |
| word_count |     tenies    |   1   | 42.18137590459115  |  None  |
| word_count |     estok     |   1   | 40.37992740221876  |  None  |
| word_count |   devolvamos  |   1   | 39.58329100555991  |  None  |
| word_count |    concide    |   1   |  39.5725173462358  |  None  |
| word_count |    probara    |   1   | 38.45273306703082  |  None  |
| word_count | elproductomuy |   1   | 36.954227722712126 |  None  |
| word_count |   miligramos  |   1   | 36.87239577440345  |  None  |
+------------+---------------+-------+--------------------+--------+
[79356 rows x 5 columns]

+-------

In [113]:
predictions = turi_model.predict(test_data)
predictions_norm = predictions.apply(lambda x: 0 if x<0 else 1)

print(turicreate.evaluation.precision(test_data['sentiment'], predictions_norm))
print(turicreate.evaluation.auc(test_data['sentiment'], predictions_norm))

0.38395061728395063
0.7495125


# Testing different iterations parameter

In [22]:
# Word Count
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(x_train)
x_train_counts = vect.transform(x_train)

In [26]:
ITERATIONS = 1500
model = LogisticRegression(max_iter=ITERATIONS)  # default is 100
model.fit(x_train_counts, y_train)

LogisticRegression(max_iter=1500)

In [27]:
x_test_counts = vect.transform(x_test)
predictions = model.predict(x_test_counts)

print('Precision: ', precision_score(y_test, predictions))
print('AUC: ', roc_auc_score(y_test, predictions))

Precision:  0.9155908639523337
AUC:  0.9185000000000001


In [31]:
# N Gram Count
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(x_train)

x_train_counts = vect.transform(x_train)

In [32]:
ITERATIONS = 1500
model = LogisticRegression(max_iter=ITERATIONS)
model.fit(x_train_counts, y_train)

LogisticRegression(max_iter=1500)

In [33]:
x_test_counts = vect.transform(x_test)
predictions = model.predict(x_test_counts)
print('Precision: ', precision_score(y_test, predictions))
print('AUC: ', roc_auc_score(y_test, predictions))


Precision:  0.9337349397590361
AUC:  0.932
