In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

In [3]:
from sklearn.model_selection import GridSearchCV

In [4]:
dataSet = pd.read_csv("Amazon_Unlocked_Mobile.csv", header=0)
print(dataSet)

                                             Product Name Brand Name   Price  \
0       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
2       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
3       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
4       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
...                                                   ...        ...     ...   
288192  OtterBox Samsung Galaxy S5 Defender Series Pro...   Ultrabox   14.99   
288193  OtterBox Samsung Galaxy S5 Defender Series Pro...   Ultrabox   14.99   
288194  OtterBox Samsung Galaxy S5 Defender Series Pro...   Ultrabox   14.99   
288195  OtterBox Samsung Galaxy S5 Defender Series Pro...   Ultrabox   14.99   
288196           OtterBox Samsung Galaxy S5 Defender Seri        NaN     NaN   

        Rating                         

In [5]:
dataSet.dropna(inplace=True)
dataSet = dataSet.iloc[0:50000,:]

# Encode 4s and 5s as positive
dataSet['Sentiment'] = 2

for row in range(len(dataSet)):
  if dataSet.iloc[row,3] > 3:
    dataSet.iloc[row,6] = 4

datos = ['Reviews', 'Rating', 'Sentiment']
misDatos = dataSet[datos]
print(misDatos.head(20))

misDatos = misDatos[['Reviews', 'Sentiment']]
print(misDatos.head(20))

                                              Reviews  Rating  Sentiment
0   I feel so LUCKY to have found this used (phone...     5.0          4
1   nice phone, nice up grade from my pantach revu...     4.0          4
2                                        Very pleased     5.0          4
3   It works good but it goes slow sometimes but i...     4.0          4
4   Great phone to replace my lost phone. The only...     4.0          4
5   I already had a phone with problems... I know ...     1.0          2
6   The charging port was loose. I got that solder...     2.0          2
7   Phone looks good but wouldn't stay charged, ha...     2.0          2
8   I originally was using the Samsung S2 Galaxy f...     5.0          4
9   It's battery life is great. It's very responsi...     3.0          2
10  My fiance had this phone previously, but cause...     3.0          2
11  This is a great product it came after two days...     5.0          4
12  These guys are the best! I had a little situat.

In [6]:
train_size = 0.75
test_size = 1 - train_size
# Split data into train and test sets
trainSet, testSet = train_test_split(misDatos, test_size=test_size, random_state=0)

print("train set")
print(trainSet)
print("\n")
print("test set")
print(testSet)

train set
                                                 Reviews  Sentiment
49657                                              Super          4
25773  Awesome service and went out of way to make su...          4
42791  The phone is as described, no complaints. Its ...          4
44626  Very undone. Speaker was blown, front facing h...          2
39543  The phone seems to be decent, but I can't be s...          2
...                                                  ...        ...
28884           It was exactly as described. Good price.          4
63238  This is the first negative review I have ever ...          2
57078                                   Not as described          2
58466             It came in on time and no issue at all          4
3664   Phone works great but the screen was/is cracke...          2

[37500 rows x 2 columns]


test set
                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
2

In [7]:
vectorizer = TfidfVectorizer(stop_words='english',  ## NOTA: este se quita porque los comentarios están en español
                             min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(trainSet['Reviews'])
test_vectors = vectorizer.transform(testSet['Reviews'])

print(train_vectors)
print("\n")
print(test_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 400884 stored elements and shape (37500, 4871)>
  Coords	Values
  (0, 4206)	1.0
  (1, 465)	0.3732214938577385
  (1, 3836)	0.34498262927372236
  (1, 4757)	0.37970489192330326
  (1, 4737)	0.37702388902195494
  (1, 2645)	0.3903473716133277
  (1, 4223)	0.39500536646931483
  (1, 3354)	0.315371065344649
  (1, 1926)	0.21796941358055422
  (2, 3132)	0.09440577389052866
  (2, 1227)	0.22183363192563105
  (2, 953)	0.3065525989603865
  (2, 4811)	0.2083967705063876
  (2, 4054)	0.46907793611622456
  (2, 4617)	0.21115755149998972
  (2, 2445)	0.41926090735462823
  (2, 1967)	0.13232587749610572
  (2, 736)	0.2562859284572832
  (2, 913)	0.43849454936536236
  (2, 3942)	0.29766132562048175
  (3, 4046)	0.23041057419377609
  (3, 592)	0.3161883134199908
  (3, 1637)	0.47112298092726057
  (3, 3556)	0.2242573659775506
  (3, 511)	0.15722438848853545
  :	:
  (37496, 314)	0.156436439166711
  (37496, 2877)	0.2467761527045587
  (37496, 2915)	0.2360719389896

In [8]:
# Perform classification with DT
classifier_dt = DecisionTreeClassifier(max_depth=5)
classifier_dt.fit(train_vectors, trainSet.iloc[:,1:])

prediction_dt = classifier_dt.predict(test_vectors)

df_dt = pd.DataFrame(prediction_dt, columns=['Sentiment'], index=testSet.index)
print(df_dt)
print("\n")

print(testSet)

       Sentiment
18699          4
27243          4
62620          2
33496          4
57107          4
...          ...
22243          4
10291          4
50657          4
38506          4
9309           4

[12500 rows x 1 columns]


                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
27243       Very fast shipping. Product like description          4
62620  I was very excited to receive this in the mail...          2
33496  It stoped working a week atfer i got it , it s...          2
57107                                           not good          2
...                                                  ...        ...
22243                               nice, smart and fast          4
10291  You cannot beat an iPhone on Straighttalk. The...          4
50657  It us for another person but I know they are good          4
38506                                         cool phone          4
9309   iphone is in 

In [9]:
# Perform the class with RF
classifier_rf = RandomForestClassifier(n_estimators=100, max_depth=5)
classifier_rf.fit(train_vectors, trainSet.iloc[:,1:])

prediction_rf = classifier_rf.predict(test_vectors)

df_rf = pd.DataFrame(prediction_rf, columns=['Sentiment'], index=testSet.index)
print(df_rf)
print("\n")

print(testSet)

       Sentiment
18699          4
27243          4
62620          4
33496          4
57107          4
...          ...
22243          4
10291          4
50657          4
38506          4
9309           4

[12500 rows x 1 columns]


                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
27243       Very fast shipping. Product like description          4
62620  I was very excited to receive this in the mail...          2
33496  It stoped working a week atfer i got it , it s...          2
57107                                           not good          2
...                                                  ...        ...
22243                               nice, smart and fast          4
10291  You cannot beat an iPhone on Straighttalk. The...          4
50657  It us for another person but I know they are good          4
38506                                         cool phone          4
9309   iphone is in 

In [10]:
# Perform the class with SVM
classifier_svm = svm.SVC(kernel="poly")
classifier_svm.fit(train_vectors, trainSet.iloc[:,1:])

prediction_svm = classifier_svm.predict(test_vectors)

df_svm = pd.DataFrame(prediction_svm, columns=['Sentiment'], index=testSet.index)
print(df_svm)
print("\n")

print(testSet)

       Sentiment
18699          4
27243          4
62620          2
33496          2
57107          4
...          ...
22243          4
10291          4
50657          4
38506          4
9309           4

[12500 rows x 1 columns]


                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
27243       Very fast shipping. Product like description          4
62620  I was very excited to receive this in the mail...          2
33496  It stoped working a week atfer i got it , it s...          2
57107                                           not good          2
...                                                  ...        ...
22243                               nice, smart and fast          4
10291  You cannot beat an iPhone on Straighttalk. The...          4
50657  It us for another person but I know they are good          4
38506                                         cool phone          4
9309   iphone is in 

In [11]:
methodsUsed=['DT', 'RF', 'SVM']
performanceHeaders=['precision','recall','f1-score']
modPerformancePos = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
modPerformanceNeg = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
modPerformanceNeut = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
dfAcc= pd.DataFrame(index=methodsUsed, columns=['accuracy'])

In [None]:
# results report
report = classification_report(testSet[['Sentiment']], prediction_dt, output_dict=True)
print(report)
print("\n")

# DT metrics
print("DT metrics")
positive = report['4']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['2']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

dfAcc.iloc[0,0] = accuracy_score(testSet[['Sentiment']], prediction_dt)
print("accuracy: ", dfAcc.iloc[0,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[0,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[0,0] = dfneg.iloc[0,0]
modPerformancePos.iloc[0,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[0,1] = dfneg.iloc[1,0]
modPerformancePos.iloc[0,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[0,2] = dfneg.iloc[2,0]

{'2': {'precision': 0.6141045958795562, 'recall': 0.6551141166525782, 'f1-score': 0.6339468302658486, 'support': 2366.0}, '4': {'precision': 0.813443072702332, 'recall': 0.7850838481906444, 'f1-score': 0.7990119020884797, 'support': 4532.0}, 'accuracy': 0.7405044940562482, 'macro avg': {'precision': 0.7137738342909441, 'recall': 0.7200989824216113, 'f1-score': 0.7164793661771642, 'support': 6898.0}, 'weighted avg': {'precision': 0.7450703797242677, 'recall': 0.7405044940562482, 'f1-score': 0.7423949174650605, 'support': 6898.0}}


DT metrics
              positive
precision     0.813443
recall        0.785084
f1-score      0.799012
support    4532.000000


accuracy:  0.7405044940562482


Comparative metrics


In [None]:
# results report
report = classification_report(testSet[['Sentiment']], prediction_rf, output_dict=True)
print(report)
print("\n")

# RF metrics
print("RF metrics")
positive = report['4']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['2']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

dfAcc.iloc[1,0] = accuracy_score(testSet[['Sentiment']], prediction_rf)
print("accuracy: ", dfAcc.iloc[0,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[1,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[1,0] = dfneg.iloc[0,0]
modPerformancePos.iloc[1,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[1,1] = dfneg.iloc[1,0]
modPerformancePos.iloc[1,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[1,2] = dfneg.iloc[2,0]

{'2': {'precision': 1.0, 'recall': 0.029585798816568046, 'f1-score': 0.05747126436781609, 'support': 2366.0}, '4': {'precision': 0.6637375512595196, 'recall': 1.0, 'f1-score': 0.797887323943662, 'support': 4532.0}, 'accuracy': 0.6671498985213106, 'macro avg': {'precision': 0.8318687756297598, 'recall': 0.514792899408284, 'f1-score': 0.4276792941557391, 'support': 6898.0}, 'weighted avg': {'precision': 0.779074888708052, 'recall': 0.6671498985213106, 'f1-score': 0.5439261182381747, 'support': 6898.0}}


RF metrics
              positive
precision     0.663738
recall        1.000000
f1-score      0.797887
support    4532.000000


accuracy:  0.7405044940562482


Comparative metrics


In [None]:
# results report
report = classification_report(testSet[['Sentiment']], prediction_svm, output_dict=True)
print(report)
print("\n")

# SVM metrics
print("SVM metrics")
positive = report['4']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['2']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

dfAcc.iloc[2,0] = accuracy_score(testSet[['Sentiment']], prediction_rf)
print("accuracy: ", dfAcc.iloc[0,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[2,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[2,0] = dfneg.iloc[0,0]
modPerformancePos.iloc[2,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[2,1] = dfneg.iloc[1,0]
modPerformancePos.iloc[2,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[2,2] = dfneg.iloc[2,0]

{'2': {'precision': 0.9752220663861617, 'recall': 0.8816568047337278, 'f1-score': 0.9260821309655938, 'support': 2366.0}, '4': {'precision': 0.9411641101071654, 'recall': 0.9883053839364518, 'f1-score': 0.9641588634162093, 'support': 4532.0}, 'accuracy': 0.9517251377210786, 'macro avg': {'precision': 0.9581930882466636, 'recall': 0.9349810943350898, 'f1-score': 0.9451204971909015, 'support': 6898.0}, 'weighted avg': {'precision': 0.9528459199877257, 'recall': 0.9517251377210786, 'f1-score': 0.9510986214651864, 'support': 6898.0}}


SVM metrics
              positive
precision     0.941164
recall        0.988305
f1-score      0.964159
support    4532.000000


accuracy:  0.7405044940562482


Comparative metrics


In [None]:
print("Positive comments metrics")
print(modPerformancePos)
print("\n")
print("Negative comments metrics")
print(modPerformanceNeg)
print("\n")
print(dfAcc)

Positive comments metrics
    precision    recall  f1-score
DT   0.813443  0.785084  0.799012
RF   0.663738       1.0  0.797887
SVM  0.941164  0.988305  0.964159


Negative comments metrics
    precision    recall  f1-score
DT   0.614105  0.655114  0.633947
RF        1.0  0.029586  0.057471
SVM  0.975222  0.881657  0.926082


     accuracy
DT   0.740504
RF    0.66715
SVM   0.66715


# Fine-tuning

In [None]:
# Perform classification with DT
print('Classification with DT')
dt_clf = DecisionTreeClassifier()
param_search_dt = {
        'criterion': ["gini", 'entropy'],
        'max_depth': [5, 10, 20, 30, None]
    }

grid_search_dt = GridSearchCV(estimator=dt_clf, param_grid=param_search_dt, cv=5, verbose=1)
grid_search_dt.fit(train_vectors, trainSet.iloc[:,1:])
best_clf_dt = grid_search_dt.best_estimator_

print(best_clf_dt.get_params())
print("\n")

grid_dt_prediction = best_clf_dt.predict(test_vectors)

df_grid_dt = pd.DataFrame(grid_dt_prediction, columns=['Sentiment'], index=testSet.index)
print(df_grid_dt)
print("\n")

print(testSet)

Classification with DT
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


       Sentiment
2295           4
10083          4
27921          4
11326          4
31333          4
...          ...
517            2
148            2
6244           2
28392          4
7639           2

[6898 rows x 1 columns]


                                                 Reviews  Sentiment
2295   Take a step back and think about this thing. I...          4
10083                                            Love it          4
27921  I had to buy a new charger it won't work with ...          4
11326  Absolutely love this product... I've been want...          4
31333                          Broke in a

In [None]:
# results report
report = classification_report(testSet[['Sentiment']], grid_dt_prediction, output_dict=True)
print(report)
print("\n")

acc = accuracy_score(testSet[['Sentiment']], grid_dt_prediction)
print("accuracy: ", acc)

{'2': {'precision': 0.9157525230364195, 'recall': 0.8820794590025359, 'f1-score': 0.8986006458557588, 'support': 2366.0}, '4': {'precision': 0.9395973154362416, 'recall': 0.9576345984112974, 'f1-score': 0.9485302152770189, 'support': 4532.0}, 'accuracy': 0.9317193389388229, 'macro avg': {'precision': 0.9276749192363305, 'recall': 0.9198570287069167, 'f1-score': 0.9235654305663888, 'support': 6898.0}, 'weighted avg': {'precision': 0.9314186000378682, 'recall': 0.9317193389388229, 'f1-score': 0.9314044743012722, 'support': 6898.0}}


accuracy:  0.9317193389388229


In [None]:
# Perform classification with RF
print('Classification with RF')
rf_clf = RandomForestClassifier()
param_search_rf = {
        'n_estimators': [75, 100],
        'max_depth': [5, 15, None]
    }

grid_search_rf = GridSearchCV(estimator=rf_clf, param_grid=param_search_rf, cv=5, verbose=1)
grid_search_rf.fit(train_vectors, trainSet.iloc[:,1:])
best_clf_rf = grid_search_rf.best_estimator_

print(best_clf_rf.get_params())
print("\n")

grid_rf_prediction = best_clf_rf.predict(test_vectors)

df_grid_rf = pd.DataFrame(grid_rf_prediction, columns=['Sentiment'], index=testSet.index)
print(df_grid_rf)
print("\n")

print(testSet)

Classification with RF
Fitting 5 folds for each of 6 candidates, totalling 30 fits
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


       Sentiment
2295           4
10083          4
27921          4
11326          4
31333          4
...          ...
517            4
148            2
6244           2
28392          4
7639           2

[6898 rows x 1 columns]


                                                 Reviews  Sentiment
2295   Take a step back and think about this thing. I...          4
10083                                            Love it          4
27921  I had to buy a new charger it won't work with ...        

In [None]:
# results report
report = classification_report(testSet[['Sentiment']], grid_rf_prediction, output_dict=True)
print(report)
print("\n")

acc = accuracy_score(testSet[['Sentiment']], grid_rf_prediction)
print("accuracy: ", acc)

{'2': {'precision': 0.949671772428884, 'recall': 0.9171597633136095, 'f1-score': 0.9331326596430876, 'support': 2366.0}, '4': {'precision': 0.9575113808801214, 'recall': 0.9746248896734334, 'f1-score': 0.9659923455440131, 'support': 4532.0}, 'accuracy': 0.9549144679617281, 'macro avg': {'precision': 0.9535915766545027, 'recall': 0.9458923264935215, 'f1-score': 0.9495625025935504, 'support': 6898.0}, 'weighted avg': {'precision': 0.954822411092411, 'recall': 0.9549144679617281, 'f1-score': 0.9547215399711528, 'support': 6898.0}}


accuracy:  0.9549144679617281
