In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the Amazon unlocked-mobile reviews; row 0 of the CSV supplies the column names.
dataSet = pd.read_csv(filepath_or_buffer="Amazon_Unlocked_Mobile.csv", header=0)
print(dataSet)

                                             Product Name Brand Name   Price  \
0       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
2       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
3       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
4       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
...                                                   ...        ...     ...   
401310  Studio 7.0 LTE - Smartphone - GSM Unlocked - Grey        BLU     NaN   
401311  Studio 7.0 LTE - Smartphone - GSM Unlocked - Grey        BLU     NaN   
401312  Studio 7.0 LTE - Smartphone - GSM Unlocked - Grey        BLU     NaN   
401313  Studio 7.0 LTE - Smartphone - GSM Unlocked - Grey        BLU     NaN   
401314  Studio 7.0 LTE - Smartphone - GSM Unlocked - Grey        BLU     NaN   

        Rating                         

In [None]:
# Keep only complete rows and work on the first 50,000 of them.
# .copy() makes the subset an independent frame, so adding the Sentiment
# column below is a plain assignment rather than a write into a slice of
# the original data.
dataSet = dataSet.dropna().iloc[0:50000, :].copy()

# Encode ratings of 4 and 5 as positive (4); every other rating as negative (2).
# The comparison is vectorised and addresses the columns by name, so it does
# not depend on 'Rating' and 'Sentiment' sitting at fixed column positions.
dataSet['Sentiment'] = np.where(dataSet['Rating'] > 3, 4, 2)

# First view keeps the rating next to the derived label as a sanity check.
datos = ['Reviews', 'Rating', 'Sentiment']
misDatos = dataSet[datos]
print(misDatos.head(20))

# Modelling frame: review text plus the binary sentiment label only.
misDatos = misDatos[['Reviews', 'Sentiment']]
print(misDatos.head(20))

                                              Reviews  Rating  Sentiment
0   I feel so LUCKY to have found this used (phone...       5          4
1   nice phone, nice up grade from my pantach revu...       4          4
2                                        Very pleased       5          4
3   It works good but it goes slow sometimes but i...       4          4
4   Great phone to replace my lost phone. The only...       4          4
5   I already had a phone with problems... I know ...       1          2
6   The charging port was loose. I got that solder...       2          2
7   Phone looks good but wouldn't stay charged, ha...       2          2
8   I originally was using the Samsung S2 Galaxy f...       5          4
9   It's battery life is great. It's very responsi...       3          2
10  My fiance had this phone previously, but cause...       3          2
11  This is a great product it came after two days...       5          4
12  These guys are the best! I had a little situat.

In [None]:
# Hold out the complement of the training fraction for evaluation.
train_size = 0.75
test_size = 1 - train_size

# random_state fixes the shuffle so the same rows land in each partition on every run.
trainSet, testSet = train_test_split(
    misDatos,
    random_state=0,
    test_size=test_size,
)

# Show both partitions, separated by blank lines.
print("train set", trainSet, "\n", "test set", testSet, sep="\n")

train set
                                                 Reviews  Sentiment
49657                                              Super          4
25773  Awesome service and went out of way to make su...          4
42791  The phone is as described, no complaints. Its ...          4
44626  Very undone. Speaker was blown, front facing h...          2
39543  The phone seems to be decent, but I can't be s...          2
...                                                  ...        ...
28884           It was exactly as described. Good price.          4
63238  This is the first negative review I have ever ...          2
57078                                   Not as described          2
58466             It came in on time and no issue at all          4
3664   Phone works great but the screen was/is cracke...          2

[37500 rows x 2 columns]


test set
                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
2

In [None]:
# Turn the review text into TF-IDF feature vectors.
# NOTE (translated from the original Spanish comment on stop_words): "this one
# is removed because the comments are in Spanish". The reviews loaded in this
# notebook are in English, and the English stop-word list is kept here.
vectorizer = TfidfVectorizer(
    use_idf=True,
    sublinear_tf=True,
    max_df=0.8,
    min_df=5,
    stop_words='english',
)

# The vocabulary is learned from the training reviews only; the test reviews
# are projected onto that same vocabulary.
train_vectors = vectorizer.fit_transform(trainSet['Reviews'])
test_vectors = vectorizer.transform(testSet['Reviews'])

# Print both sparse matrices, separated by blank lines.
print(train_vectors, "\n", test_vectors, sep="\n")

  (0, 4206)	1.0
  (1, 465)	0.3732214938577385
  (1, 3836)	0.34498262927372236
  (1, 4757)	0.37970489192330326
  (1, 4737)	0.37702388902195494
  (1, 2645)	0.3903473716133277
  (1, 4223)	0.39500536646931483
  (1, 3354)	0.315371065344649
  (1, 1926)	0.21796941358055422
  (2, 3132)	0.09440577389052866
  (2, 1227)	0.22183363192563105
  (2, 953)	0.3065525989603865
  (2, 4811)	0.2083967705063876
  (2, 4054)	0.46907793611622456
  (2, 4617)	0.21115755149998972
  (2, 2445)	0.41926090735462823
  (2, 1967)	0.13232587749610572
  (2, 736)	0.2562859284572832
  (2, 913)	0.43849454936536236
  (2, 3942)	0.29766132562048175
  (3, 4046)	0.23041057419377609
  (3, 592)	0.3161883134199908
  (3, 1637)	0.47112298092726057
  (3, 3556)	0.2242573659775506
  (3, 511)	0.15722438848853545
  :	:
  (37496, 314)	0.156436439166711
  (37496, 2877)	0.2467761527045587
  (37496, 2915)	0.2360719389896648
  (37496, 478)	0.16664428448470933
  (37496, 4742)	0.21859670380621032
  (37496, 4291)	0.2557946782329826
  (37496, 1198)	

In [None]:
# Perform classification with a decision tree (DT).
# random_state pins the tree's internal feature shuffling / tie-breaking so a
# re-run of the notebook fits the same tree.
classifier_dt = DecisionTreeClassifier(max_depth=5, random_state=0)

# Pass the labels as a 1-D Series (selected by name) rather than as a
# one-column DataFrame slice: scikit-learn classifiers expect y of shape
# (n_samples,).
classifier_dt.fit(train_vectors, trainSet['Sentiment'])

prediction_dt = classifier_dt.predict(test_vectors)

# Re-attach the test-set index so predictions line up with the reviews below.
df_dt = pd.DataFrame(prediction_dt, columns=['Sentiment'], index=testSet.index)
print(df_dt)
print("\n")

print(testSet)

       Sentiment
18699          4
27243          4
62620          2
33496          4
57107          4
...          ...
22243          4
10291          4
50657          4
38506          4
9309           4

[12500 rows x 1 columns]


                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
27243       Very fast shipping. Product like description          4
62620  I was very excited to receive this in the mail...          2
33496  It stoped working a week atfer i got it , it s...          2
57107                                           not good          2
...                                                  ...        ...
22243                               nice, smart and fast          4
10291  You cannot beat an iPhone on Straighttalk. The...          4
50657  It us for another person but I know they are good          4
38506                                         cool phone          4
9309   iphone is in 

In [None]:
# Perform classification with a random forest (RF).
# random_state fixes the bootstrap samples and per-split feature subsets, so
# the forest (and every metric derived from it) is reproducible across runs.
classifier_rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)

# Pass the labels as a 1-D Series (selected by name) rather than as a
# one-column DataFrame slice: a column-vector y makes scikit-learn emit a
# DataConversionWarning and reshape it internally.
classifier_rf.fit(train_vectors, trainSet['Sentiment'])

prediction_rf = classifier_rf.predict(test_vectors)

# Re-attach the test-set index so predictions line up with the reviews below.
df_rf = pd.DataFrame(prediction_rf, columns=['Sentiment'], index=testSet.index)
print(df_rf)
print("\n")

print(testSet)

       Sentiment
18699          4
27243          4
62620          4
33496          4
57107          4
...          ...
22243          4
10291          4
50657          4
38506          4
9309           4

[12500 rows x 1 columns]


                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
27243       Very fast shipping. Product like description          4
62620  I was very excited to receive this in the mail...          2
33496  It stoped working a week atfer i got it , it s...          2
57107                                           not good          2
...                                                  ...        ...
22243                               nice, smart and fast          4
10291  You cannot beat an iPhone on Straighttalk. The...          4
50657  It us for another person but I know they are good          4
38506                                         cool phone          4
9309   iphone is in 

In [None]:
# Perform classification with a support vector machine (SVM), polynomial kernel.
classifier_svm = svm.SVC(kernel="poly")

# Pass the labels as a 1-D Series (selected by name) rather than as a
# one-column DataFrame slice: a column-vector y makes scikit-learn emit a
# DataConversionWarning and reshape it internally.
classifier_svm.fit(train_vectors, trainSet['Sentiment'])

prediction_svm = classifier_svm.predict(test_vectors)

# Re-attach the test-set index so predictions line up with the reviews below.
df_svm = pd.DataFrame(prediction_svm, columns=['Sentiment'], index=testSet.index)
print(df_svm)
print("\n")

print(testSet)

       Sentiment
18699          4
27243          4
62620          2
33496          2
57107          4
...          ...
22243          4
10291          4
50657          4
38506          4
9309           4

[12500 rows x 1 columns]


                                                 Reviews  Sentiment
18699  Great purchase had a bit of problem getting Fa...          4
27243       Very fast shipping. Product like description          4
62620  I was very excited to receive this in the mail...          2
33496  It stoped working a week atfer i got it , it s...          2
57107                                           not good          2
...                                                  ...        ...
22243                               nice, smart and fast          4
10291  You cannot beat an iPhone on Straighttalk. The...          4
50657  It us for another person but I know they are good          4
38506                                         cool phone          4
9309   iphone is in 

In [None]:
# Row labels (one per classifier) and column labels (one per metric) shared by
# the result tables below.
methodsUsed = ['DT', 'RF', 'SVM']
performanceHeaders = ['precision', 'recall', 'f1-score']

# Three independent, initially empty (all-NaN) tables: positive, negative and
# neutral class metrics. Each is a separate DataFrame object.
modPerformancePos, modPerformanceNeg, modPerformanceNeut = (
    pd.DataFrame(columns=performanceHeaders, index=methodsUsed) for _ in range(3)
)

# One accuracy value per classifier.
dfAcc = pd.DataFrame(columns=['accuracy'], index=methodsUsed)

In [None]:
# Results report for the decision tree, as a nested dict keyed by class label.
report = classification_report(testSet[['Sentiment']], prediction_dt, output_dict=True)
print(report, "\n", sep="\n")

# DT metrics: one single-column frame per class ('4' = positive, '2' = negative).
print("DT metrics")
positive = report['4']
dfpos = pd.DataFrame.from_dict(positive, orient='index', columns=['positive'])
print(dfpos, "\n", sep="\n")
negative = report['2']
dfneg = pd.DataFrame.from_dict(negative, orient='index', columns=['negative'])

# Row 0 of dfAcc is the DT row.
dfAcc.iloc[0, 0] = accuracy_score(testSet[['Sentiment']], prediction_dt)
print("accuracy: ", dfAcc.iloc[0, 0])
print("\n")

# Comparative metrics: copy precision, recall and f1-score (the first three
# rows of each per-class frame) into row 0 of the comparison tables.
print("Comparative metrics")
for col in range(3):
    modPerformancePos.iloc[0, col] = dfpos.iloc[col, 0]
    modPerformanceNeg.iloc[0, col] = dfneg.iloc[col, 0]

{'2': {'precision': 0.8276157804459692, 'recall': 0.2544831223628692, 'f1-score': 0.3892698668818072, 'support': 3792.0}, '4': {'precision': 0.7505734956767249, 'recall': 0.9769177767570051, 'f1-score': 0.8489172737251771, 'support': 8708.0}, 'accuracy': 0.75776, 'macro avg': {'precision': 0.789094638061347, 'recall': 0.6157004495599372, 'f1-score': 0.6190935703034921, 'support': 12500.0}, 'weighted avg': {'precision': 0.7739450431843229, 'recall': 0.75776, 'f1-score': 0.7094786363851724, 'support': 12500.0}}


DT metrics
              positive
precision     0.750573
recall        0.976918
f1-score      0.848917
support    8708.000000


accuracy:  0.75776


Comparative metrics


In [None]:
# Results report for the random forest, as a nested dict keyed by class label.
report = classification_report(testSet[['Sentiment']], prediction_rf, output_dict=True)
print(report)
print("\n")

# RF metrics: one single-column frame per class ('4' = positive, '2' = negative).
print("RF metrics")
positive = report['4']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['2']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

# Store the accuracy in the 'RF' row and read the same row back when printing,
# so the printed value is this model's accuracy and not another model's.
dfAcc.loc['RF', 'accuracy'] = accuracy_score(testSet[['Sentiment']], prediction_rf)
print("accuracy: ", dfAcc.loc['RF', 'accuracy'])
print("\n")

# Comparative metrics: copy each metric by label into the 'RF' row, so the
# result does not depend on row/column positions.
print("Comparative metrics")
for metric in modPerformancePos.columns:
    modPerformancePos.loc['RF', metric] = dfpos.loc[metric, 'positive']
    modPerformanceNeg.loc['RF', metric] = dfneg.loc[metric, 'negative']

{'2': {'precision': 1.0, 'recall': 0.0058016877637130804, 'f1-score': 0.011536444677503933, 'support': 3792.0}, '4': {'precision': 0.6978682481166854, 'recall': 1.0, 'f1-score': 0.8220522986878127, 'support': 8708.0}, 'accuracy': 0.6984, 'macro avg': {'precision': 0.8489341240583427, 'recall': 0.5029008438818565, 'f1-score': 0.4167943716826583, 'support': 12500.0}, 'weighted avg': {'precision': 0.7895229363680076, 'recall': 0.6984, 'f1-score': 0.5761742092152455, 'support': 12500.0}}


RF metrics
              positive
precision     0.697868
recall        1.000000
f1-score      0.822052
support    8708.000000


accuracy:  0.75776


Comparative metrics


In [None]:
# Results report for the SVM, as a nested dict keyed by class label.
report = classification_report(testSet[['Sentiment']], prediction_svm, output_dict=True)
print(report)
print("\n")

# SVM metrics: one single-column frame per class ('4' = positive, '2' = negative).
print("SVM metrics")
positive = report['4']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['2']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

# Score the SVM predictions (not another model's) and store the result in the
# 'SVM' row; read the same row back when printing so the value shown matches.
dfAcc.loc['SVM', 'accuracy'] = accuracy_score(testSet[['Sentiment']], prediction_svm)
print("accuracy: ", dfAcc.loc['SVM', 'accuracy'])
print("\n")

# Comparative metrics: copy each metric by label into the 'SVM' row, so the
# result does not depend on row/column positions.
print("Comparative metrics")
for metric in modPerformancePos.columns:
    modPerformancePos.loc['SVM', metric] = dfpos.loc[metric, 'positive']
    modPerformanceNeg.loc['SVM', metric] = dfneg.loc[metric, 'negative']

{'2': {'precision': 0.9794738363688927, 'recall': 0.8934599156118144, 'f1-score': 0.9344917942352778, 'support': 3792.0}, '4': {'precision': 0.9553146775799137, 'recall': 0.9918465778594396, 'f1-score': 0.9732379288974027, 'support': 8708.0}, 'accuracy': 0.962, 'macro avg': {'precision': 0.9673942569744032, 'recall': 0.942653246735627, 'f1-score': 0.9538648615663403, 'support': 12500.0}, 'weighted avg': {'precision': 0.9626435999901383, 'recall': 0.962, 'f1-score': 0.9614839014863005, 'support': 12500.0}}


SVM metrics
              positive
precision     0.955315
recall        0.991847
f1-score      0.973238
support    8708.000000


accuracy:  0.75776


Comparative metrics


In [None]:
# Final comparison: per-class metric tables followed by the accuracy table,
# with blank lines between the sections.
print(
    "Positive comments metrics",
    modPerformancePos,
    "\n",
    "Negative comments metrics",
    modPerformanceNeg,
    "\n",
    dfAcc,
    sep="\n",
)

Positive comments metrics
    precision    recall  f1-score
DT   0.750573  0.976918  0.848917
RF   0.697868       1.0  0.822052
SVM  0.955315  0.991847  0.973238


Negative comments metrics
    precision    recall  f1-score
DT   0.827616  0.254483   0.38927
RF        1.0  0.005802  0.011536
SVM  0.979474   0.89346  0.934492


    accuracy
DT   0.75776
RF    0.6984
SVM   0.6984
