# Import libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
from sklearn.model_selection import GridSearchCV

# Read data

In [1]:
dataSet = pd.read_csv("Amazon_Unlocked_Mobile.zip", header=0, compression='zip')
print(dataSet)

NameError: name 'pd' is not defined

# Preprocess data

In [None]:
dataSet.dropna(inplace=True)
dataSet = dataSet.iloc[0:50000,:]

# Encode 4s and 5s as positive
# Encode 3s as neutraal
# Encode 1s and 2s as negative
dataSet['Positively Rated'] = 0
dataSet['Poorly Rated'] = 0
dataSet['Neutral'] = 0

for row in range(len(dataSet)):
  if dataSet.iloc[row,3] > 3:
    dataSet.iloc[row,6] = 1
  elif dataSet.iloc[row,3] < 3:
    dataSet.iloc[row,7] = 1
  else:
    dataSet.iloc[row,8] = 1

datos = ['Reviews', 'Rating', 'Positively Rated','Poorly Rated','Neutral']
misDatos = dataSet[datos]
print(misDatos.head(20))

misDatos = misDatos[['Reviews', 'Positively Rated','Poorly Rated','Neutral']]
print(misDatos.head(20))

                                              Reviews  Rating  \
0   I feel so LUCKY to have found this used (phone...       5   
1   nice phone, nice up grade from my pantach revu...       4   
2                                        Very pleased       5   
3   It works good but it goes slow sometimes but i...       4   
4   Great phone to replace my lost phone. The only...       4   
5   I already had a phone with problems... I know ...       1   
6   The charging port was loose. I got that solder...       2   
7   Phone looks good but wouldn't stay charged, ha...       2   
8   I originally was using the Samsung S2 Galaxy f...       5   
9   It's battery life is great. It's very responsi...       3   
10  My fiance had this phone previously, but cause...       3   
11  This is a great product it came after two days...       5   
12  These guys are the best! I had a little situat...       5   
13  I'm really disappointed about my phone and ser...       1   
14  Ordered this phone as

# Split data

In [None]:
train_size = 0.75
test_size = 1 - train_size
# Split data into train and test sets
trainSet, testSet = train_test_split(misDatos, test_size=test_size, random_state=0)

print("train set")
print(trainSet)
print("\n")
print("test set")
print(testSet)

train set
                                                 Reviews  Positively Rated  \
49657                                              Super                 1   
25773  Awesome service and went out of way to make su...                 1   
42791  The phone is as described, no complaints. Its ...                 1   
44626  Very undone. Speaker was blown, front facing h...                 0   
39543  The phone seems to be decent, but I can't be s...                 0   
...                                                  ...               ...   
28884           It was exactly as described. Good price.                 1   
63238  This is the first negative review I have ever ...                 0   
57078                                   Not as described                 0   
58466             It came in on time and no issue at all                 1   
3664   Phone works great but the screen was/is cracke...                 0   

       Poorly Rated  Neutral  
49657             0   

# Vectorize data

In [None]:
vectorizer = TfidfVectorizer(stop_words='english',
                             min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(trainSet['Reviews'])
test_vectors = vectorizer.transform(testSet['Reviews'])

print(train_vectors)
print("\n")
print(test_vectors)

  (0, 4206)	1.0
  (1, 465)	0.3732214938577385
  (1, 3836)	0.34498262927372236
  (1, 4757)	0.37970489192330326
  (1, 4737)	0.37702388902195494
  (1, 2645)	0.3903473716133277
  (1, 4223)	0.39500536646931483
  (1, 3354)	0.315371065344649
  (1, 1926)	0.21796941358055422
  (2, 3132)	0.09440577389052866
  (2, 1227)	0.22183363192563105
  (2, 953)	0.3065525989603865
  (2, 4811)	0.2083967705063876
  (2, 4054)	0.46907793611622456
  (2, 4617)	0.21115755149998972
  (2, 2445)	0.41926090735462823
  (2, 1967)	0.13232587749610572
  (2, 736)	0.2562859284572832
  (2, 913)	0.43849454936536236
  (2, 3942)	0.29766132562048175
  (3, 4046)	0.23041057419377609
  (3, 592)	0.3161883134199908
  (3, 1637)	0.47112298092726057
  (3, 3556)	0.2242573659775506
  (3, 511)	0.15722438848853545
  :	:
  (37496, 314)	0.156436439166711
  (37496, 2877)	0.2467761527045587
  (37496, 2915)	0.2360719389896648
  (37496, 478)	0.16664428448470933
  (37496, 4742)	0.21859670380621032
  (37496, 4291)	0.2557946782329826
  (37496, 1198)	

# Implement models

In [None]:
# Perform classification with DT
classifier_dt = DecisionTreeClassifier(max_depth=5)
classifier_dt.fit(train_vectors, trainSet.iloc[:,1:])

prediction_dt = classifier_dt.predict(test_vectors)

df_dt = pd.DataFrame(prediction_dt, columns=['Positively Rated','Poorly Rated','Neutral'], index=testSet.index)
print(df_dt)
print("\n")

print(testSet)

       Positively Rated  Poorly Rated  Neutral
18699                 1             0        0
27243                 1             0        0
62620                 0             0        1
33496                 1             0        0
57107                 1             0        0
...                 ...           ...      ...
22243                 1             0        0
10291                 1             0        0
50657                 1             0        0
38506                 0             0        0
9309                  1             0        0

[12500 rows x 3 columns]


                                                 Reviews  Positively Rated  \
18699  Great purchase had a bit of problem getting Fa...                 1   
27243       Very fast shipping. Product like description                 1   
62620  I was very excited to receive this in the mail...                 0   
33496  It stoped working a week atfer i got it , it s...                 0   
57107             

In [None]:
# Perform the class with RF
classifier_rf = RandomForestClassifier(n_estimators=100, max_depth=5)
classifier_rf.fit(train_vectors, trainSet.iloc[:,1:])

prediction_rf = classifier_rf.predict(test_vectors)

df_rf = pd.DataFrame(prediction_rf, columns=['Positively Rated','Poorly Rated','Neutral'], index=testSet.index)
print(df_rf)
print("\n")

print(testSet)

       Positively Rated  Poorly Rated  Neutral
18699                 1             0        0
27243                 1             0        0
62620                 1             0        0
33496                 1             0        0
57107                 1             0        0
...                 ...           ...      ...
22243                 1             0        0
10291                 1             0        0
50657                 1             0        0
38506                 1             0        0
9309                  1             0        0

[12500 rows x 3 columns]


                                                 Reviews  Positively Rated  \
18699  Great purchase had a bit of problem getting Fa...                 1   
27243       Very fast shipping. Product like description                 1   
62620  I was very excited to receive this in the mail...                 0   
33496  It stoped working a week atfer i got it , it s...                 0   
57107             

# Create metrics

In [None]:
methodsUsed=['DT', 'RF', 'GRID DT', 'GRID RF']
performanceHeaders=['precision','recall','f1-score']
modPerformancePos = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
modPerformanceNeg = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
modPerformanceNeut = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
dfAcc= pd.DataFrame(index=methodsUsed, columns=['accuracy'])

In [None]:
# results report
report = classification_report(testSet[['Positively Rated','Poorly Rated','Neutral']], prediction_dt, output_dict=True)
print(report)
print("\n")

# DT metrics
print("DT metrics")
positive = report['0']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['1']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')
neutral = report['2']
dfneut = pd.DataFrame.from_dict(neutral, columns=['neutral'], orient='index')

dfAcc.iloc[0,0] = accuracy_score(testSet[['Positively Rated','Poorly Rated','Neutral']], prediction_dt)
print("accuracy: ", dfAcc.iloc[0,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[0,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[0,0] = dfneg.iloc[0,0]
modPerformanceNeut.iloc[0,0] = dfneut.iloc[0,0]
modPerformancePos.iloc[0,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[0,1] = dfneg.iloc[1,0]
modPerformanceNeut.iloc[0,1] = dfneut.iloc[1,0]
modPerformancePos.iloc[0,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[0,2] = dfneg.iloc[2,0]
modPerformanceNeut.iloc[0,2] = dfneut.iloc[2,0]

{'0': {'precision': 0.8226349390597795, 'recall': 0.813849333945797, 'f1-score': 0.8182185533683543, 'support': 8708.0}, '1': {'precision': 0.6938369781312127, 'recall': 0.22915298752462246, 'f1-score': 0.3445212240868707, 'support': 3046.0}, '2': {'precision': 0.8, 'recall': 0.0160857908847185, 'f1-score': 0.03153745072273324, 'support': 746.0}, 'micro avg': {'precision': 0.8091531755915318, 'recall': 0.62376, 'f1-score': 0.7044633176725695, 'support': 12500.0}, 'macro avg': {'precision': 0.7721573057303308, 'recall': 0.353029370785046, 'f1-score': 0.3980924093926527, 'support': 12500.0}, 'weighted avg': {'precision': 0.7898985987776186, 'recall': 0.62376, 'f1-score': 0.6558388599631517, 'support': 12500.0}, 'samples avg': {'precision': 0.62376, 'recall': 0.62376, 'f1-score': 0.62376, 'support': 12500.0}}


DT metrics
              positive
precision     0.822635
recall        0.813849
f1-score      0.818219
support    8708.000000


accuracy:  0.62376


Comparative metrics


In [None]:
# results report
report = classification_report(testSet[['Positively Rated','Poorly Rated','Neutral']], prediction_rf, output_dict=True)
print(report)
print("\n")

# RF metrics
print("RF metrics")
positive = report['0']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['1']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')
neutral = report['2']
dfneut = pd.DataFrame.from_dict(neutral, columns=['neutral'], orient='index')

dfAcc.iloc[1,0] = accuracy_score(testSet[['Positively Rated','Poorly Rated','Neutral']], prediction_rf)
print("accuracy: ", dfAcc.iloc[1,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[1,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[1,0] = dfneg.iloc[0,0]
modPerformanceNeut.iloc[1,0] = dfneut.iloc[0,0]
modPerformancePos.iloc[1,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[1,1] = dfneg.iloc[1,0]
modPerformanceNeut.iloc[1,1] = dfneut.iloc[1,0]
modPerformancePos.iloc[1,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[1,2] = dfneg.iloc[2,0]
modPerformanceNeut.iloc[1,2] = dfneut.iloc[2,0]

{'0': {'precision': 0.6977564102564102, 'recall': 1.0, 'f1-score': 0.8219747026618841, 'support': 8708.0}, '1': {'precision': 1.0, 'recall': 0.00032829940906106366, 'f1-score': 0.0006563833278634722, 'support': 3046.0}, '2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 746.0}, 'micro avg': {'precision': 0.6977806265523596, 'recall': 0.69672, 'f1-score': 0.697249909931548, 'support': 12500.0}, 'macro avg': {'precision': 0.5659188034188034, 'recall': 0.333442766469687, 'f1-score': 0.2742103619965825, 'support': 12500.0}, 'weighted avg': {'precision': 0.7297650256410255, 'recall': 0.69672, 'f1-score': 0.5727804043517087, 'support': 12500.0}, 'samples avg': {'precision': 0.69672, 'recall': 0.69672, 'f1-score': 0.69672, 'support': 12500.0}}


RF metrics
              positive
precision     0.697756
recall        1.000000
f1-score      0.821975
support    8708.000000


accuracy:  0.69672


Comparative metrics


In [None]:
print("Positive comments metrics")
print(modPerformancePos)
print("\n")
print("Negative comments metrics")
print(modPerformanceNeg)
print("\n")
print("Neutral comments metrics")
print(modPerformanceNeut)
print("\n")
print(dfAcc)

Positive comments metrics
        precision    recall  f1-score
DT       0.822635  0.813849  0.818219
RF       0.697756       1.0  0.821975
GRID DT       NaN       NaN       NaN
GRID RF       NaN       NaN       NaN


Negative comments metrics
        precision    recall  f1-score
DT       0.693837  0.229153  0.344521
RF            1.0  0.000328  0.000656
GRID DT       NaN       NaN       NaN
GRID RF       NaN       NaN       NaN


Neutral comments metrics
        precision    recall  f1-score
DT            0.8  0.016086  0.031537
RF            0.0       0.0       0.0
GRID DT       NaN       NaN       NaN
GRID RF       NaN       NaN       NaN


        accuracy
DT       0.62376
RF       0.69672
GRID DT      NaN
GRID RF      NaN


# Fine-tuning

Grid Search for DT

In [None]:
# Perform classification with DT
print('Classification with DT')
dt_clf = DecisionTreeClassifier()
param_search_dt = {
        'criterion': ["gini", 'entropy'],
        'max_depth': [5, 10, 20, 30, None]
    }

grid_search_dt = GridSearchCV(estimator=dt_clf, param_grid=param_search_dt, cv=5, verbose=1)
grid_search_dt.fit(train_vectors, trainSet.iloc[:,1:])
best_clf_dt = grid_search_dt.best_estimator_

print(best_clf_dt.get_params())
print("\n")

grid_dt_prediction = best_clf_dt.predict(test_vectors)

df_grid_dt = pd.DataFrame(grid_dt_prediction, columns=['Positively Rated','Poorly Rated','Neutral'], index=testSet.index)
print(df_grid_dt)
print("\n")

print(testSet)

Classification with DT
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


       Positively Rated  Poorly Rated  Neutral
18699                 1             0        0
27243                 1             0        0
62620                 0             0        1
33496                 0             1        0
57107                 1             0        0
...                 ...           ...      ...
22243                 1             0        0
10291                 1             0        0
50657                 1             0        0
38506                 1             0        0
9309                  1             0        0

[12500 rows x 3 columns]


                    

Grid Searc for the RF

In [None]:
# Perform classification with RF
print('Classification with RF')
rf_clf = RandomForestClassifier()
param_search_rf = {
        'n_estimators': [60, 80, 100, 120, 150],
        'max_depth': [5, 10, 20, 30, 40, None]
    }

grid_search_rf = GridSearchCV(estimator=rf_clf, param_grid=param_search_rf, cv=5, verbose=1)
grid_search_rf.fit(train_vectors, trainSet.iloc[:,1:])
best_clf_rf = grid_search_rf.best_estimator_

print(best_clf_rf.get_params())
print("\n")

grid_rf_prediction = best_clf_rf.predict(test_vectors)

df_grid_rf = pd.DataFrame(grid_rf_prediction, columns=['Positively Rated','Poorly Rated','Neutral'], index=testSet.index)
print(df_grid_rf)
print("\n")

print(testSet)

Classification with RF
Fitting 5 folds for each of 30 candidates, totalling 150 fits
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 150, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


       Positively Rated  Poorly Rated  Neutral
18699                 1             0        0
27243                 1             0        0
62620                 0             0        1
33496                 0             1        0
57107                 1             0        0
...                 ...           ...      ...
22243                 1             0        0
10291                 1             0        0
50657                 1             0        0
38506                 1    

# New Metrics

In [None]:
# results report
report = classification_report(testSet[['Positively Rated','Poorly Rated','Neutral']], grid_dt_prediction, output_dict=True)
print(report)
print("\n")

# RF metrics
print("GRID DT metrics")
positive = report['0']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['1']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')
neutral = report['2']
dfneut = pd.DataFrame.from_dict(neutral, columns=['neutral'], orient='index')

dfAcc.iloc[2,0] = accuracy_score(testSet[['Positively Rated','Poorly Rated','Neutral']], grid_dt_prediction)
print("accuracy: ", dfAcc.iloc[2,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[2,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[2,0] = dfneg.iloc[0,0]
modPerformanceNeut.iloc[2,0] = dfneut.iloc[0,0]
modPerformancePos.iloc[2,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[2,1] = dfneg.iloc[1,0]
modPerformanceNeut.iloc[2,1] = dfneut.iloc[1,0]
modPerformancePos.iloc[2,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[2,2] = dfneg.iloc[2,0]
modPerformanceNeut.iloc[2,2] = dfneut.iloc[2,0]

{'0': {'precision': 0.960018070928394, 'recall': 0.9761139182361047, 'f1-score': 0.9679990889420339, 'support': 8708.0}, '1': {'precision': 0.9186937687437521, 'recall': 0.9051214707813526, 'f1-score': 0.9118571192326773, 'support': 3046.0}, '2': {'precision': 0.8508634222919937, 'recall': 0.7265415549597856, 'f1-score': 0.7838033261026753, 'support': 746.0}, 'micro avg': {'precision': 0.9445244956772334, 'recall': 0.94392, 'f1-score': 0.9442221510883483, 'support': 12500.0}, 'macro avg': {'precision': 0.9098584206547132, 'recall': 0.8692589813257476, 'f1-score': 0.8878865114257956, 'support': 12500.0}, 'weighted avg': {'precision': 0.94343381554142, 'recall': 0.94392, 'f1-score': 0.943325610637005, 'support': 12500.0}, 'samples avg': {'precision': 0.94392, 'recall': 0.94392, 'f1-score': 0.94392, 'support': 12500.0}}


GRID DT metrics
              positive
precision     0.960018
recall        0.976114
f1-score      0.967999
support    8708.000000


accuracy:  0.94392


Comparative met

In [None]:
# results report
report = classification_report(testSet[['Positively Rated','Poorly Rated','Neutral']], grid_rf_prediction, output_dict=True)
print(report)
print("\n")

# RF metrics
print("GRID DT metrics")
positive = report['0']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['1']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')
neutral = report['2']
dfneut = pd.DataFrame.from_dict(neutral, columns=['neutral'], orient='index')

dfAcc.iloc[3,0] = accuracy_score(testSet[['Positively Rated','Poorly Rated','Neutral']], grid_rf_prediction)
print("accuracy: ", dfAcc.iloc[3,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[3,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[3,0] = dfneg.iloc[0,0]
modPerformanceNeut.iloc[3,0] = dfneut.iloc[0,0]
modPerformancePos.iloc[3,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[3,1] = dfneg.iloc[1,0]
modPerformanceNeut.iloc[3,1] = dfneut.iloc[1,0]
modPerformancePos.iloc[3,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[3,2] = dfneg.iloc[2,0]
modPerformanceNeut.iloc[3,2] = dfneut.iloc[2,0]

{'0': {'precision': 0.9710112105084362, 'recall': 0.9847266881028939, 'f1-score': 0.9778208563772165, 'support': 8708.0}, '1': {'precision': 0.9452237808951236, 'recall': 0.9290873276428102, 'f1-score': 0.9370860927152318, 'support': 3046.0}, '2': {'precision': 0.9980879541108987, 'recall': 0.6997319034852547, 'f1-score': 0.8226950354609929, 'support': 746.0}, 'micro avg': {'precision': 0.9659054097829608, 'recall': 0.95416, 'f1-score': 0.9599967804249839, 'support': 12500.0}, 'macro avg': {'precision': 0.9714409818381529, 'recall': 0.8711819730769864, 'f1-score': 0.9125339948511471, 'support': 12500.0}, 'weighted avg': {'precision': 0.9663432697184591, 'recall': 0.95416, 'f1-score': 0.9586367001757837, 'support': 12500.0}, 'samples avg': {'precision': 0.95416, 'recall': 0.95416, 'f1-score': 0.95416, 'support': 12500.0}}


GRID DT metrics
              positive
precision     0.971011
recall        0.984727
f1-score      0.977821
support    8708.000000


accuracy:  0.95416


Comparative

In [None]:
print("Positive comments metrics")
print(modPerformancePos)
print("\n")
print("Negative comments metrics")
print(modPerformanceNeg)
print("\n")
print("Neutral comments metrics")
print(modPerformanceNeut)
print("\n")
print(dfAcc)

Positive comments metrics
        precision    recall  f1-score
DT       0.822635  0.813849  0.818219
RF       0.697756       1.0  0.821975
GRID DT  0.960018  0.976114  0.967999
GRID RF  0.971011  0.984727  0.977821


Negative comments metrics
        precision    recall  f1-score
DT       0.693837  0.229153  0.344521
RF            1.0  0.000328  0.000656
GRID DT  0.918694  0.905121  0.911857
GRID RF  0.945224  0.929087  0.937086


Neutral comments metrics
        precision    recall  f1-score
DT            0.8  0.016086  0.031537
RF            0.0       0.0       0.0
GRID DT  0.850863  0.726542  0.783803
GRID RF  0.998088  0.699732  0.822695


        accuracy
DT       0.62376
RF       0.69672
GRID DT  0.94392
GRID RF  0.95416
