In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
dataSet = pd.read_csv("Amazon_Unlocked_Mobile.csv", header=0)
print(dataSet)

                                             Product Name Brand Name   Price  \
0       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
2       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
3       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
4       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
...                                                   ...        ...     ...   
413835  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413836  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413837  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413838  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413839  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   

        Rating                         

In [4]:
dataSet.dropna(inplace=True)
dataSet = dataSet.iloc[0:50000,:]

# Encode 4s and 5s as positive
# Encode 1s and 2s as negative
dataSet['Positively Rated'] = 0
dataSet['Poorly Rated'] = 0

for row in range(len(dataSet)):
  if dataSet.iloc[row,3] > 3:
    dataSet.iloc[row,6] = 1
  if dataSet.iloc[row,3] < 3:
    dataSet.iloc[row,7] = 1

datos = ['Reviews', 'Rating', 'Positively Rated','Poorly Rated']
misDatos = dataSet[datos]
print(misDatos.head(20))

misDatos = misDatos[['Reviews', 'Positively Rated','Poorly Rated']]
print(misDatos.head(20))

                                              Reviews  Rating  \
0   I feel so LUCKY to have found this used (phone...       5   
1   nice phone, nice up grade from my pantach revu...       4   
2                                        Very pleased       5   
3   It works good but it goes slow sometimes but i...       4   
4   Great phone to replace my lost phone. The only...       4   
5   I already had a phone with problems... I know ...       1   
6   The charging port was loose. I got that solder...       2   
7   Phone looks good but wouldn't stay charged, ha...       2   
8   I originally was using the Samsung S2 Galaxy f...       5   
9   It's battery life is great. It's very responsi...       3   
10  My fiance had this phone previously, but cause...       3   
11  This is a great product it came after two days...       5   
12  These guys are the best! I had a little situat...       5   
13  I'm really disappointed about my phone and ser...       1   
14  Ordered this phone as

In [5]:
train_size = 0.75
test_size = 1 - train_size
# Split data into train and test sets
trainSet, testSet = train_test_split(misDatos, test_size=test_size, random_state=0)

print("train set")
print(trainSet)
print("\n")
print("test set")
print(testSet)

train set
                                                 Reviews  Positively Rated  \
49657                                              Super                 1   
25773  Awesome service and went out of way to make su...                 1   
42791  The phone is as described, no complaints. Its ...                 1   
44626  Very undone. Speaker was blown, front facing h...                 0   
39543  The phone seems to be decent, but I can't be s...                 0   
...                                                  ...               ...   
28884           It was exactly as described. Good price.                 1   
63238  This is the first negative review I have ever ...                 0   
57078                                   Not as described                 0   
58466             It came in on time and no issue at all                 1   
3664   Phone works great but the screen was/is cracke...                 0   

       Poorly Rated  
49657             0  
25773    

In [6]:
vectorizer = TfidfVectorizer(stop_words='english',
                             min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(trainSet['Reviews'])
test_vectors = vectorizer.transform(testSet['Reviews'])

print(train_vectors)
print("\n")
print(test_vectors)

  (0, 4206)	1.0
  (1, 465)	0.3732214938577385
  (1, 3836)	0.34498262927372236
  (1, 4757)	0.37970489192330326
  (1, 4737)	0.37702388902195494
  (1, 2645)	0.3903473716133277
  (1, 4223)	0.39500536646931483
  (1, 3354)	0.315371065344649
  (1, 1926)	0.21796941358055422
  (2, 3132)	0.09440577389052866
  (2, 1227)	0.22183363192563105
  (2, 953)	0.3065525989603865
  (2, 4811)	0.2083967705063876
  (2, 4054)	0.46907793611622456
  (2, 4617)	0.21115755149998972
  (2, 2445)	0.41926090735462823
  (2, 1967)	0.13232587749610572
  (2, 736)	0.2562859284572832
  (2, 913)	0.43849454936536236
  (2, 3942)	0.29766132562048175
  (3, 4046)	0.23041057419377609
  (3, 592)	0.3161883134199908
  (3, 1637)	0.47112298092726057
  (3, 3556)	0.2242573659775506
  (3, 511)	0.15722438848853545
  :	:
  (37496, 314)	0.156436439166711
  (37496, 2877)	0.2467761527045587
  (37496, 2915)	0.2360719389896648
  (37496, 478)	0.16664428448470933
  (37496, 4742)	0.21859670380621032
  (37496, 4291)	0.2557946782329826
  (37496, 1198)	

In [7]:
# Perform classification with DT
classifier_dt = DecisionTreeClassifier(max_depth=5)
classifier_dt.fit(train_vectors, trainSet.iloc[:,1:])

prediction_dt = classifier_dt.predict(test_vectors)

df_dt = pd.DataFrame(prediction_dt, columns=['Positively Rated','Poorly Rated'], index=testSet.index)
print(df_dt)
print("\n")

print(testSet)

       Positively Rated  Poorly Rated
18699                 1             0
27243                 1             0
62620                 0             0
33496                 1             0
57107                 1             0
...                 ...           ...
22243                 1             0
10291                 1             0
50657                 1             0
38506                 1             0
9309                  1             0

[12500 rows x 2 columns]


                                                 Reviews  Positively Rated  \
18699  Great purchase had a bit of problem getting Fa...                 1   
27243       Very fast shipping. Product like description                 1   
62620  I was very excited to receive this in the mail...                 0   
33496  It stoped working a week atfer i got it , it s...                 0   
57107                                           not good                 0   
...                                             

In [8]:
# Perform the class with RF
classifier_rf = RandomForestClassifier(n_estimators=100, max_depth=5)
classifier_rf.fit(train_vectors, trainSet.iloc[:,1:])

prediction_rf = classifier_rf.predict(test_vectors)

df_rf = pd.DataFrame(prediction_rf, columns=['Positively Rated','Poorly Rated'], index=testSet.index)
print(df_rf)
print("\n")

print(testSet)

       Positively Rated  Poorly Rated
18699                 1             0
27243                 1             0
62620                 1             0
33496                 1             0
57107                 1             0
...                 ...           ...
22243                 1             0
10291                 1             0
50657                 1             0
38506                 1             0
9309                  1             0

[12500 rows x 2 columns]


                                                 Reviews  Positively Rated  \
18699  Great purchase had a bit of problem getting Fa...                 1   
27243       Very fast shipping. Product like description                 1   
62620  I was very excited to receive this in the mail...                 0   
33496  It stoped working a week atfer i got it , it s...                 0   
57107                                           not good                 0   
...                                             

In [9]:
methodsUsed=['DT', 'RF']
performanceHeaders=['precision','recall','f1-score']
modPerformancePos = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
modPerformanceNeg = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
modPerformanceNeut = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
dfAcc= pd.DataFrame(index=methodsUsed, columns=['accuracy'])

In [11]:
# results report
report = classification_report(testSet[['Positively Rated','Poorly Rated']], prediction_dt, output_dict=True)
print(report)
print("\n")

# DT metrics
print("DT metrics")
positive = report['0']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['1']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

dfAcc.iloc[0,0] = accuracy_score(testSet[['Positively Rated','Poorly Rated']], prediction_dt)
print("accuracy: ", dfAcc.iloc[0,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[0,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[0,0] = dfneg.iloc[0,0]
modPerformancePos.iloc[0,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[0,1] = dfneg.iloc[1,0]
modPerformancePos.iloc[0,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[0,2] = dfneg.iloc[2,0]

{'0': {'precision': 0.7512613968310171, 'recall': 0.9746210381258613, 'f1-score': 0.8484878780304924, 'support': 8708.0}, '1': {'precision': 0.7095937770095073, 'recall': 0.2695338148391333, 'f1-score': 0.3906733285748275, 'support': 3046.0}, 'micro avg': {'precision': 0.7473903966597077, 'recall': 0.7919006295729113, 'f1-score': 0.7690019828155982, 'support': 11754.0}, 'macro avg': {'precision': 0.7304275869202622, 'recall': 0.6220774264824973, 'f1-score': 0.61958060330266, 'support': 11754.0}, 'weighted avg': {'precision': 0.7404634072124772, 'recall': 0.7919006295729113, 'f1-score': 0.7298471499683897, 'support': 11754.0}, 'samples avg': {'precision': 0.74464, 'recall': 0.74464, 'f1-score': 0.74464, 'support': 11754.0}}


DT metrics
              positive
precision     0.751261
recall        0.974621
f1-score      0.848488
support    8708.000000


accuracy:  0.74584


Comparative metrics


In [13]:
# results report
report = classification_report(testSet[['Positively Rated','Poorly Rated']], prediction_rf, output_dict=True)
print(report)
print("\n")

# RF metrics
print("RF metrics")
positive = report['0']
dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
print(dfpos)
print("\n")
negative = report['1']
dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

dfAcc.iloc[1,0] = accuracy_score(testSet[['Positively Rated','Poorly Rated']], prediction_rf)
print("accuracy: ", dfAcc.iloc[0,0])
print("\n")

# Comparative metrics
print("Comparative metrics")
modPerformancePos.iloc[1,0] = dfpos.iloc[0,0]
modPerformanceNeg.iloc[1,0] = dfneg.iloc[0,0]
modPerformancePos.iloc[1,1] = dfpos.iloc[1,0]
modPerformanceNeg.iloc[1,1] = dfneg.iloc[1,0]
modPerformancePos.iloc[1,2] = dfpos.iloc[2,0]
modPerformanceNeg.iloc[1,2] = dfneg.iloc[2,0]

{'0': {'precision': 0.6974211116450425, 'recall': 1.0, 'f1-score': 0.8217420024535246, 'support': 8708.0}, '1': {'precision': 1.0, 'recall': 0.0006565988181221273, 'f1-score': 0.0013123359580052493, 'support': 3046.0}, 'micro avg': {'precision': 0.6974695707879565, 'recall': 0.7410243321422495, 'f1-score': 0.7185875752825674, 'support': 11754.0}, 'macro avg': {'precision': 0.8487105558225212, 'recall': 0.5003282994090611, 'f1-score': 0.41152716920576493, 'support': 11754.0}, 'weighted avg': {'precision': 0.7758331665990327, 'recall': 0.7410243321422495, 'f1-score': 0.6091310815631594, 'support': 11754.0}, 'samples avg': {'precision': 0.6968, 'recall': 0.6968, 'f1-score': 0.6968, 'support': 11754.0}}


RF metrics
              positive
precision     0.697421
recall        1.000000
f1-score      0.821742
support    8708.000000


accuracy:  0.74584


Comparative metrics


In [14]:
print("Positive comments metrics")
print(modPerformancePos)
print("\n")
print("Negative comments metrics")
print(modPerformanceNeg)
print("\n")
print(dfAcc)

Positive comments metrics
   precision    recall  f1-score
DT  0.751261  0.974621  0.848488
RF  0.697421       1.0  0.821742


Negative comments metrics
   precision    recall  f1-score
DT  0.709594  0.269534  0.390673
RF       1.0  0.000657  0.001312


   accuracy
DT  0.74584
RF   0.6968
