In [6]:
import warnings
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from google.colab import drive
# Mount Google Drive under the relative folder 'gdrive'; the dataset path
# read by leeDatos() starts with that folder. Only works inside Colab.
drive.mount('gdrive')

Mounted at gdrive


In [3]:
def main():
    """Run the whole sentiment pipeline: load, encode, split, vectorize, classify.

    Every intermediate table is printed so the notebook output documents
    each stage. Returns nothing.
    """
    # Silence all warnings for the rest of the session.
    warnings.filterwarnings("ignore")

    raw_reviews = leeDatos()
    print(raw_reviews)
    print("\n")

    labelled = encodeData(raw_reviews)
    print(labelled)
    print("\n")
    # Share of reviews labelled positive (class balance).
    print(labelled['Positively Rated'].mean())
    print("\n")

    held_out_fraction = .25
    train_part, test_part = splitDataSet(labelled, test_size=held_out_fraction)
    for caption, part in (('trainSet shape: ', train_part),
                          ('testSet shape: ', test_part)):
        print(part)
        print(caption, part.shape)
    print("\n")

    train_matrix, test_matrix = processData(train_part, test_part)

    metodosML(train_matrix, train_part, test_matrix, test_part)


def leeDatos(path="gdrive/MyDrive/Colab Notebooks/Amazon_Unlocked_Mobile.zip",
             compression='zip'):
    """Load the reviews CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the (compressed) CSV file. Defaults to the archive
        stored in the mounted Google Drive folder.
    compression : str or None, optional
        Compression passed to ``pandas.read_csv``; defaults to ``'zip'``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first row used as column names.

    Raises
    ------
    FileNotFoundError
        Raised by pandas if ``path`` does not exist (e.g. Drive not mounted).
    """
    return pd.read_csv(path, header=0, compression=compression)

def encodeData(dataSet=0, max_rows=5000):
    """Build the binary sentiment table used for training.

    Rows containing any missing value and rows with a neutral rating (3)
    are discarded. Ratings of 4 and 5 are encoded as 1 (rated positively),
    ratings of 1 and 2 as 0 (rated poorly).

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Raw reviews; must contain the columns 'Reviews' and 'Rating'.
        The frame is NOT modified.
    max_rows : int or None, optional
        Keep only the first ``max_rows`` remaining rows (default 5000).
        Pass ``None`` to keep every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Reviews' and 'Positively Rated'; the original index labels
        of the surviving rows are preserved.
    """
    # dropna() without inplace returns a new frame, so the caller's data
    # stays intact and the function can be re-run safely.
    complete = dataSet.dropna()

    # Remove any 'neutral' ratings equal to 3. The explicit copy makes the
    # column assignment below write to an independent frame instead of a
    # slice (avoids SettingWithCopyWarning).
    rated = complete[complete['Rating'] != 3].copy()

    rated['Positively Rated'] = np.where(rated['Rating'] > 3, 1, 0)

    return rated[['Reviews', 'Positively Rated']].iloc[0:max_rows, :]

# --------------------
# split data
def splitDataSet(dataSet=0, test_size=.2):
    """Partition ``dataSet`` into a training part and a test part.

    ``test_size`` is forwarded to ``train_test_split``. The split uses a
    fixed ``random_state=0``, so repeated calls give the same partition.

    Returns a two-element list ``[train, test]``.
    """
    return list(train_test_split(dataSet, test_size=test_size, random_state=0))

def processData(trainSet=0, testSet=0):
    """Turn the 'Reviews' text of both sets into TF-IDF feature matrices.

    The vectorizer is fitted on the training reviews only and then applied
    to the test reviews, so the test set does not influence the fitted
    vocabulary or weights.

    Returns a two-element list ``[train_vectors, test_vectors]``.
    """
    tfidf = TfidfVectorizer(
        stop_words='english',
        min_df=5,
        max_df=0.8,
        sublinear_tf=True,
        use_idf=True,
    )
    fitted_train = tfidf.fit_transform(trainSet['Reviews'])
    projected_test = tfidf.transform(testSet['Reviews'])
    return [fitted_train, projected_test]

In [22]:
def metodoSVM(train_vectors=0, trainSet=0, test_vectors=0, testSet=0):
    """Fit a linear-kernel SVM and print its predictions and test metrics.

    The labels are read from the 'Positively Rated' column of ``trainSet``
    and ``testSet``. Prints the predictions, the metric table of the
    positive class ('1'), the metric table of the negative class ('0') and
    the accuracy rounded to two decimals. Returns nothing.
    """
    model = svm.SVC(kernel='linear')
    model.fit(train_vectors, trainSet['Positively Rated'])
    predicted = model.predict(test_vectors)

    print(pd.DataFrame(predicted, columns=['Prediction']))
    print("\n")

    # results report
    metrics = classification_report(testSet['Positively Rated'], predicted, output_dict=True)
    # Build both tables before printing anything.
    tables = [
        pd.DataFrame.from_dict(metrics[class_key], columns=[title], orient='index')
        for class_key, title in (('1', 'positive'), ('0', 'negative'))
    ]
    for table in tables:
        print(table)
        print("\n")
    print("accuracy: ", round(metrics['accuracy'],2))
    print("\n")

def _tune_and_predict(estimator, param_grid, train_vectors, train_labels, test_vectors):
    """Grid-search ``estimator`` with 5-fold CV and predict the test set with the best model."""
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, verbose=1)
    search.fit(train_vectors, train_labels)
    return search.best_estimator_.predict(test_vectors)


def _report_by_class(true_labels, predicted):
    """Print per-class metric tables and accuracy; return the (positive, negative) metric dicts."""
    report = classification_report(true_labels, predicted, output_dict=True)
    positive = report['1']
    negative = report['0']
    # Build both tables before printing anything.
    dfpos = pd.DataFrame.from_dict(positive, columns=['positive'], orient='index')
    dfneg = pd.DataFrame.from_dict(negative, columns=['negative'], orient='index')

    print(dfpos)
    print("\n")
    print(dfneg)
    print("\n")
    print("accuracy: ", round(report['accuracy'], 2))
    print("\n")
    return positive, negative


def metodosML(train_vectors=0, trainSet=0, test_vectors=0, testSet=0):
    """Tune an SVM and a decision tree, then compare them on the test set.

    Each model is tuned with ``GridSearchCV`` (5 folds) on the training
    vectors, and the best estimator predicts the test vectors. For every
    model the per-class metric tables and the accuracy are printed. At the
    end the function prints the side-by-side predictions and two summary
    tables (precision / recall / f1-score per model), one for the positive
    class and one for the negative class.

    Parameters
    ----------
    train_vectors, test_vectors :
        Feature matrices for the training and test reviews.
    trainSet, testSet : pandas.DataFrame
        Must contain the label column 'Positively Rated'.

    Returns
    -------
    None

    Notes
    -----
    The decision tree is seeded (``random_state=0``) so repeated runs give
    the same tree, matching the seeded train/test split.
    """
    performanceHeaders = ['precision', 'recall', 'f1-score']

    # One entry per model: (short name, estimator, hyper-parameter grid).
    candidates = [
        ('SVM', svm.SVC(), {
            'kernel': ["poly"],
            'degree': [1, 2, 3, 4],
            'coef0': [1, 2],
        }),
        ('DT', DecisionTreeClassifier(random_state=0), {
            'criterion': ["gini", 'entropy'],
            'max_depth': [5, 10, 20, 30, None],
        }),
    ]
    methodsUsed = [name for name, _, _ in candidates]

    modPerformancePos = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
    modPerformanceNeg = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)

    train_labels = trainSet['Positively Rated']
    test_labels = testSet['Positively Rated']
    predictions = {}

    for name, estimator, param_grid in candidates:
        print('Classification with ' + name)
        predicted = _tune_and_predict(estimator, param_grid,
                                      train_vectors, train_labels, test_vectors)
        predictions[name + ' Prediction'] = predicted

        positive, negative = _report_by_class(test_labels, predicted)

        # Look metrics up by name rather than by position in the report.
        for metric in performanceHeaders:
            modPerformancePos.loc[name, metric] = positive[metric]
            modPerformanceNeg.loc[name, metric] = negative[metric]

    print(pd.DataFrame(predictions))
    print("\n")
    print(modPerformancePos)
    print("\n")
    print(modPerformanceNeg)
    print("\n")


In [23]:
# Run the full pipeline; every stage prints its intermediate result below.
main()

                                             Product Name Brand Name   Price  \
0       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
2       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
3       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
4       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
...                                                   ...        ...     ...   
413835  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413836  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413837  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413838  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413839  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   

        Rating                         