# Import Libraries

In [1]:
import math
import random
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import time

import matplotlib.pyplot as plt

import pickle
from functools import partial, update_wrapper

def wrapped_partial(func, *args, **kwargs):
    """Return ``partial(func, *args, **kwargs)`` carrying ``func``'s metadata.

    A plain ``functools.partial`` object has no ``__name__`` or ``__doc__`` of
    its own; ``update_wrapper`` copies them (and sets ``__wrapped__``) from
    ``func`` so the partial can be used where a named callable is expected.
    """
    # update_wrapper returns the wrapper it was given, so it can be returned directly.
    return update_wrapper(partial(func, *args, **kwargs), func)


from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report, make_scorer, accuracy_score, balanced_accuracy_score, f1_score
from sklearn.pipeline import make_pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF,Matern,RationalQuadratic,ExpSineSquared,DotProduct
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split

import random
from datetime import datetime
# Seed Python's `random` module from the current time.
# random.seed() only accepts None/int/float/str/bytes/bytearray: passing a
# datetime object is deprecated since Python 3.9 and raises TypeError on 3.11+,
# so the POSIX timestamp (a float) is used instead.
# NOTE(review): a time-based seed makes runs non-reproducible — confirm that is intended.
random.seed(datetime.now().timestamp())

# Result storage

In [2]:
# Per-model metric summaries, keyed by model name and filled in by the evaluation cells.
results_overview = dict()
# Container for per-model predictions (initialised empty here).
predictions_per_model = dict()

# Data Loading

In [3]:
# Load the labelled training set and the unlabelled set to predict on.
df_data_train = pd.read_csv('amazon_review_ID.shuf.lrn.csv', low_memory=False, sep=',')
df_data_res = pd.read_csv('amazon_review_ID.shuf.tes.csv', low_memory=False, sep=',')

# Integer-encode the class labels; class_index holds the distinct label values.
class_factor = df_data_train['Class'].factorize()
class_index = class_factor[1]

# Treat '?' as a missing value and drop every training row that contains one.
df_data_train = df_data_train.replace('?', np.nan)
df_data_train = df_data_train.dropna(axis=0, how="any")

# Target column.
output_train = df_data_train['Class']

# Feature matrix: everything except the target and the row identifier.
X = df_data_train.drop(labels=['Class', 'ID'], axis=1)

# Standardised copy of the features (statistics computed over ALL training rows).
X_scale = preprocessing.scale(X)

# Fixed 67 % / 33 % split used for the hold-out scores.
X_train, X_test, y_train, y_test = train_test_split(X, output_train, random_state=0, test_size=0.33)

# Prediction set: mark '?' as missing and remove only the identifier column.
# Rows are deliberately NOT dropped here. The previous dropna() result was
# overwritten on the very next line and never took effect, and the submission
# cells further down write one line per test row.
df_data_res = df_data_res.replace('?', np.nan)
X_res = df_data_res.drop(labels=['ID'], axis=1)


In [5]:
# Preview the first rows of the feature matrix (the 'Class' and 'ID' columns were dropped above).
X.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V9991,V9992,V9993,V9994,V9995,V9996,V9997,V9998,V9999,V10000
0,15,2,13,6,11,6,8,2,8,7,...,0,0,2,0,0,1,0,0,0,0
1,11,9,6,11,4,3,6,5,3,1,...,1,0,0,0,0,0,0,0,0,0
2,18,10,4,4,8,5,5,6,2,3,...,0,0,0,0,1,0,0,0,0,0
3,17,6,11,6,11,3,7,4,6,1,...,0,0,0,0,0,1,0,0,0,0
4,14,9,10,13,8,1,0,12,3,1,...,0,0,0,1,0,0,0,0,0,2


## Class Distribution

In [None]:
# Number of training samples per class, computed once and reused below.
class_counts = output_train.value_counts()

# Bar chart of the class distribution, saved next to the notebook.
fig, ax = plt.subplots(figsize=(16, 8))
class_counts.plot(kind='bar', ax=ax)
ax.set_title("Training samples per class")
ax.set_xlabel("Class")
ax.set_ylabel("Number of samples")
fig.savefig("output_graphic.jpg")
plt.show()

# Show the counts themselves and the number of distinct classes.
display(class_counts)
display(len(class_counts))


## Model Selection

In [None]:
# Candidate classifiers compared with (mostly) default hyper-parameters.
models = [
    SGDClassifier(),
    KNeighborsClassifier(),
    GaussianProcessClassifier(),
    MLPClassifier(),
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# Number of cross-validation folds per model.
CV = 5

# One (model, fold, accuracy) record per fold; the frame is built once at the end.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, X, output_train, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the fold accuracies with the individual folds overlaid.
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxplot(x='model_name', y='accuracy', data=cv_df, ax=ax)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2, ax=ax)
# The class names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
fig.savefig("Amazon_models.jpg")
plt.show()

# Mean accuracy per model (displayed as the cell output).
cv_df.groupby('model_name').accuracy.mean()


# Linear Model

In [None]:
# Grid search over the penalty / loss combinations of LinearSVC.
clf = LinearSVC()

# Only combinations that liblinear supports are listed; a plain cross product of
# penalty x loss contains invalid settings whose fits fail and score as NaN:
#   * penalty='l1' with loss='hinge' is not supported at all,
#   * penalty='l1' with loss='squared_hinge' requires dual=False,
#   * penalty='l2' with loss='hinge' requires dual=True.
parameter_grid = [
    {'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'dual': [True]},
    {'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False]},
]
cv_grid = GridSearchCV(clf, parameter_grid, cv=3, verbose=10, n_jobs=-1, scoring=['accuracy', 'balanced_accuracy', 'f1_weighted'], refit='accuracy')
cv_grid.fit(X, output_train)

print("Best Parameter Choice:")
print(cv_grid.best_params_)


In [9]:
# Final LinearSVC configuration, fitted on the full training set.
# `clf` is reused later to predict the unlabelled set.
clf = make_pipeline(LinearSVC(loss = 'squared_hinge', penalty = 'l2' ))

clf.fit(X, output_train)

# CV
# cross_validate clones the estimator per fold, so the fitted `clf` above is untouched.
cv_result = cross_validate(clf, X, output_train, cv=3, scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted'], n_jobs=-1)

# Same model on the standardised features, for comparison.
# X_scale was standardised over the whole training set before splitting into folds.
clf_scaled = make_pipeline(LinearSVC(loss = 'squared_hinge', penalty = 'l2' ))
clf_scaled.fit(X_scale,output_train)
scaled_acc = cross_validate(clf_scaled, X_scale, output_train, cv=3, scoring = ['accuracy'], n_jobs=-1)['test_accuracy'].mean()

# Mean over the three folds of every recorded metric.
results_overview['LinearSVC'] = {
    'scaled accuracy' : scaled_acc,
    'accuracy': cv_result['test_accuracy'].mean(),
    'balanced_accuracy': cv_result['test_balanced_accuracy'].mean(),
    'f1_weighted': cv_result['test_f1_weighted'].mean(),
    'fit_time' : cv_result['fit_time'].mean()
    }



Pipeline(steps=[('linearsvc', LinearSVC())])

In [None]:
# Hold-out check: train a fresh LinearSVC on the training split only and
# score it on the held-back split.
holdout_svc = LinearSVC(penalty='l2', loss='squared_hinge')
clf2 = make_pipeline(holdout_svc)
clf2.fit(X_train, y_train)

holdout_accuracy = accuracy_score(y_test, clf2.predict(X_test))
results_overview['LinearSVC']['Holdout'] = holdout_accuracy

display(results_overview)

# Predict the unlabelled set with the hold-out model.
y_houldout_prediction = clf2.predict(X_res)


In [10]:
# Predictions of the model that was trained on the full training set.
y_prediction = clf.predict(X_res)

# One submission file per model: full-data model and hold-out model.
submissions = {
    "linearSVC_amazon_result.csv": y_prediction,
    "linearSVC_amazon_result2.csv": y_houldout_prediction,
}
for path, predictions in submissions.items():
    # `with` closes the file even if a write fails.
    with open(path, "w") as f:
        f.write('ID,"Class"\n')
        # Pair each prediction with the ID of its test row instead of assuming
        # exactly 750 rows numbered 750, 751, ...
        for row_id, label in zip(df_data_res['ID'], predictions):
            f.write(f"{row_id},{label}\n")

In [None]:
# Count how often each label was predicted, keeping the labels in order of
# first appearance. np.unique copes with any number of distinct labels and
# leaves y_prediction intact, so the cell can be re-run.
predicted_labels, first_seen, label_counts = np.unique(
    y_prediction, return_index=True, return_counts=True
)
appearance_order = np.argsort(first_seen)
authors = list(predicted_labels[appearance_order])
occurs = label_counts[appearance_order].astype(float)

print(authors)
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(authors, occurs)
# Many labels share the axis; rotate them so they stay readable.
ax.tick_params(axis='x', rotation=90)
fig.savefig("linearSVC_amazon_graphic.jpg")
plt.show()

## Logistic Model

In [None]:
# Grid search over penalty, solver and iteration budget for LogisticRegression.
clf = LogisticRegression()

# The liblinear solver does not support an unpenalised model, so it is only
# combined with 'l2'; in a plain cross product those fits fail and score as NaN.
# NOTE(review): newer scikit-learn releases spell "no penalty" as None instead
# of 'none' — adjust if the installed version rejects the string.
iteration_budgets = [50, 100, 200, 250]
parameter_grid = [
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': iteration_budgets,
    },
    {
        'penalty': ['none'],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'max_iter': iteration_budgets,
    },
]
cv_grid = GridSearchCV(clf, parameter_grid, cv=3, verbose=10, n_jobs=-1, scoring=['accuracy', 'balanced_accuracy', 'f1_weighted'], refit='accuracy')
cv_grid.fit(X, output_train)

print("Best Parameter Choice:")
print(cv_grid.best_params_)

In [None]:
# Chosen logistic-regression configuration, shared by the raw and scaled runs.
logreg_params = dict(max_iter=50, penalty='none', solver='sag')
cv_metrics = ['accuracy', 'balanced_accuracy', 'f1_weighted']

# Model fitted on the full training set; reused later for the final predictions.
clf = make_pipeline(LogisticRegression(**logreg_params))
clf.fit(X, output_train)

# CV
cv_result = cross_validate(clf, X, output_train, cv=3, scoring=cv_metrics, n_jobs=-1)

# Same configuration on the standardised features.
clf_scaled = make_pipeline(LogisticRegression(**logreg_params))
clf_scaled.fit(X_scale, output_train)
scaled_cv_result = cross_validate(clf_scaled, X_scale, output_train, cv=3, scoring=['accuracy'], n_jobs=-1)
scaled_acc = scaled_cv_result['test_accuracy'].mean()

# Assemble the fold means in the same key order as the other model summaries.
logreg_summary = {'scaled accuracy': scaled_acc}
for metric_name in cv_metrics:
    logreg_summary[metric_name] = cv_result['test_' + metric_name].mean()
logreg_summary['fit_time'] = cv_result['fit_time'].mean()

results_overview['LogRegression'] = logreg_summary

In [None]:
# Hold-out check: train a fresh logistic regression on the training split only
# and score it on the held-back split.
holdout_logreg = LogisticRegression(solver='sag', penalty='none', max_iter=50)
clf2 = make_pipeline(holdout_logreg)
clf2.fit(X_train, y_train)

holdout_accuracy = accuracy_score(y_test, clf2.predict(X_test))
results_overview['LogRegression']['Holdout'] = holdout_accuracy

display(results_overview)

# Predict the unlabelled set with the full-data model and the hold-out model.
y_prediction = clf.predict(X_res)
y_houldout_prediction = clf2.predict(X_res)

In [None]:
# One submission file per model: full-data model and hold-out model.
submissions = {
    "logReg_amazon_result.csv": y_prediction,
    "logReg_amazon_result2.csv": y_houldout_prediction,
}
for path, predictions in submissions.items():
    # `with` closes the file even if a write fails.
    with open(path, "w") as f:
        f.write('ID,"Class"\n')
        # Pair each prediction with the ID of its test row instead of assuming
        # exactly 750 rows numbered 750, 751, ...
        for row_id, label in zip(df_data_res['ID'], predictions):
            f.write(f"{row_id},{label}\n")

In [None]:
# Count how often each label was predicted, keeping the labels in order of
# first appearance. np.unique copes with any number of distinct labels and
# leaves y_prediction intact, so the cell can be re-run.
predicted_labels, first_seen, label_counts = np.unique(
    y_prediction, return_index=True, return_counts=True
)
appearance_order = np.argsort(first_seen)
authors = list(predicted_labels[appearance_order])
occurs = label_counts[appearance_order].astype(float)

print(authors)
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(authors, occurs)
# Many labels share the axis; rotate them so they stay readable.
ax.tick_params(axis='x', rotation=90)
fig.savefig("logReg_amazon_graphic.jpg")
plt.show()

# Random Forest

In [None]:
# Grid search over split criterion and per-split feature budget.
clf = RandomForestClassifier()

# 'auto' is left out on purpose: for classifiers it is an alias of 'sqrt', so
# it only duplicated a grid point, and newer scikit-learn releases reject it.
parameter_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_features' : ['sqrt', 'log2']
}
cv_grid = GridSearchCV(clf, parameter_grid, cv=3, verbose=10, n_jobs=-1, scoring=['accuracy', 'balanced_accuracy', 'f1_weighted'], refit='accuracy')
cv_grid.fit(X, output_train)

print("Best Parameter Choice:")
print(cv_grid.best_params_)


In [None]:
# Final random-forest configuration, fitted on the full training set.
# max_features='sqrt' is what 'auto' meant for classifiers; the explicit
# spelling also works on scikit-learn releases that no longer accept 'auto'.
clf = make_pipeline(RandomForestClassifier(criterion= 'gini',max_features = 'sqrt',n_estimators=3000))

clf.fit(X, output_train)

# CV
# cross_validate clones the estimator per fold, so the fitted `clf` above is untouched.
cv_result = cross_validate(clf, X, output_train, cv=3, scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted'], n_jobs=-1)

# Same configuration on the standardised features, for comparison.
clf_scaled = make_pipeline(RandomForestClassifier(criterion= 'gini',max_features = 'sqrt',n_estimators=3000))
clf_scaled.fit(X_scale,output_train)
scaled_acc = cross_validate(clf_scaled, X_scale, output_train, cv=3, scoring = ['accuracy'], n_jobs=-1)['test_accuracy'].mean()

# Mean over the three folds of every recorded metric.
results_overview['RandomForestClassifier'] = {
    'scaled accuracy' : scaled_acc,
    'accuracy': cv_result['test_accuracy'].mean(),
    'balanced_accuracy': cv_result['test_balanced_accuracy'].mean(),
    'f1_weighted': cv_result['test_f1_weighted'].mean(),
    'fit_time' : cv_result['fit_time'].mean()
    }

In [None]:
# Hold-out check with the SAME configuration that was cross-validated above.
# clone() returns an unfitted copy of `clf` with identical hyper-parameters, so
#   * the hold-out score belongs to the model whose CV metrics are reported
#     (not to whatever `cv_grid` currently holds), and
#   * fitting on the training split does not refit cv_grid.best_estimator_ in place.
clf2 = sklearn.base.clone(clf)

clf2.fit(X_train, y_train)

results_overview['RandomForestClassifier']['Holdout'] = accuracy_score(y_test,clf2.predict(X_test))

display(results_overview)

# Predict the unlabelled set with the full-data model and the hold-out model.
y_prediction = clf.predict(X_res)
y_houldout_prediction = clf2.predict(X_res)

In [None]:
# One submission file per model: full-data model and hold-out model.
submissions = {
    "RandomTree_amazon_result.csv": y_prediction,
    "RandomTree_amazon_result2.csv": y_houldout_prediction,
}
for path, predictions in submissions.items():
    # `with` closes the file even if a write fails.
    with open(path, "w") as f:
        f.write('ID,"Class"\n')
        # Pair each prediction with the ID of its test row instead of assuming
        # exactly 750 rows numbered 750, 751, ...
        for row_id, label in zip(df_data_res['ID'], predictions):
            f.write(f"{row_id},{label}\n")

In [None]:
# Count how often each label was predicted, keeping the labels in order of
# first appearance. np.unique copes with any number of distinct labels and
# leaves y_prediction intact, so the cell can be re-run.
predicted_labels, first_seen, label_counts = np.unique(
    y_prediction, return_index=True, return_counts=True
)
appearance_order = np.argsort(first_seen)
authors = list(predicted_labels[appearance_order])
occurs = label_counts[appearance_order].astype(float)

print(authors)
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(authors, occurs)
# Many labels share the axis; rotate them so they stay readable.
ax.tick_params(axis='x', rotation=90)
fig.savefig("RandomTree_amazon_graphic.jpg")
plt.show()

## K-Nearest Neighbors

In [None]:
# Grid search over neighbourhood size and vote weighting for k-nearest neighbours.
clf = KNeighborsClassifier()

parameter_grid = dict(
    n_neighbors=[1, 2, 3, 5, 10, 15],
    weights=['uniform', 'distance'],
)
# Three metrics are recorded; the best-accuracy setting is refitted on all data.
cv_grid = GridSearchCV(
    clf,
    parameter_grid,
    scoring=['accuracy', 'balanced_accuracy', 'f1_weighted'],
    refit='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=10,
)
cv_grid.fit(X, output_train)

print("Best Parameter Choice:")
print(cv_grid.best_params_)



In [None]:
# make_pipeline does not copy its steps: wrapping cv_grid.best_estimator_
# directly makes every pipeline share ONE estimator object, so fitting
# `clf_scaled` would silently refit the model inside `clf`. Each pipeline
# therefore gets its own unfitted clone of the winning configuration.
clf = make_pipeline(sklearn.base.clone(cv_grid.best_estimator_))

# Model fitted on the full training set; reused later for the final predictions.
clf.fit(X, output_train)

# CV
cv_result = cross_validate(clf, X, output_train, cv=3, scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted'], n_jobs=-1)

# Same configuration on the standardised features, as an independent estimator.
clf_scaled = make_pipeline(sklearn.base.clone(cv_grid.best_estimator_))
clf_scaled.fit(X_scale,output_train)
scaled_acc = cross_validate(clf_scaled, X_scale, output_train, cv=3, scoring = ['accuracy'], n_jobs=-1)['test_accuracy'].mean()

# Mean over the three folds of every recorded metric.
results_overview['NeighborsClassifier'] = {
    'scaled accuracy' : scaled_acc,
    'accuracy': cv_result['test_accuracy'].mean(),
    'balanced_accuracy': cv_result['test_balanced_accuracy'].mean(),
    'f1_weighted': cv_result['test_f1_weighted'].mean(),
    'fit_time' : cv_result['fit_time'].mean()
    }

In [None]:
# Hold-out check on an independent copy of the winning configuration.
# Without clone() this pipeline would wrap the very estimator object held by
# cv_grid.best_estimator_; fitting it on the training split would then refit
# every other pipeline built around that object, and the "full data"
# predictions below would come from the hold-out model.
clf2 = make_pipeline(sklearn.base.clone(cv_grid.best_estimator_))

clf2.fit(X_train, y_train)

results_overview['NeighborsClassifier']['Holdout'] = accuracy_score(y_test,clf2.predict(X_test))

display(results_overview)

# Predict the unlabelled set with the full-data model and the hold-out model.
y_prediction = clf.predict(X_res)
y_houldout_prediction = clf2.predict(X_res)

In [None]:
# One submission file per model: full-data model and hold-out model.
submissions = {
    "NeighborsClassifier_amazon_result.csv": y_prediction,
    "NeighborsClassifier_amazon_result2.csv": y_houldout_prediction,
}
for path, predictions in submissions.items():
    # `with` closes the file even if a write fails.
    with open(path, "w") as f:
        f.write('ID,"Class"\n')
        # Pair each prediction with the ID of its test row instead of assuming
        # exactly 750 rows numbered 750, 751, ...
        for row_id, label in zip(df_data_res['ID'], predictions):
            f.write(f"{row_id},{label}\n")

In [None]:
# Count how often each label was predicted, keeping the labels in order of
# first appearance. np.unique copes with any number of distinct labels and
# leaves y_prediction intact, so the cell can be re-run.
predicted_labels, first_seen, label_counts = np.unique(
    y_prediction, return_index=True, return_counts=True
)
appearance_order = np.argsort(first_seen)
authors = list(predicted_labels[appearance_order])
occurs = label_counts[appearance_order].astype(float)

print(authors)
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(authors, occurs)
# Many labels share the axis; rotate them so they stay readable.
ax.tick_params(axis='x', rotation=90)
fig.savefig("Neigbors_amazon_graphic.jpg")
plt.show()

In [None]:
# One row per model, one column per metric, written next to the notebook.
overview_table = pd.DataFrame.from_dict(results_overview).transpose()
overview_table.to_csv("Overview_amazon.csv")