# Amazon Commerce Reviews

**Kaggle: 184.702 TU ML WS 20**

**Goal: Predict who wrote the review.**

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.feature_selection import SelectPercentile, chi2, SelectFromModel
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import time

### Get the Data

In [None]:
# Load the labelled learning set; the path is relative to the working directory.
data = pd.read_csv('./184702-tu-ml-ws-20-amazon-commerce-reviews/amazon_review_ID.shuf.lrn.csv')

In [None]:
# Show the loaded frame as the cell output for a quick visual check.
data

### Basic Data Information

In [None]:
# Print the index range, column dtypes, non-null counts and memory usage.
data.info()

In [None]:
#data.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns.
data.describe(include = 'object')

### Check Missing Values

In [None]:
# Count the missing entries per column and print the per-column totals.
missing_per_column = data.isnull().sum(axis=0)
print(missing_per_column)

In [None]:
# Draw the boolean null mask as a heatmap; row labels and the colour bar are switched off.
sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Remove the target and check if the data is all int64

### Exploratory Data Analysis

**Class**

In [None]:
# Bar chart of the number of rows per target class, with each bar annotated
# by its share of all rows. The order of the plotting calls is kept as-is
# because matplotlib/seaborn state depends on it.
plt.figure(figsize=(20, 8))
plt.xticks(rotation=90)
sns.set_style('darkgrid')
ax = sns.countplot(x = data['Class'])

total = len(data['Class'])

for bar in ax.patches:
    bar_height = bar.get_height()
    # Horizontal centre of the bar; the label sits 3 count-units above its top.
    x_center = bar.get_x() + bar.get_width() / 2.
    share_label = '{:.1f}%'.format(100 * bar_height / total)
    ax.text(x_center, bar_height + 3, share_label, ha="center")

### Remove identifiers

In [None]:
# Keep the identifiers in their own single-column frame, then remove the
# column from the working data so it is not used as a feature.
idsTrain = data[['ID']]
data = data.drop(columns=['ID'])

### Target Split

In [None]:
# Separate the target column from the feature columns.
y = data['Class']
X = data.drop(columns=['Class'])

### Feature Selection

**Choose one of the methods:**
1. SelectPercentile (chi2)
2. SelectFromModel (LinearSVC)
3. SelectFromModel (LogisticRegression)
4. SelectFromModel (ExtraTreesClassifier)

In [None]:
# Selector read by the feature-selection cell:
#   1 = SelectPercentile (chi2), 2 = SelectFromModel (LinearSVC),
#   3 = SelectFromModel (LogisticRegression), 4 = SelectFromModel (ExtraTreesClassifier)
featureselection_method = 4

In [None]:
# Build the feature selector chosen via `featureselection_method`.
#
# The model-based options hand an *unfitted* estimator to SelectFromModel:
# with prefit=False the selector fits its own clone of the estimator inside
# fit_transform, so fitting the estimator beforehand would only train a
# model that is then thrown away.
if featureselection_method == 1:
    selection = SelectPercentile(chi2, percentile=5)
elif featureselection_method == 2:
    clf = LinearSVC()
    # Alternative configuration: LinearSVC(C=0.1, penalty="l1", dual=False)
    selection = SelectFromModel(clf, prefit=False)
elif featureselection_method == 3:
    clf = LogisticRegression()
    # Alternative configuration:
    # LogisticRegression(C=0.2, penalty="l2", dual=False, max_iter=200)
    selection = SelectFromModel(clf, prefit=False)
elif featureselection_method == 4:
    clf = ExtraTreesClassifier(n_estimators=50)
    selection = SelectFromModel(clf, prefit=False)
else:
    # Fail here rather than with a NameError (or a stale selector from an
    # earlier run of this cell) further down.
    raise ValueError(
        "Unknown featureselection_method {!r}; expected 1, 2, 3 or 4".format(
            featureselection_method))

# Fit the selector and keep only the selected feature columns.
# For the model-based options the fitted estimator is available afterwards
# as `selection.estimator_` (e.g. `selection.estimator_.feature_importances_`).
X_transformed = selection.fit_transform(X, y)

# Names of the retained columns, in their original order; the test data is
# later reduced to the same columns.
columns = np.asarray(X.columns.values)
support = np.asarray(selection.get_support())
columns_with_support = columns[support]
X_transformed.shape

### Scaling

**Choose one of the methods:**
1. Standardization (StandardScaler)
2. Standardization (RobustScaler)
3. MinMaxScaler
4. Normalization (Normalizer)

In [None]:
# Selector read by the scaling cell:
#   1 = StandardScaler, 2 = RobustScaler, 3 = MinMaxScaler, 4 = Normalizer
scaling_method = 1

In [None]:
# Scale the selected features with the scaler chosen via `scaling_method`.
# The selector value is mapped to a scaler class so the fit/transform code
# exists only once.
scaler_classes = {
    1: StandardScaler,
    2: RobustScaler,
    3: preprocessing.MinMaxScaler,
    4: preprocessing.Normalizer,
}
if scaling_method not in scaler_classes:
    # Fail here rather than leaving `scaler` / `X_scaled` undefined (or stale
    # from an earlier run of this cell).
    raise ValueError(
        "Unknown scaling_method {!r}; expected one of {}".format(
            scaling_method, sorted(scaler_classes)))

# The fitted scaler is kept in `scaler` because the test data is transformed
# with it later on.
scaler = scaler_classes[scaling_method]().fit(X_transformed)
X_scaled = scaler.transform(X_transformed)

In [None]:
# Show the scaled feature matrix as the cell output.
X_scaled

### Outliers Detection

**Using mathematical function Z-Score**

In [None]:
# Absolute z-score of every cell of the scaled matrix.
z = np.abs(stats.zscore(X_scaled))
threshold = 25

# Positions of all cells whose |z| exceeds the threshold, as a pair of index
# arrays: the first holds the row numbers, the second the matching column
# numbers.
outliers_rows = np.nonzero(z > threshold)
print(outliers_rows)

In [None]:
# Number of distinct rows that contain at least one flagged cell.
np.unique(outliers_rows[0]).size

### Data Preparation

**Remove the identified outliers**

In [None]:
# Remove exactly the rows flagged in `outliers_rows` (cells with |z| > threshold).
#
# The labels are dropped using the same `outliers_rows`, so X and y must be
# filtered by the same rule. Re-testing with `|z| < threshold` is not the
# complement of the detection rule: it would also discard rows where |z|
# equals the threshold or is NaN (e.g. a zero-variance column), leaving
# X_prepared shorter than y.
outlier_row_mask = np.zeros(X_scaled.shape[0], dtype=bool)
outlier_row_mask[outliers_rows[0]] = True
X_prepared = X_scaled[~outlier_row_mask]
X_prepared.shape

In [None]:
# Drop the labels belonging to the flagged rows. `drop` works on index
# labels, so this relies on y still carrying the row numbers as its index.
flagged_rows = outliers_rows[0]
y = y.drop(labels=flagged_rows)
y.shape

### Train Dataset Split

In [None]:
# Convert the label Series to a plain NumPy array before splitting.
y = y.to_numpy()

In [None]:
# Hold out 30% of the prepared rows for evaluation.
# A fixed random_state makes the split, and therefore the model comparison
# and the "best classifier" choice that follow, reproducible between runs.
# NOTE(review): with many classes and few rows per class, `stratify=y` may
# also be worth adding; confirm the class counts allow it.
X_train, X_test, y_train, y_test = train_test_split(
    X_prepared, y, test_size=0.30, random_state=0)

### Models Prediction

**Types:**
- Linear Classifiers: Logistic Regression, Naive Bayes Classifier
- Nearest Neighbor
- Support Vector Machines
- Decision Trees
- Random Forest
- Neural Networks

In [None]:
# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are trained and reported.
d = {
    # Linear / probabilistic models
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Gaussian Naive Bayes": GaussianNB(),
    # Nearest neighbour
    "KNearest Neighbors (5)": KNeighborsClassifier(),
    # Support vector machine and SGD-trained linear model
    "SVM rbf": SVC(),
    "SGD Classifier": SGDClassifier(),
    # Tree-based models
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    # Neural network
    "Multi-layer Perceptron Classifier": MLPClassifier(max_iter=1000),
}

In [None]:
# Train every candidate on the training split, score it on the held-out
# split and remember the best-scoring one.
bestPrediction = 0
# Initialised explicitly so the summary below never hits an undefined name
# (or a stale value from an earlier run) when no classifier scores above 0.
bestClassifierName = None
for name, clf in d.items():
    start = time.time()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score = accuracy_score(y_test, predictions)
    # Strict '>' keeps the first classifier on ties; the `is None` test makes
    # the first classifier the provisional best even when its accuracy is 0.
    if bestClassifierName is None or score > bestPrediction:
        bestPrediction = score
        bestClassifierName = name
    end = time.time()
    # The reported time covers fitting, prediction and scoring.
    print(name, "- Accuracy: %0.3f" % score, "- Time: %0.2f" % (end - start), "seconds")

print("###############")
print("Best Classifier:", bestClassifierName)

In [None]:
# NOTE(review): `stop1` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError. Presumably a deliberate stop marker that
# halts "Run All" at this point; confirm before removing.
stop1

### Best Classifiers
- Logistic Regression
- Random Forest
- SGD Classifier
- Multi-layer Perceptron Classifier

### Hyperparameter optimization (incomplete)

In [None]:
# Search spaces per classifier name: (grid for GridSearchCV,
# distributions for RandomizedSearchCV). Empty dicts mean "no tuning yet":
# the search then evaluates only the estimator's current settings.
search_spaces = {
    'Logistic Regression': (
        {
            'penalty': ['l1', 'l2'],
            # The default lbfgs solver rejects the 'l1' penalty, which would
            # make every 'l1' candidate fail to fit; saga supports both.
            'solver': ['saga'],
            'C': [0.1, 1, 10],
            'max_iter': [100, 1000],
        },
        {},
    ),
    'Random Forest': (
        {
            'n_estimators': [100, 1000],
            'max_depth': [1, 10, 20, None],
        },
        {},
    ),
    'SGD Classifier': ({}, {}),
    'Multi-layer Perceptron Classifier': ({}, {}),
}

# Fall back to empty search spaces for classifiers without an entry, so the
# search cells never see an undefined (or stale) grid.
param_grid, param_randomized = search_spaces.get(bestClassifierName, ({}, {}))

**GridSearchCV**

In [None]:
# Exhaustive search over `param_grid` for the best classifier, followed by
# an accuracy check of the refitted best model on the held-out split.
start = time.time()
base_estimator = d.get(bestClassifierName)
clf_gridsearch = GridSearchCV(base_estimator, param_grid, verbose=0).fit(X_train, y_train)
print(clf_gridsearch.best_params_)

predictions = clf_gridsearch.predict(X_test)
score_gridsearch = accuracy_score(y_test, predictions)
end = time.time()

# The elapsed time covers the search plus prediction and scoring.
elapsed = end - start
print(bestClassifierName, "GridSearchCV - Accuracy: %0.3f" % score_gridsearch, "- Time: %0.2f" % elapsed, "seconds")

**RandomizedSearchCV**

In [None]:
# Randomised search over `param_randomized` for the best classifier, followed
# by an accuracy check of the refitted best model on the held-out split.
start = time.time()
base_estimator = d.get(bestClassifierName)
clf_randomizedsearch = RandomizedSearchCV(base_estimator, param_randomized, random_state=0).fit(X_train, y_train)
print(clf_randomizedsearch.best_params_)

predictions = clf_randomizedsearch.predict(X_test)
score_randomizedsearch = accuracy_score(y_test, predictions)
end = time.time()

# The elapsed time covers the search plus prediction and scoring.
elapsed = end - start
print(bestClassifierName, "RandomizedSearchCV - Accuracy: %0.3f" % score_randomizedsearch, "- Time: %0.2f" % elapsed, "seconds")

In [None]:
# Keep the tuned estimator from whichever search scored higher on the
# held-out split; on a tie the randomised-search result is used.
winning_search = clf_gridsearch if score_gridsearch > score_randomizedsearch else clf_randomizedsearch
clf = winning_search.best_estimator_

In [None]:
# NOTE(review): `stop2` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError. Presumably a deliberate stop marker that
# halts "Run All" at this point; confirm before removing.
stop2

**Best Classifier after Hyperparameter Tuning**

In [None]:
# Show the estimator currently bound to `clf` as the cell output.
clf

### Test Data

In [None]:
# Load the test set to predict on; the path is relative to the working directory.
testData = pd.read_csv('./184702-tu-ml-ws-20-amazon-commerce-reviews/amazon_review_ID.shuf.tes.csv')

In [None]:
# Show the loaded test frame as the cell output.
testData

**Remove identifiers**

In [None]:
# Keep the test identifiers in their own single-column frame (they are
# joined back onto the predictions later), then remove the column from the
# feature data.
idsTest = testData[['ID']]
testData = testData.drop(columns=['ID'])

**Feature selection based on Training Data**

In [None]:
# Restrict the test data to the columns retained by the feature selector on
# the training data, in the same order.
testData_transformed = testData[columns_with_support]

**Data pre-processing based on Training Data**

In [None]:
# Apply the scaler fitted on the training features to the test features.
# The scaler was fitted on the array returned by the feature selector (a
# plain NumPy array by default), so the test frame is converted to an array
# as well; passing a DataFrame to a scaler fitted without feature names makes
# scikit-learn emit a feature-name mismatch warning.
testData_scaled = scaler.transform(np.asarray(testData_transformed))

**Fit Classifier on all Training Data**

In [None]:
# Refit the estimator bound to `clf` on all prepared training rows (not just
# the training split) before predicting the test set.
# NOTE(review): `clf` is whatever was bound last; the model-comparison loop
# also uses `clf` as its loop variable, so confirm the hyper-parameter
# selection cell has run before this one.
clf = clf.fit(X_prepared, y)

### Final Prediction

In [None]:
# Predict the class of every row in the scaled test data.
predictions = clf.predict(testData_scaled)

In [None]:
# Show the predicted labels as the cell output.
predictions

### Join IDs to create Submission Dataset

In [None]:
# Wrap the predicted labels in a single-column frame named 'Class'.
predictions = pd.DataFrame(predictions, columns=['Class']) 

In [None]:
# Place the test IDs and the predicted classes side by side; the column-wise
# concat aligns the two frames on their index.
result = pd.concat([idsTest,predictions], axis=1)

In [None]:
# Show the submission frame as the cell output.
result

In [None]:
# Write the submission file to the working directory, without the index column.
result.to_csv('submission.csv', index=False)