# Amazon Commerce Reviews

**Kaggle: 184.702 TU ML WS 20**

**Goal: Predict who wrote the review.**

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import time

### Get the Data

In [None]:
# Load the learning (`.lrn`) split of the competition data into a DataFrame.
train_csv_path = './184702-tu-ml-ws-20-amazon-commerce-reviews/amazon_review_ID.shuf.lrn.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Notebook display of the loaded training DataFrame.
data

### Basic Data Information

In [None]:
# Print the pandas summary of the frame (`DataFrame.info`).
data.info()

In [None]:
#data.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns.
data.describe(include = 'object')

### Check Missing Values

In [None]:
# Count the null entries in every column and print the per-column totals.
missing_per_column = data.isnull().sum(axis=0)
print(missing_per_column)

In [None]:
#sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Remove the target and check if the data is all int64

### Exploratory Data Analysis

**Class**

In [None]:
# Bar chart of how many reviews each class has, with every bar annotated by
# its share of all rows.
#
# The seaborn style is set BEFORE the axes exists: style settings are read
# when an axes is created, so setting it afterwards (the previous order, where
# plt.xticks implicitly created the axes first) leaves the first run unstyled.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(x=data['Class'], ax=ax)

# Rotate the class names so the x tick labels do not overlap.
ax.tick_params(axis='x', rotation=90)

total = len(data['Class'])

for bar in ax.patches:
    height = bar.get_height()
    # Centre the percentage label horizontally, slightly above the bar top.
    ax.text(bar.get_x() + bar.get_width() / 2.,
            height + 3,
            f'{100 * height / total:.1f}%',
            ha="center")

### Remove identifiers

In [None]:
# Set the identifier column aside as a one-column frame, then remove it from
# the working data so it is not part of the remaining columns.
idsTrain = data[['ID']]
data = data.drop(columns='ID')

### Target Split

In [None]:
# Separate the target column (`Class`) from the feature columns.
y = data['Class']
X = data.drop(columns='Class')

### Feature Selection

In [None]:
from sklearn.feature_selection import SelectPercentile, chi2

# Keep the top 5 percent of the features, ranked by their chi-squared score
# against the class label.
# NOTE(review): the selector is fitted on all rows before the hold-out split
# further down, so the hold-out accuracy may be optimistic - confirm intent.
selection = SelectPercentile(score_func=chi2, percentile=5)
X_transformed = selection.fit_transform(X, y)

# Remember the names of the retained columns so the same columns can be
# picked from another frame by name.
columns = X.columns.to_numpy()
support = selection.get_support()
columns_with_support = columns[support]
X_transformed.shape

### Data Pre-processing

**Choose one of the methods:**
1. Standardization
2. MinMaxScaler
3. Normalization

In [None]:
# Scaling method selector: 1 = StandardScaler, 2 = MinMaxScaler, 3 = Normalizer.
preprocessing_method = 1

In [None]:
# Scale the selected training features with the method chosen through
# `preprocessing_method` (1 = standardization, 2 = min-max, 3 = normalization).
# The fitted transformer stays bound to its own name after this cell.
if preprocessing_method == 1:
    standard_scaler = StandardScaler()
    X_scaled = standard_scaler.fit_transform(X_transformed)
elif preprocessing_method == 2:
    min_max_scaler = MinMaxScaler()
    X_scaled = min_max_scaler.fit_transform(X_transformed)
elif preprocessing_method == 3:
    normalizer = Normalizer()
    X_scaled = normalizer.fit_transform(X_transformed)
else:
    # Fail loudly: otherwise X_scaled would stay undefined, or silently keep a
    # stale value from an earlier run of this cell.
    raise ValueError(
        f"preprocessing_method must be 1, 2 or 3, got {preprocessing_method!r}"
    )

In [None]:
# Notebook display of the scaled feature matrix.
X_scaled

### Outliers Detection

**Using the Z-score of each feature value**

In [None]:
# Locate every cell whose absolute z-score (computed per column, scipy's
# default axis) exceeds the threshold.
threshold = 15
z = np.abs(stats.zscore(X_scaled))
outliers_rows = np.where(z > threshold)
# np.where yields two index arrays: row numbers first, then the matching
# column numbers.
print(outliers_rows)

In [None]:
# Number of distinct rows that contain at least one flagged cell.
np.unique(outliers_rows[0]).size

### Data Preparation

In [None]:
# Keep only the rows in which every feature's absolute z-score stays below
# the threshold.
abs_z_scores = np.abs(stats.zscore(X_scaled))
rows_to_keep = (abs_z_scores < threshold).all(axis=1)
X_prepared = X_scaled[rows_to_keep]
X_prepared.shape

In [None]:
# Keep exactly the labels whose rows pass the outlier filter, using the same
# "all |z| below threshold" mask that selects the rows of the feature matrix.
# Dropping by `outliers_rows` (cells with z > threshold) is not the exact
# complement of that mask: a NaN z-score, or one equal to the threshold,
# removes the row from the features but not from the labels, leaving X and y
# with different lengths.
inlier_mask = (np.abs(stats.zscore(X_scaled)) < threshold).all(axis=1)
y = y[inlier_mask]
y.shape

### Train Dataset Split

In [None]:
# Convert the labels to a NumPy array. np.asarray returns an existing array
# unchanged, so re-running this cell does not fail the way a second
# `.to_numpy()` call on an ndarray would.
y = np.asarray(y)

In [None]:
# Hold out 30% of the prepared rows for evaluation. The seed is fixed so the
# split - and therefore the classifier comparison below - is reproducible
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_prepared, y, test_size=0.30, random_state=42
)

### Models Prediction

**Types:**
- Linear Classifiers: Logistic Regression, Naive Bayes Classifier
- Nearest Neighbor
- Support Vector Machines
- Decision Trees
- Random Forest
- Neural Networks

In [None]:
# Candidate classifiers keyed by display name. Entries are listed in the order
# they are evaluated; all use library defaults except where arguments are given.
d = {
    # Linear / probabilistic models
    "Logistic regression": LogisticRegression(),
    "Gaussian Naive Bayes": GaussianNB(),
    # Nearest neighbours
    "KNearest Neighbors (5)": KNeighborsClassifier(),
    # Support vector machines and SGD-trained linear model
    "SVM rbf": SVC(),
    "SVM-linear": SVC(kernel='linear'),
    "SGD Classifier": SGDClassifier(),
    # Tree-based models
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    # Neural network
    "Multi-layer Perceptron Classifier": MLPClassifier(max_iter=1000),
}

In [None]:
# Fit every candidate on the training split, score it on the hold-out split
# and remember the name of the most accurate one.
#
# The best score starts below any possible accuracy and the name is reset on
# every run, so `bestClassifierName` is always (re)bound here: it can neither
# be left undefined when all scores are 0 nor keep a stale value from an
# earlier run of this cell.
bestPrediction = -1.0
bestClassifierName = None
for name, clf in d.items():
    # perf_counter is a monotonic clock intended for measuring durations.
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score = accuracy_score(y_test, predictions)
    end = time.perf_counter()
    if score > bestPrediction:
        bestPrediction = score
        bestClassifierName = name
    print(name, "- Accuracy: %0.3f" % score, "- Time: %0.2f" % (end - start), "seconds")

print("###############")
print("Best Classifier:", bestClassifierName)

### Hyperparameter optimization - Random/Grid Search

...

### Test Data

In [None]:
# Load the test (`.tes`) split of the competition data into a DataFrame.
test_csv_path = './184702-tu-ml-ws-20-amazon-commerce-reviews/amazon_review_ID.shuf.tes.csv'
testData = pd.read_csv(test_csv_path)

In [None]:
# Notebook display of the loaded test DataFrame.
testData

**Remove identifiers**

In [None]:
# Set the test identifiers aside as a one-column frame (they are joined back
# onto the predictions for the submission), then drop them from the features.
idsTest = testData[['ID']]
testData = testData.drop(columns='ID')

**Feature selection based on Training Data**

In [None]:
# Pick, by name, the columns that the feature selection retained on the
# training data.
selected_columns = list(columns_with_support)
testData_transformed = testData[selected_columns]

**Data pre-processing based on Training Data**

In [None]:
# Scale the test features with the transformer that was fitted on the
# TRAINING data. Fitting a fresh scaler on the test set would put the test
# features on a different scale than the one the classifier was trained on.
#
# The training transformer was fitted on a plain array, so the test frame is
# converted to an array as well (avoids a feature-name mismatch warning).
test_features = np.asarray(testData_transformed)
if preprocessing_method == 1:
    testData_scaled = standard_scaler.transform(test_features)
elif preprocessing_method == 2:
    testData_scaled = min_max_scaler.transform(test_features)
elif preprocessing_method == 3:
    # Normalizer is stateless (it rescales each row on its own); transform is
    # used for symmetry with the other branches.
    testData_scaled = normalizer.transform(test_features)
else:
    raise ValueError(
        f"preprocessing_method must be 1, 2 or 3, got {preprocessing_method!r}"
    )

### Final Prediction

In [None]:
# Look up the best-scoring classifier. Plain indexing raises KeyError right
# here if the name is missing, instead of handing a silent None to the fit
# call further down.
clf = d[bestClassifierName]

In [None]:
#clf = RandomForestClassifier(bootstrap = True, criterion = 'gini', max_depth = 20, n_estimators= 1000)

In [None]:
# Refit the chosen classifier on all outlier-filtered training rows
# (`X_prepared`, `y`), not only on the earlier training split.
clf = clf.fit(X_prepared, y)

In [None]:
# Predict a class for every row of the scaled test features.
predictions = clf.predict(testData_scaled)

In [None]:
# Notebook display of the predicted classes.
predictions

### Join IDs to create Submission Dataset

In [None]:
# Wrap the predicted classes in a single-column frame named `Class`.
predictions = pd.DataFrame(data=predictions, columns=['Class'])

In [None]:
# Place the test IDs and the predicted classes side by side (joined on the
# row index).
submission_parts = [idsTest, predictions]
result = pd.concat(submission_parts, axis='columns')

In [None]:
# Notebook display of the submission frame (ID and predicted Class).
result

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
result.to_csv('submission.csv', index=False)