# ML_25M Classic ML

In [1]:
import torch
import time
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

seed = 42
np.random.seed(seed)
random.seed(seed)
df = pd.read_csv('dataset.csv')


## Data Preprocessing

In [2]:
X = df.drop('rating', axis=1)
Y = df['rating']

# encode Y
Y = LabelEncoder().fit_transform(Y)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)



# Normalization


# PCA
# pca = PCA(n_components=2)
# pca.fit(X_train)
# X_train = pca.transform(X_train)
# X_test = pca.transform(X_test)

# LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)

print(f'Number of training samples: {X_train.shape}')
print(f'Number of testing samples: {X_test.shape}')
results = pd.DataFrame(columns=['Model', 'Accuracy'])


Number of training samples: (11052, 7)
Number of testing samples: (2764, 7)


In [None]:
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=42)

print(f'Number of training samples: {X_train.shape[0]}')
print(f'Number of validation samples: {X_val.shape[0]}')

# train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(Y_train, dtype=torch.long)), batch_size=batch, shuffle=True)
val_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(
    X_val, dtype=torch.float32), torch.tensor(Y_val, dtype=torch.long)), batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(torch.tensor(
    X_test, dtype=torch.float32), torch.tensor(Y_test, dtype=torch.long)), batch_size=64, shuffle=True)


# Models

## Random Forest Classifier

In [3]:
# Create the model with 100 trees
random_forest = RandomForestClassifier(n_estimators=50,
                                       bootstrap=True,
                                       max_features='sqrt')

# Fit on training data
random_forest.fit(X_train, Y_train)

# accuracy
accuracy = random_forest.score(X_test, Y_test)
print('Accuracy:', accuracy)

results = pd.concat([results, pd.DataFrame(
    {'Model': 'Random Forest', 'Accuracy': accuracy}, index=[0])], ignore_index=True)


Accuracy: 0.863603473227207


## Logistic Classifier

In [4]:
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, Y_train)

# accuracy
accuracy = logistic_classifier.score(X_test, Y_test)
print('Accuracy:', accuracy)

results = pd.concat([results, pd.DataFrame(
    {'Model': 'Logistic Regression', 'Accuracy': accuracy}, index=[0])], ignore_index=True)


Accuracy: 0.8607091172214182


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Naive Bayes

In [5]:
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, Y_train)

# accuracy
accuracy = naive_bayes.score(X_test, Y_test)
print('Accuracy:', accuracy)

results = pd.concat([results, pd.DataFrame(
    {'Model': 'Naive Bayes', 'Accuracy': accuracy}, index=[0])], ignore_index=True)


Accuracy: 0.8418958031837916


## SVM

In [6]:
svm = SVC()
svm.fit(X_train, Y_train)

# accuracy
accuracy = svm.score(X_test, Y_test)
print('Accuracy:', accuracy)

results = pd.concat([results, pd.DataFrame(
    {'Model': 'SVM', 'Accuracy': accuracy}, index=[0])], ignore_index=True)


Accuracy: 0.8614327062228654
