# Training

### Library Imports

In [32]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import  LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

### Notebook Imports

In [33]:
import pickle

# Restore the object stored in pkl/df.pkl and bind it to `df`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at files you produced yourself.
with open('pkl/df.pkl', mode='rb') as df_file:
    df = pickle.load(df_file)

### Model Training

In [34]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix.
y = df['label']  # genre variable.
X = df.drop(columns=['label'])  # every column except the label
cols = X.columns

# Split BEFORE scaling: fitting the scaler on the full data set would let the
# min/max of the held-out rows influence the training features (data leakage).
# Same test_size / random_state as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the per-column min/max from the training rows only, then apply that
# same mapping to the test rows. Test values may therefore fall slightly
# outside [0, 1]; that is expected.
min_max_scaler = MinMaxScaler()
X_train = pd.DataFrame(
    min_max_scaler.fit_transform(X_train_raw),
    columns=cols,
    index=X_train_raw.index,  # keep row labels aligned with y_train
)
X_test = pd.DataFrame(
    min_max_scaler.transform(X_test_raw),
    columns=cols,
    index=X_test_raw.index,  # keep row labels aligned with y_test
)

# Keep `np_scaled` and a scaled `X` defined for any later cell that still
# reads them; both now use the training-fitted scaler.
np_scaled = min_max_scaler.transform(X)
X = pd.DataFrame(np_scaled, columns=cols, index=X.index)



### Model Fitting Helper

In [36]:
def model_assess(model, title = "Default"):
    """Fit ``model`` on the training split and print its test-set accuracy.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables, so the train/test split cell must have been run
    first. The estimator is fitted in place; nothing is returned.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    title : str
        Label printed next to the accuracy value.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    # Accuracy is rounded to five decimals for display only.
    print('Accuracy', title, ':', round(accuracy, 5), '\n')

### Model Fitting

In [37]:
# One seed shared by every stochastic step in this cell.
seed = 12
np.random.seed(seed)

# NOTE(review): df_shuffle is not read by any model below; model_assess fits on
# X_train / y_train. Kept only in case a later cell depends on it.
df_shuffle = df.sample(frac=1, random_state=seed).reset_index(drop=True)

# KNN
knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=4, weights='distance', metric="manhattan"),
)
model_assess(knn, "KNN")

# Random Forest
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=seed)
model_assess(rforest, "Random Forest")

# Support Vector Machine
# probability=True makes SVC run an internal, randomised cross-validation to
# calibrate probabilities; seeding it keeps the exported model's probability
# estimates reproducible between runs.
svm = make_pipeline(
    StandardScaler(),
    SVC(decision_function_shape="ovo", probability=True, random_state=seed),
)
model_assess(svm, "Support Vector Machine")

# Logistic Regression
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the installed version before removing it.
lg = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=seed, solver='lbfgs', max_iter=1000, multi_class='multinomial'),
)
model_assess(lg, "Logistic Regression")


Accuracy KNN : 0.7 



Accuracy Random Forest : 0.69 

Accuracy Support Vector Machine : 0.68 

Accuracy Logistic Regression : 0.67667 



### Notebook Exports

In [38]:
# Bundle the fitted estimators under short string keys and pickle them.
models = dict(knn=knn, rforest=rforest, svm=svm, lg=lg)

with open('pkl/models.pkl', mode='wb') as models_file:
    pickle.dump(models, models_file)

# Pickle the data splits as one list; the order is
# [X_test, y_test, X_train, y_train], and any reader must unpack it that way.
with open('pkl/model_training.pkl', mode='wb') as splits_file:
    pickle.dump([X_test, y_test, X_train, y_train], splits_file)
