# Training

## Library Imports

In [11]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import  LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

In [12]:
import pickle

# Restore `df` from its pickle on disk (presumably written by an earlier
# preprocessing step -- confirm against the notebook that produces it).
# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that this project itself generated.
with open('pkl/df.pkl', 'rb') as df_file:
    df = pickle.load(df_file)

In [21]:
from sklearn.model_selection import train_test_split

y = df['label']  # genre variable (the classification target).
X = df.loc[:, df.columns != 'label']  # every column except the label
cols = X.columns

# Split BEFORE fitting the scaler: fitting on the full dataset would let the
# held-out rows influence the min/max used to scale the training rows
# (train/test leakage) and make the test accuracy optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn the per-column min/max from the training rows only.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train)

# Rebuild DataFrames from the scaled arrays. Passing `index=` keeps the
# original row labels, so the features stay label-aligned with y_train/y_test
# (without it the frames get a fresh 0..n-1 index while y keeps df's index).
# Test values may fall slightly outside [0, 1]; that is expected because the
# scaler never saw those rows.
X_train = pd.DataFrame(min_max_scaler.transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(min_max_scaler.transform(X_test), columns=cols, index=X_test.index)

# Keep `np_scaled` and a scaled `X` available for later cells, using the same
# train-fitted scaler so everything is on one consistent scale.
np_scaled = min_max_scaler.transform(X)
X = pd.DataFrame(np_scaled, columns=cols, index=X.index)



      chroma_stft_mean  chroma_stft_var  rms_mean   rms_var  \
1             0.343065         0.086147  0.112699  0.001450   
2             0.346815         0.092243  0.132003  0.004620   
3             0.363639         0.086856  0.132565  0.002448   
4             0.335579         0.088129  0.143289  0.001701   
5             0.376670         0.089702  0.132618  0.003583   
...                ...              ...       ...       ...   
9985          0.349126         0.080515  0.050019  0.000097   
9986          0.372564         0.082626  0.057897  0.000088   
9987          0.347481         0.089019  0.052403  0.000701   
9988          0.387527         0.084815  0.066430  0.000320   
9989          0.369293         0.086759  0.050524  0.000067   

      spectral_centroid_mean  spectral_centroid_var  spectral_bandwidth_mean  \
1                1816.693777           90525.690866              2010.051501   
2                1788.539719          111407.437613              2084.565132   
3  

In [20]:
# Column-wise preprocessing: standardise every column listed in X.columns and
# pass any other column through unchanged.
# NOTE(review): `preprocessor` is not used by any cell visible in this
# notebook -- confirm whether it is still needed.
numeric_step = ('numeric', StandardScaler(), X.columns)
preprocessor = ColumnTransformer(transformers=[numeric_step], remainder='passthrough')

### Model Accuracy Helper

In [15]:
def model_assess(model, title = "Default", X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Fit ``model``, predict on evaluation data, and print the accuracy.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called
        on the object that is passed in.
    title : str
        Label printed next to the score.
    X_fit, y_fit : optional
        Training features / labels. When omitted, the notebook-level
        ``X_train`` / ``y_train`` are used.
    X_eval, y_eval : optional
        Evaluation features / labels. When omitted, the notebook-level
        ``X_test`` / ``y_test`` are used.

    Returns
    -------
    None
        The accuracy, rounded to 5 decimals, is printed rather than returned.
    """
    # Explicit `is None` checks: DataFrames/arrays have no unambiguous truth
    # value, so `X_fit or X_train` would raise.
    if X_fit is None:
        X_fit = X_train
    if y_fit is None:
        y_fit = y_train
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    model.fit(X_fit, y_fit)
    preds = model.predict(X_eval)
    print('Accuracy', title, ':', round(accuracy_score(y_eval, preds), 5), '\n')

In [16]:
seed = 12
np.random.seed(seed)  # reuse `seed` so the value is defined in exactly one place
# NOTE(review): df_shuffle is not referenced by any cell visible in this
# notebook; the models below are trained on X_train / y_train via
# model_assess. Confirm whether it is still needed.
df_shuffle = df.sample(frac=1, random_state=seed).reset_index(drop=True)

# KNN
knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=4, weights='distance', metric="manhattan"),
)
model_assess(knn, "KNN")

# Random Forest
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=seed)
model_assess(rforest, "Random Forest")

# Support Vector Machine
# probability=True makes SVC run an internal, shuffled cross-validation to
# calibrate probabilities; pinning random_state keeps those estimates
# reproducible, matching the other seeded models in this cell.
svm = make_pipeline(
    StandardScaler(),
    SVC(decision_function_shape="ovo", probability=True, random_state=seed),
)
model_assess(svm, "Support Vector Machine")

# Logistic Regression
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# kept as-is because the installed version is not visible here.
lg = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=seed, solver='lbfgs', max_iter=1000, multi_class='multinomial'),
)
model_assess(lg, "Logistic Regression")


Accuracy KNN : 0.92526 

Accuracy Random Forest : 0.81315 

Accuracy Support Vector Machine : 0.85552 

Accuracy Logistic Regression : 0.72639 



In [24]:
# Bundle the four estimators from the evaluation cell under short keys and
# write them to disk in one pickle.
models = dict(knn=knn, rforest=rforest, svm=svm, lg=lg)
with open('pkl/models.pkl', 'wb') as models_file:
    pickle.dump(models, models_file)

# Persist the train/test split as a list; note the order is
# [X_test, y_test, X_train, y_train] (test first), which any loader must match.
split_bundle = [X_test, y_test, X_train, y_train]
with open('pkl/model_training.pkl', 'wb') as split_file:
    pickle.dump(split_bundle, split_file)
