#### Imports

In [68]:
import pandas as pd
import math

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn import metrics
from sklearn.model_selection import KFold

#### Import DataFrame

In [69]:
df = pd.read_csv('train.csv', index_col=0)
train_x = df[["Energy","Danceability","Liveness","Valence","Acousticness","Speechiness","Popularity"]]
train_y = df[["Rock","Pop","Standards","Metal","Indie","Cabaret","Soul","Wave","Invasion","Hip-hop"]]

#### Model training

In [70]:
def train_classifier(x_train,y_train,x_test,y_test,model):
    avg_accuracy = 0
    avg_precision = 0
    avg_recall = 0
    for i in range (len(y_train.columns)):
        model.fit(x_train,y_train.iloc[:,i])
        len_class = sum(y_train.iloc[:,i])
        y_pred = model.predict(x_test)
        avg_accuracy += len_class * metrics.accuracy_score(y_test[[y_train.columns[i]]],y_pred)
        avg_precision += len_class * metrics.precision_score(y_test[[y_train.columns[i]]],y_pred,zero_division=0)
        avg_recall += len_class * metrics.recall_score(y_test[[y_train.columns[i]]],y_pred,zero_division=0)
    return (avg_accuracy/y_train.shape[0],avg_precision/y_train.shape[0],avg_recall/y_train.shape[0])

#### Cross-Validation

In [71]:
def cross_validation(model):
    num_splits = 10
    folds = KFold(n_splits=num_splits)

    sum_accu,sum_pre,sum_rec = 0,0,0

    #Le train x ici est celui du dataset d'entrainement complet
    for train_index,test_index in folds.split(train_x):
        X_train, X_test = train_x.iloc[train_index],train_x.iloc[test_index]
        Y_train, Y_test = train_y.iloc[train_index],train_y.iloc[test_index]

        metrics = train_classifier(X_train,Y_train,X_test,Y_test,model)
        sum_accu += metrics[0]
        sum_pre += metrics[1]
        sum_rec += metrics[2]

    return (sum_accu/num_splits,sum_pre/num_splits,sum_rec/num_splits)


#### Find best k (for K Nearest Neighbors)

In [72]:
# TODO : Réparer la fonction

def sampled_range(mini, maxi, num):
  if not num:
    return []
  lmini = math.log(mini)
  lmaxi = math.log(maxi)
  ldelta = (lmaxi - lmini) / (num - 1)
  out = [x for x in set([int(math.exp(lmini + i * ldelta)) for i in range(num)])]
  out.sort()
  return out

def find_best_k():
  tested_k_values = sampled_range(1, 1000, 10)
  max_accuracy, best_k = 0, 0
  for k in tested_k_values:
    model = KNeighborsClassifier(n_neighbors=k,weights='distance')
    accuracy = cross_validation(model)
    if max_accuracy < accuracy:
      max_accuracy = accuracy
      best_k = k
  return best_k

#### Models

In [73]:
best_k = 10
knn = KNeighborsClassifier(n_neighbors=best_k,weights='distance')
reg = LogisticRegression(random_state=0)
bay = GaussianNB()

models = [knn,reg,bay]

#### Launch training

In [74]:
for model in models:
    print(model,cross_validation(model))

KNeighborsClassifier(n_neighbors=10, weights='distance') (0.7193885823009276, 0.38923501965299223, 0.34135902815180796)
LogisticRegression(random_state=0) (0.7170956316461423, 0.323991643426657, 0.3343489468452267)
GaussianNB() (0.7169199830197437, 0.38856750070664847, 0.4003895417792195)
