#### Imports

In [3]:
import pandas as pd
import math

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn import metrics
from sklearn.model_selection import KFold

#### Import DataFrame

In [4]:
df = pd.read_csv('train.csv', index_col=0)
train_x = df[["Energy","Danceability","Liveness","Valence","Acousticness","Speechiness","Popularity"]]
train_y = df[["Rock","Pop","Standards","Metal","Indie","Cabaret","Soul","Wave","Invasion","Hip-hop"]]

df_test = pd.read_csv('test_copy.csv', index_col=0)
test_x = df_test[["Energy","Danceability","Liveness","Valence","Acousticness","Speechiness","Popularity"]]
test_y = df_test[["Rock","Pop","Standards","Metal","Indie","Cabaret","Soul","Wave","Invasion","Hip-hop"]]

#### Model training

In [5]:
def train_classifier(x_train,y_train,x_test,y_test,model):
    avg_accuracy = 0
    avg_precision = 0
    avg_recall = 0
    for i in range (len(y_train.columns)):
        model.fit(x_train,y_train.iloc[:,i])
        len_class = sum(y_train.iloc[:,i])
        y_pred = model.predict(x_test)
        avg_accuracy += len_class * metrics.accuracy_score(y_test[[y_train.columns[i]]],y_pred)
        avg_precision += len_class * metrics.precision_score(y_test[[y_train.columns[i]]],y_pred,zero_division=0)
        avg_recall += len_class * metrics.recall_score(y_test[[y_train.columns[i]]],y_pred,zero_division=0)
    return (avg_accuracy/y_train.shape[0],avg_precision/y_train.shape[0],avg_recall/y_train.shape[0])

#### Mini_Classifier to test some columns and outputs

In [6]:
def mini_classifier(x_train,y_train,x_test,y_test,model):
    for i in range (len(y_train.columns)):
        model.fit(x_train,y_train.iloc[:,i])
        y_pred = model.predict(x_test)
        print(y_pred)
        print(y_test.iloc[:,i])

best_k = 10
knn = KNeighborsClassifier(n_neighbors=best_k,weights='distance')
reg = LogisticRegression(random_state=0)
bay = GaussianNB()
mini_classifier(train_x,train_y,test_x,test_y,knn)

[0 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 0 0 0 0 1 0 0 0 1
 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1
 1 1 1 1 1 0 0 1 0 1 1 1 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 1 1 1 1 0 1 0 1 0 0
 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 1
 1 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1
 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 1
 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 1 1 1 0 0 0 0 1 0 0 1 0 1 1 1 0 0 0 1 1
 0 1 1 1 1 1 0 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 1 1 0 0 0 1 0 1 0 0 1 1 0 1 0 0 1 1 0 1 1 1 1 0 0 0 0 0 1 1 1 0
 0 1 0 1 1 1 1 1 1 0 0 0 0]
Index
1       0
3       0
4       0
7       0
8       1
       ..
1985    1
1989    0
1990    0
1992    1
1994    0
Name: Rock, Length: 420, dtype:

#### Cross-Validation

In [7]:
def cross_validation(model):
    num_splits = 10
    folds = KFold(n_splits=num_splits)

    sum_accu,sum_pre,sum_rec = 0,0,0

    # Train_x here is the full training dataset
    for train_index,test_index in folds.split(train_x):
        X_train, X_test = train_x.iloc[train_index],train_x.iloc[test_index]
        Y_train, Y_test = train_y.iloc[train_index],train_y.iloc[test_index]

        metrics = train_classifier(X_train,Y_train,X_test,Y_test,model)
        sum_accu += metrics[0]
        sum_pre += metrics[1]
        sum_rec += metrics[2]
    return (sum_accu/num_splits,sum_pre/num_splits,sum_rec/num_splits)


#### Find best k (for K Nearest Neighbors)

In [8]:
def sampled_range(mini, maxi, num):
  if not num:
    return []
  lmini = math.log(mini)
  lmaxi = math.log(maxi)
  ldelta = (lmaxi - lmini) / (num - 1)
  out = [x for x in set([int(math.exp(lmini + i * ldelta)) for i in range(num)])]
  out.sort()
  return out

def find_best_k():
  tested_k_values = sampled_range(1, 100, 35)
  max_accuracy, best_k = 0, 0
  for k in tested_k_values:
    model = KNeighborsClassifier(n_neighbors=k,weights='distance')
    accuracy = cross_validation(model)[0]
    if max_accuracy < accuracy:
      max_accuracy = accuracy
      best_k = k
  return best_k

#### Models

In [9]:
best_k = find_best_k()
knn = KNeighborsClassifier(n_neighbors=best_k,weights='distance')
reg = LogisticRegression(random_state=0)
bay = GaussianNB()
svm = SVC(C=10, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear', cache_size=3000)

models = [knn,reg,bay]

In [10]:
best_k

100

#### Launch training

In [11]:
for model in models:
    print(model,cross_validation(model))

KNeighborsClassifier(n_neighbors=100, weights='distance') (0.7311570133517942, 0.28414419902499477, 0.4122047218166668)
LogisticRegression(random_state=0) (0.7170956316461423, 0.323991643426657, 0.3343489468452267)
GaussianNB() (0.7169199830197437, 0.38856750070664847, 0.4003895417792195)


| Model         | Accuracy     | Precision | Recall
|--------------|-----------|------------|---|
| KNeighborsClassifier | 73.0 %     | 28.4 %| 41.1 %
| LogisticRegression      | 71.7 %  | 32.4 %| 33.4 %
| Naive Bayes | 71.7 % | 38.9 % | 40.0 %
| SVC | 61.6 % | 35.4 % | 63.8 % 

In [12]:
model_choosen = [knn,reg,bay]

for model in model_choosen:
    print(model,train_classifier(train_x,train_y,test_x,test_y,model))

KNeighborsClassifier(n_neighbors=100, weights='distance') (0.7444903893357501, 0.2942027962152238, 0.3995684488132342)
LogisticRegression(random_state=0) (0.7327480080057399, 0.3801805648752516, 0.3337867079775495)
GaussianNB() (0.7269532872625656, 0.42545437518088514, 0.4180930571863772)


| Model         | Accuracy     | Precision | Recall
|--------------|-----------|------------|---|
| KNeighborsClassifier | 64.4 %     | 25.4 %| 50.8%
| LogisticRegression      | 64.4 %  | 25.4 %| 50.8 %
| Naive Bayes | 61.6 % |25.4 % | 50.8 %