In [95]:
import math
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

In [96]:
df = pd.read_csv('train.csv', index_col=0)
train_x = df[["Energy","Danceability","Liveness","Valence","Acousticness","Speechiness","Popularity"]]
train_y = df[["Rock","Pop","Standards","Metal","Indie","Cabaret","Soul","Wave","Invasion","Hip-hop"]]

df_test = pd.read_csv('test.csv', index_col=0)
test_x = df_test[["Energy","Danceability","Liveness","Valence","Acousticness","Speechiness","Popularity"]]
test_y = df_test[["Rock","Pop","Standards","Metal","Indie","Cabaret","Soul","Wave","Invasion","Hip-hop"]]

In [97]:
def simple_distance(data1, data2):
    """Computes the Euclidian distance between data1 and data2.
    Args:
    data1: a list of numbers: the coordinates of the first vector.
    data2: a list of numbers: the coordinates of the second vector (same length as data1).
    Returns:
    The Euclidian distance: sqrt(sum((data1[i]-data2[i])^2)).
    """
    sum_euclidian = 0
    for i in range(len(data1)):
        if type(data1[i]) == str:
            pass
        else:
            sum_euclidian += math.pow((int(data1[i])-int(data2[i])),2)
    return math.sqrt(sum_euclidian)

In [98]:
for i in range (len(train_y.columns)):
    print(train_y.iloc[:, i].shape)

(1261,)
(1261,)
(1261,)
(1261,)
(1261,)
(1261,)
(1261,)
(1261,)
(1261,)
(1261,)


In [109]:
def classifier(train_x, train_y, test_x, test_y, k):
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance')
    avg_score = 0
    for i in range (len(train_y.columns)):
        knn.fit(X=train_x, y=train_y.iloc[:, i])
        nb_in_class = sum(train_y.iloc[:, i])
        avg_score += nb_in_class * knn.score(test_x,test_y.iloc[:, i])
        print("for genre", train_y.columns[i], "score is", knn.score(test_x,test_y.iloc[:, i]))
    
    return avg_score/train_y.shape[0]

classifier(train_x, train_y, test_x, test_y, 5)

for genre Rock score is 0.5833333333333334
for genre Pop score is 0.7952380952380952
for genre Standards score is 0.9095238095238095
for genre Metal score is 0.9333333333333333
for genre Indie score is 0.9428571428571428
for genre Cabaret score is 0.9571428571428572
for genre Soul score is 0.9714285714285714
for genre Wave score is 0.9833333333333333
for genre Invasion score is 0.9833333333333333
for genre Hip-hop score is 0.9904761904761905


0.7311789584985462

In [100]:
def cross_validation(k):
    num_splits = 10
    folds = KFold(n_splits=num_splits)

    sum_score = 0
    for train_index, test_index in folds.split(train_x):
#       train = df.iloc[result[0]]
#       test =  df.iloc[result[1]]
        X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
        sum_score += classifier(X_train, y_train, X_test, y_test, k)

    return sum_score/num_splits

cross_validation(k = 10)

for genre Rock score is 0.5511811023622047
for genre Pop score is 0.7244094488188977
for genre Standards score is 0.952755905511811
for genre Metal score is 0.8818897637795275
for genre Indie score is 0.937007874015748
for genre Cabaret score is 0.937007874015748
for genre Soul score is 0.968503937007874
for genre Wave score is 0.968503937007874
for genre Invasion score is 1.0
for genre Hip-hop score is 0.9606299212598425
for genre Rock score is 0.48412698412698413
for genre Pop score is 0.6746031746031746
for genre Standards score is 0.9603174603174603
for genre Metal score is 0.9523809523809523
for genre Indie score is 0.9761904761904762
for genre Cabaret score is 0.9444444444444444
for genre Soul score is 0.9841269841269841
for genre Wave score is 0.9047619047619048
for genre Invasion score is 0.9920634920634921
for genre Hip-hop score is 0.9523809523809523
for genre Rock score is 0.5158730158730159
for genre Pop score is 0.5793650793650794
for genre Standards score is 0.98412698412

0.7197668775250378

In [101]:
def sampled_range(mini, maxi, num):
  if not num:
    return []
  lmini = math.log(mini)
  lmaxi = math.log(maxi)
  ldelta = (lmaxi - lmini) / (num - 1)
  out = [x for x in set([int(math.exp(lmini + i * ldelta)) for i in range(num)])]
  out.sort()
  return out

def find_best_k():

  tested_k_values = sampled_range(1, 1000, 10)
  max_accuracy, best_k = 0, 0
  for k in tested_k_values:
    # print("k =", k)
    accuracy = cross_validation(k = k)
    # print("error =", error)
    if max_accuracy < accuracy:
      max_accuracy = accuracy
      best_k = k
  return best_k

find_best_k()

for genre Rock score is 0.5196850393700787
for genre Pop score is 0.6377952755905512
for genre Standards score is 0.905511811023622
for genre Metal score is 0.84251968503937
for genre Indie score is 0.905511811023622
for genre Cabaret score is 0.9212598425196851
for genre Soul score is 0.952755905511811
for genre Wave score is 0.9448818897637795
for genre Invasion score is 0.984251968503937
for genre Hip-hop score is 0.968503937007874
for genre Rock score is 0.5158730158730159
for genre Pop score is 0.6031746031746031
for genre Standards score is 0.9206349206349206
for genre Metal score is 0.9047619047619048
for genre Indie score is 0.9523809523809523
for genre Cabaret score is 0.9206349206349206
for genre Soul score is 0.9444444444444444
for genre Wave score is 0.8888888888888888
for genre Invasion score is 0.9841269841269841
for genre Hip-hop score is 0.9206349206349206
for genre Rock score is 0.5634920634920635
for genre Pop score is 0.5555555555555556
for genre Standards score is 0

99

In [102]:
cross_validation(k = 99)

for genre Rock score is 0.5590551181102362
for genre Pop score is 0.7716535433070866
for genre Standards score is 0.952755905511811
for genre Metal score is 0.889763779527559
for genre Indie score is 0.937007874015748
for genre Cabaret score is 0.937007874015748
for genre Soul score is 0.968503937007874
for genre Wave score is 0.968503937007874
for genre Invasion score is 1.0
for genre Hip-hop score is 0.9606299212598425
for genre Rock score is 0.48412698412698413
for genre Pop score is 0.6984126984126984
for genre Standards score is 0.9603174603174603
for genre Metal score is 0.9444444444444444
for genre Indie score is 0.9761904761904762
for genre Cabaret score is 0.9444444444444444
for genre Soul score is 0.9841269841269841
for genre Wave score is 0.9047619047619048
for genre Invasion score is 0.9920634920634921
for genre Hip-hop score is 0.9523809523809523
for genre Rock score is 0.40476190476190477
for genre Pop score is 0.5634920634920635
for genre Standards score is 0.98412698412

0.7299786737377358

In [103]:
classifier(train_x, train_y, test_x, test_y, 99)

for genre Rock score is 0.5952380952380952
for genre Pop score is 0.819047619047619
for genre Standards score is 0.9142857142857143
for genre Metal score is 0.9452380952380952
for genre Indie score is 0.9476190476190476
for genre Cabaret score is 0.9571428571428572
for genre Soul score is 0.9714285714285714
for genre Wave score is 0.9833333333333333
for genre Invasion score is 0.9833333333333333
for genre Hip-hop score is 0.9904761904761905


0.7432800876099844

In [104]:
simple_distance(df.iloc[1],df.iloc[2])

83.6719785830358

In [105]:
k_nearest_neighbors(train_x.iloc[0],train_x.iloc[2:5],simple_distance,1)

NameError: name 'k_nearest_neighbors' is not defined

In [None]:
simple_distance(train_x.iloc[0],train_x.iloc[5])

114.88254871824527

In [None]:
train_x.iloc[0]

Energy          79
Danceability    50
Liveness        17
Valence         81
Acousticness    17
Speechiness      7
Popularity      39
Name: 2, dtype: int64