In [1]:
import pandas as pd

import numpy as np

import datetime as dt

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

from sklearn.cluster import KMeans

from scipy.optimize import linear_sum_assignment as linear_assignment



In [2]:
DATA_PATH = './data/'

X_FILENAME_BOW = 'model_input_bow.csv'
X_FILENAME_BOW_COV = 'model_input_covariates_bow.csv'
X_FILENAME_BOW_N3_COV = 'model_input_covariates_bow_Tfid_n3.csv'
X_FILENAME_TFIDF = 'model_input_bow_Tfid.csv'
X_FILENAME_TFIDF_COV = 'model_input_covariates_bow_Tfid.csv'
X_FILENAME_TFIDF_N3_COV = 'model_input_covariates_bow_Tfid_n3.csv'

RESULTS_FILENAME = 'results.csv'

DATETIME_FORMAT = '%M:%S'

In [None]:
# load all data and output
bow_df = pd.read_csv(DATA_PATH + X_FILENAME_BOW)

bow_df.head()

In [None]:
# load all data and output
bow_cov_df = pd.read_csv(DATA_PATH + X_FILENAME_BOW_COV)

bow_cov_df.head()

In [None]:
# load all data and output
bow_n3_cov_df = pd.read_csv(DATA_PATH + X_FILENAME_BOW_N3_COV)

bow_n3_cov_df.head()

In [None]:
# load all data and output
tfidf_cov_df = pd.read_csv(DATA_PATH + X_FILENAME_TFIDF_COV)

tfidf_cov_df.head()

In [None]:
# load all data and output
tfidf_n3_cov_df = pd.read_csv(DATA_PATH + X_FILENAME_TFIDF_N3_COV)

tfidf_n3_cov_df.head()

In [None]:
# load all data and output
tfidf_df = pd.read_csv(DATA_PATH + X_FILENAME_TFIDF)

tfidf_df.head()

In [None]:
def MSE(a, b):
    return ((a-b)**2).mean()

def accuracy(a,b):
    return (a == b).sum() / b.shape[0]

# to calculate k means performance, 
# see Ahmad Obeid's answer in 
# https://datascience.stackexchange.com/questions/17461/how-to-test-accuracy-of-an-unsupervised-clustering-model-output
def cluster_acc(y_true, y_pred):
    """
    Calculate clustering accuracy. Require scikit-learn installed
    # Arguments
        y: true labels, numpy.array with shape `(n_samples,)`
        y_pred: predicted labels, numpy.array with shape `(n_samples,)`
    # Return
        accuracy, in [0,1]
    """
    y_true = y_true.astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    ind = linear_assignment(w.max() - w)
    
    # new linear_assignment function may return different data format, modified function
    i = 0
    x = 0

    for j in ind[1]:
        x += w[i, j]
        i += 1

    return x * 1.0 / y_hat_train.size

In [None]:
input_df = bow_df

In [None]:
# convert datetime to datetime
input_df['song_runtime'] = pd.to_datetime(input_df['song_runtime'], format=DATETIME_FORMAT)

# take seconds, knowing that we cap minutes at 30 (we don't have to look at hours)
input_df['song_runtime'].dt.minute

input_df = input_df.assign(song_runtime_secs=lambda x: (x['song_runtime'].dt.minute * 60 + x['song_runtime'].dt.second))

In [None]:
# X_train, X_val, y_train, y_val = \
# train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=1234)

In [88]:
#Set up the results data frame
model = []
parameters = []
X_format = []
Y_format = []
train_perf = []
val_perf = []
test_perf = []

In [89]:
k_values = [1, 10, 50, 100, 1000]

for k in k_values:

    model.append('KNN Classifier')
    X_format.append('Bag of words')
    Y_format.append('Artist')
    
    # p : int, default=2
    # Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1) 
    neigh = KNeighborsClassifier(n_neighbors=k, p=1)
    neigh.fit(X_train, y_train)
    
    # check training set performance
    y_hat_train = neigh.predict(X_train)
    train_perf.append(accuracy(y_hat_train, y_train))
    
    # check validation set performance
    y_hat_val = neigh.predict(X_val)
    val_perf.append(accuracy(y_hat_val, y_val))
    
    test_perf.append(np.nan)
    
d = {
    'Model': model, 
    'parameters': k_values,
    'X_format': ,
    'X_format': ,
    'X_format': ,
    'Training set performance (accuracy)': train_perf, 
    'Validation set performance (accuracy)': val_perf,
    'X_format': 
    }

results = pd.DataFrame(data=d)

results    

Unnamed: 0,Model,K value,Training set performance (accuracy),Validation set performance (accuracy)
0,KNN Classifier,1,0.992604,0.177515
1,KNN Classifier,10,0.174556,0.201183
2,KNN Classifier,50,0.181213,0.106509
3,KNN Classifier,100,0.115385,0.076923
4,KNN Classifier,1000,0.203402,0.183432


We don't see validation set performance increase past k=10, for our k-nearest neighbour model, and thus will stick with that value.

In [90]:
n_classes = y.unique().shape[0]

y_train_coded = pd.Categorical(y_train).codes
y_val_coded = pd.Categorical(y_val).codes

k_values.append(np.nan)

model.append('KMeans Classifier')

kmeans = KMeans(n_clusters = n_classes)
kmeans.fit(X_train, y_train)

# check training set performance
y_hat_train = kmeans.predict(X_train)
train_perf.append(cluster_acc(y_train_coded, y_hat_train))

# check validation set performance
y_hat_val = kmeans.predict(X_val)
val_perf.append(cluster_acc(y_val_coded, y_hat_val))

d = {'Model': model, 'K value': k_values, 'Training set performance (accuracy)': train_perf, 'Validation set performance (accuracy)': val_perf}
results = pd.DataFrame(data=d)

results

Unnamed: 0,Model,K value,Training set performance (accuracy),Validation set performance (accuracy)
0,KNN Classifier,1.0,0.992604,0.177515
1,KNN Classifier,10.0,0.174556,0.201183
2,KNN Classifier,50.0,0.181213,0.106509
3,KNN Classifier,100.0,0.115385,0.076923
4,KNN Classifier,1000.0,0.203402,0.183432
5,KMeans Classifier,,0.237426,0.029586


We also see that, for all values of K tested, it beats out an unsupervised model.

We will attempt to increase the scope of the problem, to our intuitive understanding of its most complex, by predicting the artist, where number of artists = ~12000 

In [91]:
y = input_df['Artist Name']

# 80:10:10 split between train/val/test i.e. train:other 80:20
X_train, X_test_val, y_train, y_test_val = \
train_test_split(X, y, test_size=0.2, random_state=1234)

# 80:10:10 split between train/val/test; i.e. val and test are equal size
X_val, X_test, y_val, y_test = \
train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=1234)

In [92]:
ideal_k = 10
k_values.append(ideal_k)

model.append('KNN Classifier')

# p : int, default=2
# Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1) 
neigh = KNeighborsClassifier(n_neighbors=ideal_k, p=1)
neigh.fit(X_train, y_train)

# check training set performance
y_hat_train = neigh.predict(X_train)
train_perf.append(accuracy(y_hat_train, y_train))

# check validation set performance
y_hat_val = neigh.predict(X_val)
val_perf.append(accuracy(y_hat_val, y_val))
    
d = {'Model': model, 'K value': k_values, 'Training set performance (accuracy)': train_perf, 'Validation set performance (accuracy)': val_perf}
results = pd.DataFrame(data=d)

results

Unnamed: 0,Model,K value,Training set performance (accuracy),Validation set performance (accuracy)
0,KNN Classifier,1.0,0.992604,0.177515
1,KNN Classifier,10.0,0.174556,0.201183
2,KNN Classifier,50.0,0.181213,0.106509
3,KNN Classifier,100.0,0.115385,0.076923
4,KNN Classifier,1000.0,0.203402,0.183432
5,KMeans Classifier,,0.237426,0.029586
6,KNN Classifier,10.0,0.116124,0.005917


In [93]:
results.to_csv(DATA_PATH + RESULTS_FILENAME, index=False)