In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

In [2]:
FILENAME = './data/model_input_covariates_bow.csv'

RESULTS_FILENAME = './data/genre_results_bow_artist_eng.csv'

DATETIME_FORMAT = '%H:%M:%S'

In [3]:
def MSE(a, b):
    return ((a-b)**2).mean()

def accuracy(a,b):
    return (a == b).sum() / b.shape[0]

In [18]:
# load all data and output
df = pd.read_csv(FILENAME)

df.head()

Unnamed: 0,artist_name,song_title,song_runtime,song_year,song_word_count,song_genre,10,aber,act,actin,...,young,youre,youth,youve,youyou,yuh,zeit,zone,zu,zum
0,a band of bees,the rip on track,00:04:37,0,122,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,a band of bees,left foot stepdown,00:04:06,0,201,,0,0,0,0,...,0,4,0,0,0,0,0,0,0,0
2,a band of bees,horsemen,00:03:28,0,238,,0,0,0,0,...,0,5,0,0,0,0,0,0,0,0
3,a band of bees,chicken payback madlibs soul distortion vocal ...,00:00:00,0,283,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,a band of bees,these are the ghosts,00:03:08,0,174,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# drop the rows where genre is NaN
df_genre = df.dropna(subset=['song_genre'])

print("Rows in full dataframe: {}".format(df.shape[0]))
print("Rows in dataframe after removing rows where genre is NaN: {}".format(df_genre.shape[0]))

Rows in full dataframe: 216675
Rows in dataframe after removing rows where genre is NaN: 14808


In [20]:
# convert datetime to datetime
df_genre['song_runtime'] = pd.to_datetime(df_genre['song_runtime'], format=DATETIME_FORMAT)

df_genre = df_genre.assign(song_runtime_secs=lambda x: (x['song_runtime'].dt.minute * 60 + x['song_runtime'].dt.second))

artist_dummies = pd.get_dummies(df_genre['artist_name'].astype(object), prefix='artist', prefix_sep='_',)
df_genre = pd.concat([df_genre, artist_dummies], axis=1)

X = df_genre.drop(columns = ['artist_name', 'song_genre', 'song_title', 'song_runtime'])
y = df_genre['song_genre']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genre['song_runtime'] = pd.to_datetime(df_genre['song_runtime'], format=DATETIME_FORMAT)


In [21]:
X.head()

Unnamed: 0,song_year,song_word_count,10,aber,act,actin,acting,action,admit,adore,...,artist_zeca pagodinho,artist_zedd,artist_zero 7,artist_zezé di camargo & luciano,artist_ziggy marley,artist_zion & lennox,artist_zoé,artist_zucchero,artist_zz top,artist_zz ward
40,0,435,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,0,271,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84,0,369,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124,0,221,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125,0,221,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
X_train, X_val, y_train, y_val = \
train_test_split(X, y, test_size=0.2, random_state=1234)

In [23]:
k = 1

In [24]:
# p : int, default=2
# Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1) 
neigh = KNeighborsClassifier(n_neighbors=k, p=1)
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1, p=1)

In [25]:
# check training set performance
y_hat_train = neigh.predict(X_train)

In [26]:
# check validation set performance
y_hat_val = neigh.predict(X_val)

In [27]:
d = {
    'Model': 'KNN Classifier', 
    'parameters': 'k=1',
    'X_format': 'Bag of words (tfidf), ngram3, covariates',
    'Y_format': 'Genre',
    'Training set performance (accuracy)': accuracy(y_hat_train, y_train), 
    'Validation set performance (accuracy)': accuracy(y_hat_val, y_val)
    }

results = pd.DataFrame(data=d, index=[0])

results  

Unnamed: 0,Model,parameters,X_format,Y_format,Training set performance (accuracy),Validation set performance (accuracy)
0,KNN Classifier,k=1,"Bag of words (tfidf), ngram3, covariates",Genre,0.795627,0.211344


In [28]:
results.to_csv(RESULTS_FILENAME, index=False)