In [1]:
import joblib
import numpy as np 
import pandas as pd
import re
import sklearn
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv('./tracks_features.csv')

In [3]:
# Convert the 'explicit' column from Boolean to Integer
def explicit_converter(x):
    if x:
        return 1 
    else:
        return 0

In [4]:
# Bins years into decades
def decade_function(x):
    if x in range(1900,1991):
        return '1900-1990'
    elif x in range(1991,1996):
        return '1991-1995'
    elif x in range(1996,2001):
        return '1996-2000'
    elif x in range(2001,2006):
        return '2001-2005'
    elif x == 2006:
        return '2006' 
    elif x == 2007:
        return '2007'
    elif x == 2008:
        return '2008'
    elif x == 2009:
        return '2009'
    elif x == 2010:
        return '2010'
    elif x == 2011:
        return '2011'
    elif x == 2012:
        return '2012'
    elif x == 2013:
        return '2013'
    elif x == 2014:
        return '2014'
    elif x == 2015:
        return '2015'
    elif x == 2016:
        return '2016'
    elif x == 2017:
        return '2017'
    elif x == 2018:
        return '2018'
    elif x == 2019:
        return '2019'
    elif x == 2020:
        return '2020'
    elif x == 2021:
        return '2021'
        

In [5]:
def wrangle(df):
    df_func = df.copy()
    # Find and remove duplicate songs from dataframe
    df_func['artists_new'] = df_func['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))
    df_func['artists_new2'] = df_func['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
    df_func['artists_final'] = np.where(df_func['artists_new'].apply(lambda x: not x), 
                                   df_func['artists_new2'], df_func['artists_new'])
    df_func['artists_song'] = df_func.apply(lambda row: row['artists_final'][0]+row['name'],axis = 1)
    df_func.drop_duplicates('artists_song', inplace = True)
    
    # Apply binarizer function
    df_func['explicit'] = df_func['explicit'].apply(explicit_converter)
    
    # Remove observations where the 'year' column is 0
    df_func['year'] = df_func['year'].replace(0, np.nan)
    df_func.dropna(subset = ['year'], inplace = True)
    
    # Apply binning into decades function
    df_func['age'] = df_func['year'].apply(np.int64)
    df_func['age'] = df_func['age'].apply(decade_function)
        
    return df_func

In [6]:
df1 = wrangle(df)

In [7]:
df2 = df1.copy()

In [8]:
# Drop no longer necessary columns
cols_to_drop = ['name', 'album', 'artists_final', 'id', 'album_id','artists', 
                    'artist_ids', 'track_number', 'disc_number', 'release_date', 
                    'artists_new', 'artists_new2', 'artists_song', 'duration_ms']
df2.drop(columns = cols_to_drop, inplace = True)

In [9]:
# One Hot Encode 'age' column
ohe_enc = fe_OneHotEncoder(
    top_categories=None,
    variables=['age'], drop_last=True)  # to return k-1, false to return k

ohe_enc.fit(df2.fillna('Missing'))

OneHotEncoder(drop_last=True, variables=['age'])

In [10]:
# Pickle One-Hot-Encoder for prediction transformation
ohe_filename = 'ohe.joblib' 
joblib.dump(ohe_enc, open(ohe_filename, 'wb'), compress=True)

In [11]:
df2 = ohe_enc.transform(df2.fillna('Missing'))

In [12]:
#Scale numeric columns in dataframe
scale_cols = ['danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'key', 'mode']
scaled = df2[scale_cols]
scaler = MinMaxScaler()
scaled_float_df = pd.DataFrame(scaler.fit_transform(scaled), columns = scaled.columns)

In [13]:
# Pickle Scaler for prediction transformation
scaler_filename = 'scaler.joblib' 
joblib.dump(scaler, open(scaler_filename, 'wb'), compress=True)

In [14]:
# Drop scaled features
dropper = ['danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'year', 'key', 'mode']
df2 = df2.drop(columns = dropper)

In [15]:
df1 = df1.reset_index()
df2 = df2.reset_index()
scaled_float_df.reset_index()

Unnamed: 0,index,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,key,mode
0,0,0.470,0.978,0.812104,0.075026,0.026205,0.000011,0.3560,0.503,0.473644,0.8,0.636364,1.0
1,1,0.599,0.957,0.806675,0.194014,0.012952,0.000071,0.1550,0.489,0.416496,0.8,1.000000,1.0
2,2,0.315,0.970,0.811732,0.498452,0.023494,0.000002,0.1220,0.370,0.601561,0.8,0.636364,1.0
3,3,0.440,0.967,0.805694,0.244582,0.163655,0.000004,0.1210,0.574,0.388665,0.8,1.000000,0.0
4,4,0.426,0.929,0.792322,0.072343,0.001627,0.105000,0.0789,0.539,0.510412,0.8,0.181818,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126171,1126171,0.264,0.966,0.788738,0.069350,0.009388,0.002240,0.3370,0.415,0.641078,0.8,0.454545,0.0
1126172,1126172,0.796,0.701,0.794211,0.091125,0.104418,0.644000,0.0749,0.781,0.490009,0.8,1.000000,0.0
1126173,1126173,0.785,0.796,0.803760,0.058204,0.030522,0.918000,0.0664,0.467,0.490074,0.8,0.818182,0.0
1126174,1126174,0.665,0.856,0.791445,0.042208,0.000070,0.776000,0.1170,0.227,0.502085,0.8,0.545455,0.0


In [16]:
df_train = pd.concat([df2, scaled_float_df], axis = 1)

In [17]:
# Drop unnecessary index
df_train = df_train.drop(['index'], axis = 1)

In [28]:
df_train.to_csv('df_train.csv')

# Nearest Neighbors (Brute Force Algo)

In [22]:
# Fit on DTM
nn = NearestNeighbors(n_neighbors=6, algorithm= 'brute', n_jobs = -1)
nn.fit(df_train)

NearestNeighbors(algorithm='brute', n_jobs=-1, n_neighbors=6)

In [23]:
# sample a doc from dtm to use as our query point 
doc_index = 1126175
doc = [df_train.iloc[doc_index].values]

# Query Using kneighbors 
neigh_dist, neigh_index = nn.kneighbors(doc)

print(neigh_dist)
print(neigh_index)

[[0.         0.26015085 0.2689481  0.30354453 0.30689897 0.30924778]]
[[1126175  235491  960438  851759  617669  673143]]


# Pickle the Model

In [24]:
model_filename = 'knn_model.joblib' 
joblib.dump(nn, open(model_filename, 'wb'), compress=True)