In [4]:
%load_ext autoreload

In [5]:
#from cap_package import SpotipyCollectPub as scp
from cap_package import ReadTransform as rt
from collections import OrderedDict
from datetime import datetime
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
%matplotlib inline

In [6]:
%autoreload 2

In [7]:
from IPython.utils.text import columnize
def disp_col(list_):
    """Print the items of ``list_`` laid out in columns.

    Each item is rendered with ``repr`` and followed by a comma; the resulting
    strings are arranged by ``columnize`` for a display width of 120 and
    printed.

    Requires ``from IPython.utils.text import columnize`` to be in scope.
    """
    cells = [repr(item) + ',' for item in list_]
    print(columnize(cells, displaywidth=120))
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Called without an argument, returns the current ``datetime`` so it can be
    passed back in later. Called with a previously returned ``start_time``,
    prints the elapsed time as hours, minutes and seconds (seconds rounded to
    two decimals) and returns ``None``.
    """
    if start_time is None:
        return datetime.now()

    elapsed_sec = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_sec, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))

In [8]:
# Populate the process environment from a .env file (python-dotenv).
load_dotenv()

# Root folder of the dataset, read from the 'PATH_DATASET1.2' environment
# variable; all files used below live under its 'user_pl_featstats' subfolder.
dataset_root = os.getenv('PATH_DATASET1.2')
if dataset_root is None:
    # os.getenv returns None for an unset variable, which would otherwise
    # surface as an opaque TypeError inside Path().
    raise RuntimeError(
        "Environment variable 'PATH_DATASET1.2' is not set; "
        "define it in the environment or in the .env file."
    )
path = Path(dataset_root).joinpath('user_pl_featstats')

In [9]:
#enc_labels = np.loadtxt(path.joinpath('enc_labels.csv'), delimiter=',')
# Read enc_categories.csv as a comma-delimited array of unicode strings.
enc_categories_file = path / 'enc_categories.csv'
enc_categories = np.loadtxt(enc_categories_file, dtype='U', delimiter=',')

In [10]:
# Read every file in the 'user_pl_segstat' folder and stack them into a single
# frame. Path.iterdir() yields entries in arbitrary order, so the files are
# sorted first to make the row order of segstat_df reproducible across runs.
path_ = path.joinpath('user_pl_segstat')
dfs = [pd.read_parquet(f) for f in sorted(path_.iterdir())]

segstat_df = pd.concat(dfs, ignore_index=True)

In [11]:
# Read the per-playlist feature files from the 'user_pl_feat' folder. Each
# frame gets a leading 'playlist' column holding the file name with the
# '_features.parquet' suffix removed.
# Path.iterdir() yields entries in arbitrary order; sorting makes the row order
# reproducible, which matters because the duplicate-track tagging applied to
# feat_df later depends on which occurrence of a name comes first.
path_ = path.joinpath('user_pl_feat')
dfs = []
for f in sorted(path_.iterdir()):
    df = pd.read_parquet(f)
    pl_col = [f.name.replace('_features.parquet', '')] * len(df)
    df.insert(loc=0, column='playlist', value=pl_col)
    dfs.append(df)

full_feat_df = pd.concat(dfs, ignore_index=True)
# Label-based slice: keep every column up to and including 'artists_name'.
feat_df = full_feat_df.loc[:, :'artists_name']

##### We will merge the segstat and feature dataframes to create one dataset
Before that, we must make sure that the columns used as merge keys (`track_name` and `playlist`) match in both dataframes.

In [12]:
# Rename 'name' to 'track_name' so the column matches the one in segstat_df.
feat_df = feat_df.rename(columns={'name': 'track_name'})

# Rewrite the track names so their values match those in segstat_df:
#   <track name>_<first 3 characters of artists_name>
upd_trname = feat_df['track_name'] + '_' + feat_df['artists_name'].str[:3]
# Strip the characters * | > < : " ? / and backslash.
upd_trname = upd_trname.replace(regex=r'[*|><:"?/]|\\', value='')
# Every repeat of an already-seen name gets a '_dup' suffix. A boolean mask is
# used rather than passing index labels to .iloc, which is positional and was
# only correct while the index happened to be a default RangeIndex.
is_repeat = upd_trname.duplicated()
upd_trname = upd_trname.mask(is_repeat, upd_trname + '_dup')

feat_df['track_name'] = upd_trname

In [13]:
# Outer-join the feature and segment-statistics frames on track name and
# playlist, keeping rows that appear in only one of the two frames.
merge_keys = ['track_name', 'playlist']
new_df = feat_df.merge(segstat_df, on=merge_keys, how='outer')
disp_col(new_df.columns)

'playlist',          'key_10',              'timbre_04_max',   'timbre_11_mean',      'timbre_06_skewness',
'danceability',      'key_11',              'timbre_05_max',   'timbre_12_mean',      'timbre_07_skewness',
'energy',            'track_name',          'timbre_06_max',   'timbre_01_min',       'timbre_08_skewness',
'loudness',          'artists_name',        'timbre_07_max',   'timbre_02_min',       'timbre_09_skewness',
'speechiness',       'timbre_01_kurtosis',  'timbre_08_max',   'timbre_03_min',       'timbre_10_skewness',
'acousticness',      'timbre_02_kurtosis',  'timbre_09_max',   'timbre_04_min',       'timbre_11_skewness',
'instrumentalness',  'timbre_03_kurtosis',  'timbre_10_max',   'timbre_05_min',       'timbre_12_skewness',
'valence',           'timbre_04_kurtosis',  'timbre_11_max',   'timbre_06_min',       'timbre_01_std',     
'tempo',             'timbre_05_kurtosis',  'timbre_12_max',   'timbre_07_min',       'timbre_02_std',     
'key_0',             'timbre

In [14]:
# Show the last rows of the merged frame.
new_df.tail()

Unnamed: 0,playlist,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo,key_0,...,timbre_03_std,timbre_04_std,timbre_05_std,timbre_06_std,timbre_07_std,timbre_08_std,timbre_09_std,timbre_10_std,timbre_11_std,timbre_12_std
367,That familiar trance,0.489011,0.871102,0.348074,0.076179,0.096996,0.03186,0.410039,0.167805,0.0,...,53.760912,28.658233,24.945681,24.083093,23.611131,12.169136,12.933405,11.338227,15.863007,13.098306
368,That familiar trance,0.53663,0.796258,0.411248,0.047158,0.034256,0.92974,0.043677,0.169388,0.0,...,38.556718,30.325408,19.408703,27.986655,18.836288,18.97026,13.694174,19.440721,12.966388,12.459445
369,That familiar trance,0.668498,0.602911,0.425424,0.085852,0.594702,0.901843,0.481747,0.334444,0.0,...,41.415842,39.999842,14.649795,39.054745,18.87563,18.520975,15.517733,16.299007,17.54593,11.477168
370,That familiar trance,0.401099,0.835759,0.502311,0.054817,0.003291,0.908042,0.356258,0.280453,0.0,...,27.21515,30.004148,17.366065,16.325543,20.910253,13.603091,9.409996,12.02694,12.610697,10.76045
371,That familiar trance,0.941392,0.808732,0.58228,0.158807,0.401375,0.877045,0.908735,0.168389,0.0,...,57.739638,68.54128,29.603303,58.980708,20.423573,29.040059,15.927373,23.444979,32.022451,13.650441


In [15]:
# Feature matrix: the merged frame without the three text columns.
data_df = new_df.drop(columns=['track_name', 'artists_name', 'playlist'])
# Target: the 'playlist' column, encoded as integers by LabelEncoder.
labels = new_df['playlist']
enc_labels = LabelEncoder().fit_transform(labels)