In [163]:
import numpy as np
import pandas as pd
from collections import Counter 
from pprint import pprint
from math import log2
import sys
sys.setrecursionlimit(10000)

In [164]:
df_train = pd.read_csv('spotify_train.csv')
df_test = pd.read_csv('spotify_test.csv')
df_train

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,211682,True,0.732,0.635,1,-7.891,1,0.4100,0.49300,0.000007,0.3880,0.581,147.025,4,hip-hop
1,1,223613,False,0.409,0.570,6,-10.540,0,0.0711,0.68700,0.000000,0.1730,0.336,128.657,4,pop
2,56,243626,False,0.536,0.764,1,-5.174,0,0.0393,0.03020,0.000011,0.1040,0.294,147.585,4,rock
3,8,182413,True,0.843,0.789,7,-2.801,1,0.2470,0.00280,0.000024,0.0322,0.571,125.071,4,hip-hop
4,74,200120,False,0.535,0.765,1,-7.862,0,0.0444,0.05400,0.000000,0.0921,0.373,191.827,4,pop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,206696,False,0.741,0.695,1,-5.341,1,0.0454,0.02100,0.000000,0.0922,0.861,134.032,4,pop
1996,2,550706,False,0.765,0.833,10,-5.162,0,0.0873,0.03760,0.268000,0.0824,0.837,126.422,4,rock
1997,81,331266,False,0.577,0.828,6,-8.691,0,0.0298,0.00213,0.011400,0.0581,0.604,123.257,4,rock
1998,48,199263,False,0.754,0.802,9,-6.424,1,0.0551,0.03490,0.000000,0.3350,0.583,122.026,4,rock


### Decision Tree Construction

In [165]:
# impurity measures from lab
def total(cnt):
    return sum(cnt.values())

def gini(cnt):
    tot = total(cnt)
    return 1 - sum([(v/tot)**2 for v in cnt.values()])

def entropy(cnt):
    tot = total(cnt)
    return sum([(-v/tot) * log2(v/tot) for v in cnt.values()])
    
def wavg(cnt1, cnt2, measure):
    tot1 = total(cnt1)
    tot2 = total(cnt2)
    tot = tot1 + tot2
    return (measure(cnt1) * tot1 + measure(cnt2) * tot2) / tot

In [166]:
def split_df(df, split_col, feature_val, numeric):
    ''' helper function to split dataframe either if is a categorical feature or continuous numerical'''
    if numeric:
        return df[df[split_col] <= feature_val], df[df[split_col] > feature_val]
    else:
        return df[df[split_col] == feature_val], df[df[split_col] != feature_val]

# split functions from lab (determine best splits based on our criteria)
def evaluate_split(df, class_col, split_col, feature_val, measure, numeric):
    ''' eva
    '''
    df1, df2 = split_df(df, split_col, feature_val, numeric)
    cnt1, cnt2 = Counter(df1[class_col]), Counter(df2[class_col])
    return wavg(cnt1, cnt2, measure)

def best_split_for_column(df, class_col, split_col, method, numeric):
    best_v = ''
    best_meas = float("inf")
    
    for v in set(df[split_col]):
        
        meas = evaluate_split(df, class_col, split_col, v, method, numeric)
        if  meas < best_meas:
            best_v = v
            best_meas = meas
    
    return best_v, best_meas

def best_split(df, class_col, cat_cols, method):
    best_col = 0
    best_v = ''
    best_meas = float("inf")
    
    for split_col in [col for col in df.columns if col != class_col]:
        if split_col in cat_cols:
            numeric = False
        else:
            numeric = True
        v, meas = best_split_for_column(df, class_col, split_col, method, numeric=numeric)
        if meas < best_meas:
            best_v = v
            best_meas = meas
            best_col = split_col
                
    return best_col, best_v, best_meas

In [167]:
# which parts of the dataframe of categorical and targeted col
category = ['explicit', 'key', 'mode', 'time_signature']
targeted_class_col = 'track_genre'

def dtree(train, criterion, max_depth=None, curr_depth=0, min_instances=2, target_impurity=0.0):
    ''' build a decision tree classifier
        args.
            - train (dataframe): a training dataset (pandas)
            - criterion: attribution selection method (gini/entropy)
            - max_depth (int >= 0): max depth of tree -- default=None or no max depth [note - root node has depth 0]
            - curr_depth (int): keeps track of depth of the tree and nodes default 0
            - min_instances (int >= 2): minimum number of instances to perform a further split
            - target_impurity (double 0.0-1.0): target impurity at or below to stop node split
            
        returns.
            a tuple of tuples containing:
            - feature/column name (spliting criterion)
            - feature value threshold (splitting criteria)
            - examples in split
            - majority class
            - impurity score
            - depth
            - left subtree ( <= test threshold )
            - right subtree ( > test threshold )
            
    '''
    
    # if dataset empty
    if train is None or len(train)==0:
        return None
    
    majority = train[targeted_class_col].mode()[0]
    
    # if max instances reached or max depth reached
    if (len(train) < min_instances) or (curr_depth == max_depth):
        return (None, None, train, majority, None, curr_depth, None, None)
    else:
        # evaulate splits
        feat, feat_val, imp_score = best_split(train, class_col=targeted_class_col, cat_cols=category, method=criterion)
        if imp_score <= target_impurity: # if impurity is reached, don't split
            return (None, None, train, majority, None, curr_depth, None, None)
        
        # split into left and right values
        left_vals, right_vals = split_df(train, feat, feat_val, feat not in category)
        return (feat,
                feat_val,
                train,
                majority,
                imp_score,
                curr_depth, 
                dtree(left_vals, criterion=criterion, max_depth=max_depth, curr_depth=curr_depth+1, min_instances=min_instances, target_impurity=target_impurity), 
                dtree(right_vals, criterion=criterion, max_depth=max_depth, curr_depth=curr_depth+1, min_instances=min_instances, target_impurity=target_impurity))  

In [168]:
# example tree (will run into recursion limits with max_depth=None and no good target impurity)
#pprint(dtree(df_train, criterion=entropy, max_depth=4, target_impurity=0.3))
# initially printed without the example split dataframe to visualize

### note delete this once we're all set ###

In [169]:
df_train.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,211682,True,0.732,0.635,1,-7.891,1,0.41,0.493,7e-06,0.388,0.581,147.025,4,hip-hop
1,1,223613,False,0.409,0.57,6,-10.54,0,0.0711,0.687,0.0,0.173,0.336,128.657,4,pop
2,56,243626,False,0.536,0.764,1,-5.174,0,0.0393,0.0302,1.1e-05,0.104,0.294,147.585,4,rock
3,8,182413,True,0.843,0.789,7,-2.801,1,0.247,0.0028,2.4e-05,0.0322,0.571,125.071,4,hip-hop
4,74,200120,False,0.535,0.765,1,-7.862,0,0.0444,0.054,0.0,0.0921,0.373,191.827,4,pop


### Model Accuracy
Evaluate validation data and generate a series of predictions

In [170]:
def make_pred(model, row):
    ''' helper function, recursive to make a prediction for one row of data'''
    
    # store variables that make up the tuple
    feat, feat_val, train, majority, imp_score, curr_depth, left_dtree, right_dtree = model
    
    # follow the decision tree model
    if feat == None:
        return majority
    
    # right tree scenario
    if row[feat] > feat_val:
         return make_pred(right_dtree, row) if right_dtree != None else majority
        
    # left tree scenario
    else:
        return make_pred(left_dtree, row) if left_dtree != None else majority

In [171]:
def predict(model, data):
    ''' produce a list of predictions given a dtree model and data'''
    
    if 'track_genre' in data.columns:
        data = data.drop('track_genre', axis=1, inplace=False)
    
    # iterate through each row and make a prediction
    y_pred = [make_pred(model, row) for _, row in data.iterrows()]
    
    # return predictions
    return y_pred

### Hyperparameter Tuning
Search for combination of hyperparameters to produce best validation accuracy
- feature selection (gini/entropy)
- stopping criteria (max_depth, min_instances, target_impurity)

Using 10-fold cross validation to determine overall validation error

In [172]:
# accuracy function helper
def acc(y_pred, y_true):
  assert len(y_pred) == len(y_true)
  num_same = 0
  for x in range(len(y_pred)):
    if y_pred[x] == y_true[x]:
      num_same +=1
  return num_same / len(y_pred)

In [173]:
def n_fold_prediction(model, df, params, folds=10):
    
    # shuffle rows
    df = df.sample(frac=1).reset_index(drop=True)
    
    # initialize vars for best accuracy and model
    best_acc = 0
    best_model = None
    
    for combination in params:
        c1, c2, c3, c4 = combination
        
        accu = 0
        
        # conduct ten fold cross validation for the params
        for f in range(folds):
            
            # create training and test fold
            train_fold = df[df.index % folds != f]
            valid_fold = df[df.index % folds == f]
        
            # build model with training data
            model_train = model(train_fold, criterion=c1, max_depth=c2, min_instances=c3, target_impurity=c4)
            
            # make prediction with valid_fold
            y_pred = predict(model_train, valid_fold)
            
            # define y_true
            y = list(valid_fold.track_genre)
            
            # calc accuracy
            accu += acc(y_pred, y)
            
        # store best accuracy & model
        if accu / folds >= best_acc:
            best_acc = accu / folds
            best_model = model_train
            
    return best_acc, best_model

In [174]:
import itertools
param_grid = {
    'criterion': [gini, entropy],
    'max_depth': [2, 4, 6, 8, 10],
    'min_instances': [2, 5, 10],
    'target_impurity': [0.0, 0.2, 0.4, 0.5]
}
param_values = [v for v in param_grid.values()]
combinations = list(itertools.product(*param_values))

# We truncated combinations because the runtime is really long.
best_acc, best_model = n_fold_prediction(dtree, df_train, params=combinations[0:3])

### Test Final Model
Training using all 2000 songs and testing against test set to report final accuracy
1. only training data used to build model
2. validation data used to estimate model performance with hyperparameters
3. test data only for final estimate of performance and report accuracy

In [175]:
pred = predict(best_model, df_test)
print(f"final accuracy: {acc(pred, df_test.track_genre)}")

final accuracy: 0.649
