# Modelling Data Using Machine Learning Techniques

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from datetime import date, datetime

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

### Importing Data

In [None]:
umg_data = pd.read_csv('final_merged_data.csv')
umg_data.head()

In [None]:
## creating a new outcome variable called popularity
# For this variable a song is popular if it scores above 30 for spotify_popularity index
umg_data.loc[umg_data['spotify_popularity'] > 50, 'popular'] = 'Yes'
umg_data.loc[umg_data['spotify_popularity'] <= 50, 'popular'] = 'No'

In [None]:
## Creating an age variable
# the age variable calculates the age of the song from today's date
umg_data['original_release_date'] = pd.to_datetime(umg_data['original_release_date'])
umg_data['today_date'] = date.today()
umg_data['today_date'] = pd.to_datetime(umg_data['today_date'])
umg_data['age'] = umg_data['today_date'].sub(umg_data['original_release_date'], axis=0)
umg_data['age'] = (umg_data.age/np.timedelta64(1, 'D')).astype(int)

In [None]:
umg_data.columns
umg_data_model = umg_data[[
    'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 
         'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'age', 'popular'
         ]]
umg_data_model.head()

In [None]:
umg_data_model.info()

## KNN model

In [None]:
#importing libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
#checking distribution of popular and non-popular songs
plt.figure(figsize=(5,4))
sns.countplot(umg_data_model['popular']);

From the above chart, it looks like there is an unequal distribution of popular and non-popular songs. 

### splitting data into training and test sets

In [None]:
features = ['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 
         'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'age']

X = umg_data_model[features]
y = umg_data_model['popular']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

#scaling the features
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

In [None]:
# fitting the model
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_test)

In [None]:
result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print(result2)
result3 = accuracy_score(y_test, y_pred)
print("Accuracy: ", result3)

Although the accuracy of our model is about 92 percent. The accuracy canbe misleading. For UMG, the usefulness of our model would be in predicting the few songs that have the potential to be popular. The model above does well in predicting None popular songs since they are the most in our data, but does poorly in predicting popular songs since they are rare. We need to improve to predict the rare popular songs.

### Trying SMOTE to oversample rare class

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
over = SMOTE(sampling_strategy=0.3)
under = RandomUnderSampler(sampling_strategy=0.5)
X,y = over.fit_resample(X,y)
X,y = under.fit_resample(X,y)

In [None]:
plt.figure(figsize=(5,4))
sns.countplot(y);

### Retraining Knn model of Resampled Data

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

#scaling the features
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print(result2)
result3 = accuracy_score(y_test, y_pred)
print("Accuracy: ", result3)

Although our accuracy reduced, the precision level, recall and f1-score improved for the Rare popular songs. Now we can predict the Rare popular songs better than before. It would be instructive to try other models like decision trees using our newly resample data and see how well it does.

### Hyperperameter Tuning

In the hyperparameter tuning phase, we will try to find the best leaf_size, n_neighbors, p and see how if we can find the optimal that can improve our model

In [None]:
leaf_size = list(range(1,50))
n_neighbors = list(range(1,25))
p=[1,2]

In [None]:
hyperparams = dict(leaf_size=leaf_size, n_neighbors = n_neighbors, p=p)

#instantiating knn
knn_2 = KNeighborsClassifier()

# using gridsearch and doing 10 ten fold cross validation
clf = GridSearchCV(knn_2, hyperparams, cv=10)

#fitting model
best_model = clf.fit(X,y)

#Showing results
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

In [36]:
## re-writing model using the new parameters
knn = KNeighborsClassifier(n_neighbors=2, leaf_size=2, p=1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print(result2)
result3 = accuracy_score(y_test, y_pred)
print("Accuracy: ", result3)

Confusion Matrix:
[[4642  608]
 [1019 1537]]

Classification Report:
              precision    recall  f1-score   support

          No       0.82      0.88      0.85      5250
         Yes       0.72      0.60      0.65      2556

    accuracy                           0.79      7806
   macro avg       0.77      0.74      0.75      7806
weighted avg       0.79      0.79      0.79      7806

Accuracy:  0.7915705867281578


In [37]:
joblib.dump(knn, "finalKNNmodel.sav")

['finalKNNmodel.sav']

In [38]:
loaded_knn = joblib.load("finalKNNmodel.sav")

y_pred = loaded_knn.predict(X_test)

result = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result)
result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print(result2)
result3 = accuracy_score(y_test, y_pred)
print("Accuracy: ", result3)


Confusion Matrix:
[[4642  608]
 [1019 1537]]

Classification Report:
              precision    recall  f1-score   support

          No       0.82      0.88      0.85      5250
         Yes       0.72      0.60      0.65      2556

    accuracy                           0.79      7806
   macro avg       0.77      0.74      0.75      7806
weighted avg       0.79      0.79      0.79      7806

Accuracy:  0.7915705867281578


### Decision Tree/Resampled Data

In [None]:
def fit_decision_tree_classifier(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 46)

    dt_clf = DecisionTreeClassifier(criterion="entropy", max_depth = 50)
    
    #fit
    dt_clf.fit(X_train,y_train)

    #predict
    y_preds = dt_clf.predict(X_test)
    y_preds_prob = dt_clf.predict_proba(X_test)
    
    #score
    mat = confusion_matrix(y_test, y_preds)
    print(mat)
    print(sns.heatmap(mat, annot=True, cmap='bwr', linewidths=.5))
    acc = accuracy_score(y_test, y_preds)
    print(acc)

    #For cross validation
    print('')
    cv_score = cross_val_score(dt_clf, X, y, cv=10)
    print('*************CV Scores*************')
    print(cv_score)
    print(np.mean(cv_score))
    
    print('')
    print('Sensitivity-Specificity')
    tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
    print(tn, fp, fn, tp)
    
    spec = tn/(tn+fp)
    sens = tp/(tp+fn)
    
    print(spec, sens)

    return acc

In [None]:
fit_decision_tree_classifier(X, y)

### Random Forest/Resampled Data

In [None]:
def fit_random_forest_classifier(X, y):
   
    #First let's create training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 46)

    clf = RandomForestClassifier(n_estimators=1000, max_depth=None)

    clf.fit(X_train, y_train)

    y_preds = clf.predict(X_test)

    mat = confusion_matrix(y_test, y_preds)
    print(mat)
    print(sns.heatmap(mat, annot=True, cmap='bwr', linewidths=.5))
    acc = accuracy_score(y_test, y_preds)
    print(acc)
    
    print('')
    cv_score = cross_val_score(clf, X, y, cv=10)
    print('*************CV Scores*************')
    print(cv_score)
    print(np.mean(cv_score))
    
    print('')
    print('Sensitivity-Specificity')
    tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
    print(tn, fp, fn, tp)
    
    spec = tn/(tn+fp)
    sens = tp/(tp+fn)
    
    print(spec, sens)
    
    return acc

In [None]:
fit_random_forest_classifier(X, y)