# This is Jennifer's first pass at the data to do EDA

### EDA and preprocessing

In [None]:
from glob import glob
import pandas as pd
import numpy as np
from collections import Counter
import pickle

import matplotlib.pyplot as plt
%matplotlib inline
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8*' in matplotlib 3.6 and the
# old 'seaborn' alias was later removed. Pick whichever name this install provides so the
# cell runs on both old and new matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Green used as the bar colour in the charts further down.
spotify_color = '#5bb560'

In [None]:
# Read the per-track feature table and preview its first rows.
file_path = 'data/track_features/tf_mini.csv'
track_features = pd.read_csv(filepath_or_buffer=file_path)
track_features.head()

In [None]:
# Summary of the track-feature table (column dtypes and non-null counts).
track_features.info()

In [None]:
# Read the session log (one row per track played within a session) and preview it.
log_mini_path = 'data/training_set/log_mini.csv'
log_mini = pd.read_csv(filepath_or_buffer=log_mini_path)
log_mini.head()

In [None]:
# what is the distribution of skipping

# plt.hist((log_mini.groupby('session_id')['skip_1'].sum()/log_mini.groupby('session_id')['session_length'].max()), bins = 20, alpha = 0.5, color = 'g')
# plt.hist((log_mini.groupby('session_id')['skip_2'].sum()/log_mini.groupby('session_id')['session_length'].max()), bins = 20, alpha = 0.5, color = 'blue')
# plt.hist((log_mini.groupby('session_id')['skip_3'].sum()/log_mini.groupby('session_id')['session_length'].max()), bins = 20, alpha = 0.5, color = 'purple')

# plt.show();

In [None]:
# Per-session rate: number of not_skipped rows divided by the session's length.
sessions = log_mini.groupby('session_id')
not_skip_rate = sessions['not_skipped'].sum() / sessions['session_length'].max()

plt.hist(not_skip_rate, bins=20)
plt.title('Distribution of not_skip rates per session')

plt.show();

In [None]:
# Average, over sessions, of the per-session not_skipped rate (not_skipped count / session length).
sessions = log_mini.groupby('session_id')
(sessions['not_skipped'].sum() / sessions['session_length'].max()).mean()

# author's reading of the output: on average, people listen to about a third of the
# tracks in a session to completion

In [None]:
# log_mini.session_length.unique() # this tells me that I need to normalize the skip rate with the session length

In [None]:
# Summary of the raw session log (column dtypes and non-null counts) before any preprocessing.
log_mini.info()

In [None]:
# Goal: turn the date into a datetime, extract the weekday to look for weekly seasonality,
# then one-hot encode every categorical column of the session log.
# NOTE: the name `dt` imported here is not referenced in this cell; `.dt` below is the pandas
# datetime accessor.
from datetime import datetime as dt

log_mini['date'] = pd.to_datetime(log_mini['date'])
# dayofweek is an integer where Monday is 0 and Sunday is 6
log_mini['weekday'] = log_mini['date'].dt.dayofweek

# Categorical column -> prefix of the dummy columns generated from it.
# The first level of each column is dropped (drop_first=True).
dummy_prefixes = {
    'weekday': 'weekday',
    'hour_of_day': 'hour',
    'session_position': 'sess_pos',
    'hist_user_behavior_reason_start': 'start_hist_b',
    'hist_user_behavior_reason_end': 'end_hist_b',
    'context_type': 'context',
}
for source_column, dummy_prefix in dummy_prefixes.items():
    dummies = pd.get_dummies(log_mini[source_column], prefix=dummy_prefix, drop_first=True)
    log_mini = pd.concat([log_mini, dummies], axis=1)

# Drop the raw categoricals (now encoded) together with the id, date and pause columns.
# errors='ignore' keeps the call from failing if a listed column is absent.
columns_to_drop = [
    'weekday', 'hour_of_day', 'session_position', 'context_type', 'date', 'track_id_clean', 'track_id',
    'session_id', 'short_pause_before_play', 'long_pause_before_play',
    'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end',
]
log_mini = log_mini.drop(columns=columns_to_drop, errors='ignore')



Upon initial reading, I thought the problem was a classification problem in which we're asked to predict whether a user will skip this song or not (binary outcome). Now that I look at the df some more, I see that there are varying degrees of 'skip'. I think I'm going to try to simplify this problem first, and delve deeper when I've made modelling progress.

Let's first get familiar with the data by looking at the log of a single session.

In [None]:
# session_log_mask = (log_mini.session_id == log_mini.session_id[0])
# one_session = log_mini[session_log_mask]
# one_session_detailed = pd.merge(one_session, track_features, how = 'left', left_on='track_id_clean', right_on = 'track_id')

In [None]:
# to simplify this problem I only care about if they played the entire song or not (skip def)
# one_session_detailed.drop(['skip_1', 'skip_2', 'skip_3', 'session_id', 'track_id_clean', 'track_id'], axis = 1, 
#                           inplace = True, errors = 'ignore')

In [None]:
# one_session_detailed.info()
# my target feature is 'not_skipped'

In [None]:
# i want to see the distribution of songs that play all the way through


# plt.hist(log_mini.groupby('session_id')['not_skipped'].sum(), bins = 20)
# plt.title('Distribution of songs that were not skipped per session')
# plt.show();

In [None]:
# I want to make sure to compare it against a dummy classifier so i can establish a baseline
# i need to scale things

# Inspect the columns and dtypes of log_mini as it stands after the preprocessing cell.
log_mini.info()

### Modelling without dummifying


Justification for starting with RandomForest:



In [None]:
# log_mini.dtypes

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, classification_report
from sklearn.linear_model import LogisticRegression

# Feature ranking with recursive feature elimination.
from sklearn.feature_selection import RFE

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier


# Fresh, unprocessed copy of the session log (log_mini itself has been dummified by now).
log_mini_no_process = pd.read_csv('data/training_set/log_mini.csv')
# Track id plus target only; used later to model skipping from track metadata alone.
skipped_data = log_mini_no_process.loc[:, ['track_id_clean', 'not_skipped']].copy()

# Binary flag: 1 where the track's mode is 'major', 0 otherwise.
track_features['is_major'] = track_features['mode'].eq('major').astype(int)

# Attach the track features to each log row (default inner join), then drop the join keys.
session_with_track_info = log_mini_no_process.merge(
    track_features, left_on='track_id_clean', right_on='track_id'
).drop(columns=['track_id_clean', 'track_id'])
session_with_track_info.head()

In [None]:
# grid_params = {
#     'n_estimators': [10, 50],
#     'max_depth': [3, 4, 6]
# }

# Fixed seed so the forest (and the split below) are reproducible across runs.
rf_model = RandomForestClassifier(random_state=42) #GridSearchCV(RandomForestClassifier(), params, cv = 5)

# Drop the target plus the non-numeric / identifier columns the forest cannot consume.
# NOTE(review): skip_1 / skip_2 / skip_3 are not in this drop list, so they stay in the
# feature matrix; they look like alternative skip labels and may leak the target. Confirm.
features = session_with_track_info.drop(['not_skipped', 'date', 'context_type','hist_user_behavior_reason_start', 'mode', 'hist_user_behavior_reason_end', 'session_id'], axis=1)
# Take the target from the same merged frame as the features. The merge is an inner join,
# so its rows are not guaranteed to match log_mini_no_process one-to-one.
target = session_with_track_info['not_skipped']

x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=42)
rf_model.fit(x_train, y_train)

#### trying with only metadata to predict skip

In [None]:
# Load the track features and derive is_major here, so this cell does not depend on another
# cell having already added that column to `track_features`.
track_features_for_skipped = pd.read_csv('data/track_features/tf_mini.csv')
track_features_for_skipped['is_major'] = (track_features_for_skipped['mode'] == 'major').astype(int)

# One row per log entry (left join keeps every row of skipped_data), with the track
# metadata attached. The join key on the log side and the raw 'mode' string are dropped;
# 'track_id' is kept.
songs_skipping = (
    pd.merge(skipped_data, track_features_for_skipped, how='left',
             left_on='track_id_clean', right_on='track_id')
    .drop(columns=['track_id_clean', 'mode'])
)

In [None]:
# Feature matrix = track metadata only; target = whether the play was not skipped.
features = songs_skipping.drop(columns=['not_skipped', 'track_id']).copy(deep=True)
target = songs_skipping.not_skipped

# Oversample the minority class. `fit_resample` is the current imbalanced-learn API;
# the older `fit_sample` alias was deprecated and then removed.
sm = SMOTE(random_state= 42)
x_resampled, y_resampled = sm.fit_resample(features, target)

# Persist the *resampled* data (previously the un-resampled features/target were dumped),
# so the slow SMOTE step can be skipped by loading this file. The `with` block closes the file.
with open('smote_data.pkl', 'wb') as file:
    pickle.dump([x_resampled, y_resampled], file)

print('I have loaded the smoted data into a pickle file')

In [None]:
# Wrap the resampled matrix in a DataFrame carrying the original feature names.
resampled_columns = features.columns
x_resampled = pd.DataFrame(data=x_resampled, columns=resampled_columns)
x_resampled.head()

In [None]:
# NOTE(review): the data was oversampled with SMOTE before this split, so synthetic rows
# derived from test-set neighbours can end up in the training set; the scores below are
# likely optimistic. Consider splitting first and resampling only the training fold.
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, random_state=42)
songs_rf_model = RandomForestClassifier(random_state=42)
songs_rf_model.fit(x_train, y_train)
predictions = songs_rf_model.predict(x_test)
# scikit-learn metrics take (y_true, y_pred); with the arguments swapped the report's
# precision and recall columns are exchanged.
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

In [None]:
# (importance, feature name) pairs for the metadata-only forest, most important first.
importance_pairs = list(zip(songs_rf_model.feature_importances_, features.columns))
importance_pairs.sort(reverse=True)
importance_pairs

In [None]:
# rf_model.score
predictions = songs_rf_model.predict(x_test)
# scikit-learn metrics take (y_true, y_pred). Swapping them exchanges precision and recall
# in the report and transposes the confusion matrix (rows should be actual classes).
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

#### Additional EDA to understand the data for Analytics v Machine Learning

In [None]:
# i would be interested to compare the songs skipped based on the is_major column
# Sum of not_skipped per is_major group, i.e. how many plays were NOT skipped for each group.
major_df = pd.DataFrame(songs_skipping.groupby('is_major')['not_skipped'].sum())
plt.bar(major_df.index, major_df.not_skipped, color=spotify_color)
# The bars are grouped by is_major, so label them by mode rather than by skip outcome.
plt.title('Number of Not-Skipped Plays by Track Mode')
mode_labels = {0: 'not major', 1: 'major'}
plt.xticks(major_df.index, [mode_labels[flag] for flag in major_df.index])
plt.ylabel('not_skipped count')
plt.savefig('figures/skipped_song_hist.svg', format='svg')
plt.show();

In [None]:
# i'm interested in how the rate of skips changes with each listening hour

# Preview songs_skipping to see which columns are available for that analysis.
songs_skipping.head()

In [None]:
# Date and hour of each logged play, in log order.
session_dates = log_mini_no_process[['track_id_clean', 'date', 'hour_of_day']].copy()

# songs_skipping was built by a left merge from the same log, so it is expected to have one
# row per log entry, in the same order. Merging on track id here would be many-to-many
# (a track is played many times) and would pair every play with every other play's date
# and hour, so attach the columns by position instead.
if len(songs_skipping) != len(session_dates):
    raise ValueError('songs_skipping and the session log have different row counts; '
                     'cannot attach date/hour by position')
songs_dates = pd.concat(
    [songs_skipping.reset_index(drop=True), session_dates.reset_index(drop=True)],
    axis=1,
)

# Sanity check: wherever track metadata was found, both id columns must agree row by row.
has_track = songs_dates['track_id'].notna()
assert (songs_dates.loc[has_track, 'track_id'] == songs_dates.loc[has_track, 'track_id_clean']).all()

songs_dates.head()

In [None]:
# Share of plays that were not skipped, for each hour of the day. A bare groupby object
# computes nothing on its own, so aggregate the target explicitly.
songs_dates.groupby('hour_of_day')['not_skipped'].mean()

In [None]:
# Number of rows in the unprocessed session log.
log_mini_no_process.shape[0]

In [None]:
# Column dtypes of the unprocessed session log.
log_mini_no_process.dtypes

In [None]:
# (importance, feature name) pairs for the metadata-only forest, most important first.
importance_pairs = list(zip(songs_rf_model.feature_importances_, features.columns))
importance_pairs.sort(reverse=True)
importance_pairs

In [None]:
# What this plots: for each position within a session, the total number of plays (summed
# over all sessions) that were not skipped.
# NOTE(review): this is a raw count, not a rate. If sessions differ in length, later
# positions have fewer plays overall, so a falling line does not by itself mean listeners
# skip more later on. Consider .mean() for a rate.
not_skipped_by_position = log_mini_no_process.groupby('session_position')['not_skipped'].sum()
plt.plot(not_skipped_by_position)
plt.title('Not-skipped plays by position in session')
plt.xlabel('session_position')
plt.ylabel('not_skipped count')
plt.xticks(range(1,21));

### Modelling (Jenn Wong original approach)

Recall, we are trying to predict 'not_skipped' aka played the entire song

In [None]:
# Every remaining column of the processed log is a feature; not_skipped is the target.
x_features = log_mini.drop(['not_skipped'], axis = 1)
y_target = log_mini.not_skipped

# from experience running before scaling, scaling doesn't change the metrics but doing it for good measure
# NOTE(review): the scaler is fitted on all rows before the train/test split, so test-set
# statistics influence the scaling. Fit on the training rows only if that matters.
scaler = StandardScaler()
x_transformed = scaler.fit_transform(x_features)
# Keep the original index so the scaled rows stay aligned with y_target by label, not just
# by position.
x_features = pd.DataFrame(x_transformed, columns = x_features.columns, index = x_features.index)
x_features.head()

In [None]:
# sm = SMOTE(random_state=42) #this takes a long time so i would suggest saving to pkl and loading the pkl
# x_res, y_res = sm.fit_sample(x_features, y_target)

# Fixed seed so the split, and every metric computed from it below, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, random_state=42)

In [None]:
# x_train

In [None]:
# Baseline that ignores the features; any real model should beat it.
dummy_classifier = DummyClassifier()
dummy_classifier.fit(x_train, y_train)
# Predict once and reuse the result: some DummyClassifier strategies draw random
# predictions, so two predict calls could score two different prediction vectors.
dummy_predictions = dummy_classifier.predict(x_test)
# scikit-learn metrics take (y_true, y_pred); the confusion matrix rows are actual classes.
print(accuracy_score(y_test, dummy_predictions))
confusion_matrix(y_test, dummy_predictions)
# scaler = StandardScaler()
# x_features = scaler.fit_transform(x_features)

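This is because it's an imbalanced dataset: the dummy classifier's score above reflects the class distribution of `not_skipped`, not any signal learned from the features.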

In [None]:
# Logistic regression on all (scaled) features; report accuracy on the held-out split.
lr_model = LogisticRegression()
# selector = RFE(lr_model, 10) # this takes hella long to run nevermind
lr_model.fit(x_train, y_train)
lr_test_predictions = lr_model.predict(x_test)
# accuracy is symmetric in its two arguments, so the value is the same in either order
accuracy_score(y_test, lr_test_predictions)

In [None]:
# How many test rows the logistic regression assigns to each class.
lr_test_predictions = lr_model.predict(x_test)
Counter(lr_test_predictions)

In [None]:
# Table of logistic-regression coefficients, largest magnitude first.
# Column 0 holds the coefficient, column 1 the feature name, 'coef_abs' the absolute value.
coef_feature_pairs = list(zip(lr_model.coef_[0], x_features.columns))
lr_coef_df = pd.DataFrame(coef_feature_pairs)
lr_coef_df['coef_abs'] = lr_coef_df.iloc[:, 0].abs()
lr_coef_df = lr_coef_df.sort_values('coef_abs', ascending=False)
lr_coef_df

In [None]:
# I would be interested to see how well i can predict if i just had the skip_3 column

In [None]:
# Logistic regression that sees only the skip_3 column; accuracy on the held-out split.
single_feature = ['skip_3']
skip_3_lr = LogisticRegression()
skip_3_lr.fit(x_train[single_feature], y_train)
skip_3_predictions = skip_3_lr.predict(x_test[single_feature])
# accuracy is symmetric in its two arguments, so the value is the same in either order
accuracy_score(y_test, skip_3_predictions)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are actual classes, columns are predictions.
# With the arguments swapped the matrix comes out transposed.
skip_3_predictions = skip_3_lr.predict(x_test[['skip_3']])
confusion_matrix(y_test, skip_3_predictions)
# why is that even with just this one variable, i am able to get 98% accuracy..

# recall that skip_3 indicates that if false, most of the song was played

Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.

Ya know, I never did explore whether premium users are different from non-premium users. I would be interested to see whether clustering is possible.

In [None]:
# i should create a column that tells me how many songs was skipped before the current song in the session