# KKBox Music Recommendation

## Import libraries and data

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Load the four KKBox tables used in this notebook (read in the order listed).
train, test, songs, members = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/wsdm-kkbox/train.csv',
        '../input/wsdm-kkbox/test.csv',
        '../input/wsdm-kkbox/songs.csv',
        '../input/wsdm-kkbox/members.csv',
    )
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the first rows of the songs table.
songs.head()

In [None]:
# Preview the first rows of the members table.
members.head()

In [None]:
# Summarise the tables: the shape of `members`, then column dtypes and
# non-null counts for `train`, `songs` and `members`.
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape has to be printed explicitly to be visible.
print(members.shape)
train.info()
print("\n")
songs.info()
print("\n")
members.info()

In [None]:
# Font sizes shared by the matplotlib figures below.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 14, 16, 20

plt.rc('font', size=SMALL_SIZE)                              # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)  # axes title / x and y labels
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=SMALL_SIZE)                 # tick labels on both axes
plt.rc('legend', fontsize=SMALL_SIZE)                        # legend entries
plt.rc('figure', titlesize=BIGGER_SIZE)                      # figure-level title

In [None]:
# Horizontal bar chart of the ten most frequent `source_type` values in the
# training set, saved to source_type.png.
plt.figure(figsize=(10, 8))
# Series.value_counts() sorts by descending frequency, so the first ten index
# entries are the ten most common values.  (The top-level pd.value_counts()
# helper is deprecated in recent pandas releases.)
top_source_types = train['source_type'].value_counts().iloc[:10].index
sns.countplot(y='source_type', data=train, order=top_source_types)
plt.xlabel('count')
plt.ylabel('source types')
plt.title('Count for top 10 source types in training set')
plt.savefig('source_type.png')

In [None]:
# Horizontal bar chart of every `source_system_tab` value in the training set,
# most frequent first, saved to source_system_tab.png.
plt.figure(figsize=(10, 8))
# Series.value_counts() sorts by descending frequency; its index gives the bar
# order.  (The top-level pd.value_counts() helper is deprecated in recent
# pandas releases.)
tab_order = train['source_system_tab'].value_counts().index
sns.countplot(y='source_system_tab', data=train, order=tab_order)
plt.xlabel('count')
plt.ylabel('source system tab')
plt.title('Count for source system tab in training set')
plt.savefig('source_system_tab.png')

In [None]:
# Horizontal bar chart of the ten most frequent `source_screen_name` values in
# the training set, saved to source_screen_name.png.
plt.figure(figsize=(10, 8))
# Series.value_counts() sorts by descending frequency, so the first ten index
# entries are the ten most common values.  (The top-level pd.value_counts()
# helper is deprecated in recent pandas releases.)
top_screen_names = train['source_screen_name'].value_counts().iloc[:10].index
sns.countplot(y='source_screen_name', data=train, order=top_screen_names)
plt.xlabel('count')
plt.ylabel('source screen name')
plt.title('Count for top 10 source screen name in training set')
plt.savefig('source_screen_name.png')

In [None]:
import matplotlib as mpl

# Pie chart of the gender split among members, saved to gender.png.
# NOTE: this changes the global default font size, so it also affects every
# figure drawn after this cell.
mpl.rcParams['font.size'] = 36
plt.figure(figsize = (10, 10))
# value_counts() ignores missing values and orders the result by frequency.
sizes = members['gender'].value_counts()
# Take the wedge labels from the counted values themselves so that each label
# is guaranteed to sit on its own wedge, whatever the frequency order is and
# however many distinct values the column holds.
labels = [str(value).capitalize() for value in sizes.index]
patches, texts, autotexts = plt.pie(sizes,
                                    labels=labels, autopct='%.0f%%',
                                    shadow=False, radius=1,startangle=90)
for label_text in texts:
    label_text.set_fontsize(36)
plt.title("gender distribution",fontsize=40)
plt.savefig('gender.png')


In [None]:

# Pie chart of the target (label) distribution in the training set, saved to
# label.png.
# NOTE: this changes the global default font size, so it also affects every
# figure drawn after this cell.
mpl.rcParams['font.size'] = 36
plt.figure(figsize = (10, 10))
# value_counts() orders the result by frequency.
sizes = train['target'].value_counts()
# Take the wedge labels from the counted values themselves so that each label
# is guaranteed to sit on its own wedge regardless of which class is larger.
labels = [str(value) for value in sizes.index]
patches, texts, autotexts = plt.pie(sizes,
                                    labels=labels, autopct='%.0f%%',
                                    shadow=False, radius=1,startangle=90)
for label_text in texts:
    label_text.set_fontsize(36)
plt.title("label distribution",fontsize=40)
plt.savefig('label.png')


## Data Processing & Feature Engineering

In [None]:
# Add index feature: the position of each row in the training table as loaded.
train['index'] = np.arange(len(train))

# Add timeseries feature: within each user (msno), number the rows 0, 1, 2, ...
# following the order given by `index`.
# NOTE(review): assumes msno has no missing values (groupby leaves rows with a
# missing key out of its groups) - confirm against the data.
train = train.sort_values(by=['msno','index'])
train['timeseries'] = train.groupby('msno').cumcount()

# Take 1 percent of all user activities (frac=0.01).
# NOTE: no random_state is passed, so a different sample is drawn on every run.
train = train.sample(frac=0.01)


# join songs data (left join keeps every sampled training row)
train = pd.merge(train, songs, on='song_id', how='left')
del songs

# join members data (left join keeps every sampled training row)
train = pd.merge(train, members, on='msno', how='left')
del members


In [None]:
# Preview the first rows of the training table after the preceding cell.
train.head()

In [None]:
# Replace NA in the data: text columns get the placeholder 'unknown', every
# remaining column gets 0.  Each filled column is assigned back as a whole;
# chained indexing (train[col][mask] = ...) may write to a temporary copy
# instead of `train`.
for col in train.select_dtypes(include=['object']).columns:
    train[col] = train[col].fillna('unknown')
train = train.fillna(value=0)

# Convert time to datetime.  The values are YYYYMMDD numbers; they are turned
# into digit strings first so the explicit format applies whatever numeric
# dtype the merge produced.  Anything that is not a valid date (such as the 0
# written by the fill above) becomes NaT rather than aborting the conversion
# of the whole column.
for date_col in ('registration_init_time', 'expiration_date'):
    date_text = train[date_col].astype('int64').astype(str)
    train[date_col] = pd.to_datetime(date_text, format='%Y%m%d', errors='coerce')

# Dates to category
for date_col in ('registration_init_time', 'expiration_date'):
    train[date_col] = train[date_col].astype('category')

# Object data to category
for col in train.select_dtypes(include=['object']).columns:
    train[col] = train[col].astype('category')

# Encoding categorical features as integer codes (missing values, including
# NaT dates, are encoded as -1)
for col in train.select_dtypes(include=['category']).columns:
    train[col] = train[col].cat.codes

In [None]:
# Correlation matrix: heat map of the pairwise correlations between the
# columns of `train`, saved to correlation.png.
# The title is spelled with a Latin "C"; a look-alike Cyrillic letter would
# depend on the plotting font having Cyrillic glyphs.
plt.figure(figsize = (10, 7))
sns_plot = sns.heatmap(train.corr())
plt.title('Correlation matrix')
fig = sns_plot.get_figure()
fig.savefig('correlation.png')

In [None]:
# From the correlation matrix we know that lyricist is very similar to composer,
# so we drop the lyricist feature.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument to DataFrame.drop.
train = train.drop(columns=['lyricist'])

## Random Forest Model


In [None]:
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; their contents live in sklearn.model_selection.
from sklearn import model_selection, metrics, ensemble

# Train & Test split: pop the label column out of `train`, then hold out 30%
# of the rows for evaluation.
# NOTE: no random_state is passed, so the split differs between runs.
target = train.pop('target')
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    train, target, test_size=0.3)

# Create Model
random_forest = ensemble.RandomForestClassifier(n_estimators=250, max_depth=25)
random_forest.fit(train_data, train_labels)



In [None]:
# Predicting: class labels from the fitted random forest for the held-out test_data rows
predict_labels = random_forest.predict(test_data)

In [None]:
# RF Feature of Importance Plot: one bar per feature, most important first,
# shown and then written to importance.png.
plt.figure(figsize=(12, 10))
feature_names = train.columns[train.columns != 'target']
train_plot = pd.DataFrame({
    'features': feature_names,
    'importances': random_forest.feature_importances_,
}).sort_values('importances', ascending=False)

sns_plot = sns.barplot(x=train_plot['importances'], y=train_plot['features'])
plt.title('Random Forest Importances of Features Plot')
plt.show()
fig = sns_plot.get_figure()
fig.savefig('importance.png')

In [None]:
from sklearn.metrics import accuracy_score

# Print the evaluation metrics of the random forest on the held-out split:
# overall accuracy first, then the per-class report.
rf_accuracy = accuracy_score(test_labels, predict_labels)
print(rf_accuracy)
rf_report = metrics.classification_report(test_labels, predict_labels)
print(rf_report)

## XGBoost

In [None]:
import xgboost as xgb

# Create model: hyper-parameters are collected in one place, then the
# classifier is fitted on the training split.
xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 18,
    'min_child_weight': 5,
    'n_estimators': 250,
}
xg_boost = xgb.XGBClassifier(**xgb_params)
xg_boost.fit(train_data, train_labels)

In [None]:
# Predicting: class labels from the fitted XGBoost model for the held-out test_data rows
predict_labels2 = xg_boost.predict(test_data)

In [None]:
# XGboost Feature of Importance Plot: one bar per feature, most important
# first, shown and then written to importance_XG.png.
plt.figure(figsize=(12, 10))
feature_names = train.columns[train.columns != 'target']
train_plot = pd.DataFrame({
    'features': feature_names,
    'importances': xg_boost.feature_importances_,
}).sort_values('importances', ascending=False)

sns_plot = sns.barplot(x=train_plot['importances'], y=train_plot['features'])
plt.title('XGboost Importances of Features Plot')
plt.show()
fig = sns_plot.get_figure()
fig.savefig('importance_XG.png')

In [None]:

# Evaluation metrics of the XGBoost predictions on the held-out split:
# overall accuracy first, then the per-class report.
xgb_accuracy = accuracy_score(test_labels, predict_labels2)
print(xgb_accuracy)
xgb_report = metrics.classification_report(test_labels, predict_labels2)
print(xgb_report)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost classifier with a fixed seed, fitted on the training split.
ada_boost = AdaBoostClassifier(
    n_estimators=250,
    random_state=0,
    algorithm='SAMME',
)
ada_boost.fit(train_data, train_labels)

In [None]:
# Predicting: class labels from the fitted AdaBoost model for the held-out test_data rows
predict_labels3 = ada_boost.predict(test_data)

In [None]:
# AdaBoost Feature of Importance Plot: one bar per feature, most important
# first, shown and then written to importance_Ada.png.
plt.figure(figsize=(12, 10))
feature_names = train.columns[train.columns != 'target']
train_plot = pd.DataFrame({
    'features': feature_names,
    'importances': ada_boost.feature_importances_,
}).sort_values('importances', ascending=False)

sns_plot = sns.barplot(x=train_plot['importances'], y=train_plot['features'])
plt.title('Adaboost Importances of Features Plot')
plt.show()
fig = sns_plot.get_figure()
fig.savefig('importance_Ada.png')

In [None]:
# Evaluation metrics of the AdaBoost predictions on the held-out split:
# overall accuracy first, then the per-class report.
ada_accuracy = accuracy_score(test_labels, predict_labels3)
print(ada_accuracy)
ada_report = metrics.classification_report(test_labels, predict_labels3)
print(ada_report)

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting classifier constructed with no arguments (library defaults)
# and fitted on the same training split as the other models.
gradient_boost = GradientBoostingClassifier()
gradient_boost.fit(train_data, train_labels)

In [None]:
# Predicting: class labels from the fitted gradient boosting model for the held-out test_data rows
predict_labels4 = gradient_boost.predict(test_data)

In [None]:
# Gradient Feature of Importance Plot: one bar per feature, most important
# first, shown and then written to importance_Gradient.png.
plt.figure(figsize=(12, 10))
feature_names = train.columns[train.columns != 'target']
train_plot = pd.DataFrame({
    'features': feature_names,
    'importances': gradient_boost.feature_importances_,
}).sort_values('importances', ascending=False)

sns_plot = sns.barplot(x=train_plot['importances'], y=train_plot['features'])
plt.title('Gradientboost Importances of Features Plot')
plt.show()
fig = sns_plot.get_figure()
fig.savefig('importance_Gradient.png')

In [None]:
# Evaluation metrics of the gradient boosting predictions on the held-out
# split: overall accuracy first, then the per-class report.
gb_accuracy = accuracy_score(test_labels, predict_labels4)
print(gb_accuracy)

gb_report = metrics.classification_report(test_labels, predict_labels4)
print(gb_report)

## Model Analysis

In [None]:

# Plot the feature importances of all four fitted models in one graph:
# one line per model, one x position per feature.
feature_names = [str(name) for name in train.columns if name != 'target']
df = pd.DataFrame({
    'features': feature_names,
    'random_forest': random_forest.feature_importances_,
    'xg_boost': xg_boost.feature_importances_,
    'ada_boost': ada_boost.feature_importances_,
    'gradient_boost': gradient_boost.feature_importances_,
})

# Create the figure and take its current axes so pandas draws onto it.
ax = plt.figure(figsize=(16, 10)).gca()

df.plot.line(x='features', ax=ax)
# Label every x position with its feature name, written vertically.
plt.xticks(range(len(df.features)), df.features, rotation='vertical')
plt.title('Importances of Features Plot')
plt.show()

## Collaborative Filtering

In [None]:
import surprise


# SVD collaborative-filtering model; the "rating" is the 0/1 target.
algo = surprise.SVD()
reader = surprise.Reader(rating_scale=(0,1))

# Construct User rating matrix: (user, item, rating) triples from the
# training split.  assign() returns a new frame, so the label column is never
# written onto a slice of train_data.
svd_train_data = train_data[['msno', 'song_id']].assign(target=train_labels)

# Train the model on every triple (no internal validation split)
svd_data = surprise.Dataset.load_from_df(svd_train_data, reader)
svd_data = svd_data.build_full_trainset()

algo.fit(svd_data)


In [None]:

# Make predictions using the trained model
svd_test_data = test_data[['msno', 'song_id']]

# Estimated rating for every (user, song) pair of the held-out split.  The two
# columns are walked in lockstep instead of using iterrows(), which builds a
# Series per row and would rebind the notebook-level name `index`.
svd_predict_est = [
    algo.predict(user_id, song_id).est
    for user_id, song_id in zip(svd_test_data['msno'], svd_test_data['song_id'])
]
# Round each estimate to the nearest class label.
svd_predict_labels = [round(pred) for pred in svd_predict_est]


In [None]:
# Evaluation Metrics of the SVD predictions on the held-out split:
# overall accuracy first, then the per-class report.
svd_accuracy = accuracy_score(test_labels, svd_predict_labels)
print(svd_accuracy)
svd_report = metrics.classification_report(test_labels, svd_predict_labels)
print(svd_report)

## Combined Model

In [None]:

# Class probabilities from the two tree models; column 1 of predict_proba is
# kept as each model's score.
rf_predict_labels = random_forest.predict_proba(test_data)
xg_predict_labels = xg_boost.predict_proba(test_data)
rf_predict_est = list(rf_predict_labels[:, 1])
xg_predict_est = list(xg_predict_labels[:, 1])

n = len(rf_predict_est)


# SVD-XG-RF model: average the three scores per row and round to a label
combined_predict_labels = [
    round((rf_score + xg_score + svd_score) / 3)
    for rf_score, xg_score, svd_score
    in zip(rf_predict_est, xg_predict_est, svd_predict_est)
]

combined_accuracy = accuracy_score(test_labels, combined_predict_labels)
print(combined_accuracy)
combined_report = metrics.classification_report(test_labels, combined_predict_labels)
print(combined_report)


In [None]:
# SVD-XG model: average the XGBoost and SVD scores per row and round to a label
combined_predict_labels2 = [
    round((xg_score + svd_score) / 2)
    for xg_score, svd_score in zip(xg_predict_est, svd_predict_est)
]

combined_accuracy2 = accuracy_score(test_labels, combined_predict_labels2)
print(combined_accuracy2)
combined_report2 = metrics.classification_report(test_labels, combined_predict_labels2)
print(combined_report2)

In [None]:
# SVD-RF model: average the random forest and SVD scores per row and round to a label
combined_predict_labels3 = [
    round((rf_score + svd_score) / 2)
    for rf_score, svd_score in zip(rf_predict_est, svd_predict_est)
]

combined_accuracy3 = accuracy_score(test_labels, combined_predict_labels3)
print(combined_accuracy3)
combined_report3 = metrics.classification_report(test_labels, combined_predict_labels3)
print(combined_report3)