In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVC
from sklearn.cluster import KMeans, MiniBatchKMeans

import datetime


In [2]:
from dask import dataframe as dd
dask_df_list = []
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_0*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_1*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_2*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_3*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_4*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_5*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_6*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_7*.csv')[['session_id','skip_2','track_id_clean']])
dask_df_list.append(dd.read_csv('../data/raw/training_set/log_8*.csv')[['session_id','skip_2','track_id_clean']])

In [3]:
dask_df = dd.concat(dask_df_list)

dask_df

Unnamed: 0_level_0,session_id,skip_2,track_id_clean
npartitions=4735,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,object,bool,object
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [4]:
kmean100_df = pd.read_csv('../data/interim/all_data/mbKMeans100clusters.csv', usecols=['track_id','clus'])
kmean100_df.rename(columns={'track_id':'track_id_clean'}, inplace=True)
kmean100_df.head()

Unnamed: 0,track_id_clean,clus
0,t_2e8f4b71-8a0b-4b9c-b7d8-fb5208e87f9f,94
1,t_dae2ec0e-ec7b-4b3e-b60c-4a884d0eccb0,36
2,t_cf0164dd-1531-4399-bfa6-dec19cd1fedc,28
3,t_0f90acc7-d5c5-4e53-901d-55610fbd090c,4
4,t_36b9ad02-095a-443d-a697-6c7285d9410a,29


In [22]:
dask_df_merged = dask_df.merge(kmean100_df, on=['track_id_clean'])
dask_df_merged

Unnamed: 0_level_0,session_id,skip_2,track_id_clean,clus
npartitions=4735,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,bool,object,int64
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


In [23]:
dask_df_merged.head()

Unnamed: 0,session_id,skip_2,track_id_clean,clus
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
1,0_00079a23-1600-486a-91bd-5208be0c745a,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
2,0_012b0fb4-0cc3-429f-9a78-cc6e622153fb,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
3,0_013cc010-c476-4ad2-8972-73449e0b2ef4,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
4,0_01a5f0dc-9938-48c9-92f1-c7e51f34d290,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66


In [24]:
dask_df_merged.memory_usage()

Dask Series Structure:
npartitions=1
    int64
      ...
dtype: int64
Dask Name: series-groupby-sum-agg, 33825 tasks

In [25]:
dask_df_merged = dask_df_merged.astype({'session_id':'category','track_id_clean':'category'})
dask_df_merged.memory_usage()

Dask Series Structure:
npartitions=1
    int64
      ...
dtype: int64
Dask Name: series-groupby-sum-agg, 38560 tasks

In [29]:
for part in dask_df_merged.partitions[0]:
    print(part)

session_id
skip_2
track_id_clean
clus


In [31]:
dask_df_merged.partitions[0].compute()

Unnamed: 0,session_id,skip_2,track_id_clean,clus
0,0_00006f66-33e5-4de7-a324-2d18e439fc1e,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
1,0_00079a23-1600-486a-91bd-5208be0c745a,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
2,0_012b0fb4-0cc3-429f-9a78-cc6e622153fb,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
3,0_013cc010-c476-4ad2-8972-73449e0b2ef4,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
4,0_01a5f0dc-9938-48c9-92f1-c7e51f34d290,False,t_0479f24c-27d2-46d6-a00c-7ec928f2b539,66
...,...,...,...,...
373897,0_2038af3c-c387-473f-b811-e1403c9020e8,True,t_6d7a161d-8f5b-4341-b6a5-6a61065ff400,36
373898,0_2038af3c-c387-473f-b811-e1403c9020e8,True,t_6d7a161d-8f5b-4341-b6a5-6a61065ff400,36
373899,0_2038ee72-9ee7-4f8b-ac2a-0514ed4a4ff3,True,t_4e85e959-58cb-4bf6-aac1-f0e56689dcb8,56
373900,0_2038ee72-9ee7-4f8b-ac2a-0514ed4a4ff3,True,t_79e052af-b399-4626-a5aa-6f16f544e4bd,6


In [19]:
sessions = dask_df_merged.drop(columns=['skip_2','track_id_clean','clus']).drop_duplicates()

In [17]:
print(sessions.shape)

(Delayed('int-b970d9ec-1786-426e-9645-d3e7b6d8595b'), 1)


In [21]:
dask_df_merged['session_id'].nunique().compute()

  df = reader(bio, **kwargs)


ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+-------------+--------+----------+
| Column      | Found  | Expected |
+-------------+--------+----------+
| hour_of_day | object | int64    |
+-------------+--------+----------+

The following columns also raised exceptions on conversion:

- hour_of_day
  ValueError("invalid literal for int() with base 10: '10\\x012018-08-14'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'hour_of_day': 'object'}

to the call to `read_csv`/`read_table`.

In [19]:
from functions import cal_similarMat

In [None]:
sim_output = cal_similarMat(dask_df_merged)

In [None]:
def cal_similarMat(df_train):
    import numpy as np
    import pandas as pd
    
    
    sessions = list(np.sort(df_train['session_id'].unique())) 
    tracks = list(df_train['clus'].unique()) 
    no_skip_2 = (list(df_train['skip_2']==False))*1 # use *1 to convert bool to integer
    
    DfSessionUnique = []
    DfSessionUnique = pd.DataFrame(sessions,columns=['sessions'])
    
    from scipy import sparse
    from pandas.api.types import CategoricalDtype

    rows = df_train['session_id'].astype(CategoricalDtype(categories=sessions)).cat.codes # unique sessions (index)

    # Get the associated row indices
    cols = df_train['clus'].astype(CategoricalDtype(categories=tracks)).cat.codes # unique tracks (column)
    
    
    # Get the associated column indices
    #Compressed Sparse Row matrix
    listeningSparse = []
    listeningSparse = sparse.csr_matrix((no_skip_2, (rows, cols)), shape=(len(sessions), len(tracks)))
    #csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
    #where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k]. , see https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

    listeningSparse
    #a sparse matrix is not a pandas dataframe, but sparse matrices are efficient for row slicing and fast matrix vector products
    
    
    DataBinary = df_train.copy()
    DataBinary['ListenYes'] = (DataBinary['skip_2'] == False)*1
    
    data2=DataBinary[['session_id','clus','ListenYes']]

    data2['ListenYes'].replace(0, -1, inplace = True)

    data3 = data2.groupby(['session_id', 'clus']).agg({'ListenYes':['sum']})
    data3 = data3.reset_index()
    data3.columns = data3.columns.droplevel(level = 1) # take out the unwanted level
    
    
    DfMatrix = pd.pivot_table(data3, values='ListenYes', index='session_id', columns='clus')

    DfMatrix=DfMatrix.fillna(0) #NaN values need to get replaced by 0, meaning they have not been listened yet.
    
    DfResetted = DfMatrix.reset_index().rename_axis(None, axis=1) 

    DfTracksListen = DfResetted.drop(columns=['session_id'])

    #Normalization
    import numpy as np
    DfTracksListenNorm = DfTracksListen / np.sqrt(np.square(DfTracksListen).sum(axis=0)) 

    #### similarity and correlation
    # Calculating with Vectors to compute Cosine Similarities
    TrackTrackSim = DfTracksListenNorm.transpose().dot(DfTracksListenNorm) 

    #Another approach to the above would be using correlation
    TrackTrackCorr = DfTracksListenNorm.corr()
    
    #Spearman correlation
    TrackTrackSpearCorr = DfTracksListenNorm.corr(method = 'spearman')
    
    #Kendall correlation
    TrackTrackKendCorr = DfTracksListenNorm.corr(method = 'kendall')
    
    from scipy.spatial.distance import cdist
    
    #### distances
    # Euclidean distance
    TrackTrackEuclDist = pd.DataFrame(cdist(DfTracksListenNorm.T,DfTracksListenNorm.T, 'euclidean'), index = TrackTrackSim.index, columns = TrackTrackSim.columns)

    # Squared Euclidean distance
    TrackTrackSqEuclDist = pd.DataFrame(cdist(DfTracksListenNorm.T,DfTracksListenNorm.T, 'sqeuclidean'), index = TrackTrackSim.index, columns = TrackTrackSim.columns)
    
    # Manhattan distance
    TrackTrackManhDist = pd.DataFrame(cdist(DfTracksListenNorm.T,DfTracksListenNorm.T, 'cityblock'), index = TrackTrackSim.index, columns = TrackTrackSim.columns)

    # Canberra distance
    TrackTrackCanbDist = pd.DataFrame(cdist(DfTracksListenNorm.T,DfTracksListenNorm.T, 'canberra'), index = TrackTrackSim.index, columns = TrackTrackSim.columns)

    #### boolean distances
    # Hamming distance
    TrackTrackHammDist = pd.DataFrame(cdist(DfTracksListenNorm.T>0,DfTracksListenNorm.T>0, 'hamming'), index = TrackTrackSim.index, columns = TrackTrackSim.columns)

 

    # Create a place holder matrix for similarities, and fill in the session column
    SessTrackSimilarity = pd.DataFrame(index=DfResetted.index, columns=DfResetted.columns)
    SessTrackSimilarity.iloc[:,:1] = DfResetted.iloc[:,:1]
    SessTrackCorrelation = SessTrackSimilarity.copy()
    SessTrackSpearCorr = SessTrackSimilarity.copy()
    SessTrackKendCorr = SessTrackSimilarity.copy()
    SessTrackEuclDist = SessTrackSimilarity.copy()
    SessTrackSqEuclDist = SessTrackSimilarity.copy()
    SessTrackManhDist = SessTrackSimilarity.copy()
    SessTrackCanbDist = SessTrackSimilarity.copy()
    SessTrackHammDist = SessTrackSimilarity.copy()

    #We now loop through the rows and columns filling in empty spaces with similarity scores.
    
    SessionListening = []
    TrackTopSimilarity = []

    for i in range(0,len(SessTrackSimilarity.index)):
        for j in range(1,len(SessTrackSimilarity.columns)):

            ses = SessTrackSimilarity.index[i]
            tra = SessTrackSimilarity.columns[j]

            SessionListening = DfTracksListen.loc[ses,]
            TrackSimilarity = TrackTrackSim[tra]
            TrackCorrelation = TrackTrackCorr[tra]
            TrackSpearCorr = TrackTrackSpearCorr[tra]
            TrackKendCorr = TrackTrackKendCorr[tra]
            TrackEuclDist = TrackTrackEuclDist[tra]
            TrackSqEuclDist = TrackTrackSqEuclDist[tra]
            TrackManhDist = TrackTrackManhDist[tra]
            TrackCanbDist = TrackTrackCanbDist[tra]
            TrackHammDist = TrackTrackHammDist[tra]

            SessTrackSimilarity.loc[i][j] = sum(SessionListening*TrackSimilarity)/sum(TrackSimilarity)
            SessTrackCorrelation.loc[i][j] = sum(SessionListening*TrackCorrelation)/sum(TrackCorrelation)
            SessTrackSpearCorr.loc[i][j] = sum(SessionListening*TrackSpearCorr)/sum(TrackSpearCorr)
            SessTrackKendCorr.loc[i][j] = sum(SessionListening*TrackKendCorr)/sum(TrackKendCorr)
            SessTrackEuclDist.loc[i][j] = sum(SessionListening*TrackEuclDist)/sum(TrackEuclDist)
            SessTrackSqEuclDist.loc[i][j] = sum(SessionListening*TrackSqEuclDist)/sum(TrackSqEuclDist)
            SessTrackManhDist.loc[i][j] = sum(SessionListening*TrackManhDist)/sum(TrackManhDist)
            SessTrackCanbDist.loc[i][j] = sum(SessionListening*TrackCanbDist)/sum(TrackCanbDist)
            SessTrackHammDist.loc[i][j] = sum(SessionListening*TrackHammDist)/sum(TrackHammDist)

    
    SessTrackSimilarity.set_index('session_id', inplace = True)
    SessTrackCorrelation.set_index('session_id', inplace = True)
    SessTrackSpearCorr.set_index('session_id', inplace = True)
    SessTrackKendCorr.set_index('session_id', inplace = True)
    SessTrackEuclDist.set_index('session_id', inplace = True)
    SessTrackSqEuclDist.set_index('session_id', inplace = True)
    SessTrackManhDist.set_index('session_id', inplace = True)
    SessTrackCanbDist.set_index('session_id', inplace = True)
    SessTrackHammDist.set_index('session_id', inplace = True)
    
    
    sim_output = [SessTrackSimilarity, SessTrackCorrelation, SessTrackSpearCorr, SessTrackKendCorr, SessTrackEuclDist, SessTrackSqEuclDist, SessTrackManhDist, SessTrackCanbDist, SessTrackHammDist]
    
    
    return sim_output