In [19]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.set_theme()
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")
sessions = pd.read_json("data/sessions.jsonl", lines=True).dropna().astype({'user_id': 'int32'})
artists = pd.read_json("data/artists.jsonl", lines=True).dropna()
tracks = pd.read_json("data/tracks.jsonl", lines=True).dropna()
track_storage = pd.read_json("data/track_storage.jsonl", lines=True)
users = pd.read_json("data/users.jsonl", lines=True)

In [2]:
tracks

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,,เพียงก่อนนั้น,12.0,236583,0,5pLJF8oFCs1YpPZaARy1Cz,1986-01-01,0.514,0.2720,4,0.0,-14.678,0.0249,0.732000,0.544000,0.1130,0.1260,90.077,4
1,7GDczVNdqle26Ff2LrzsLr,עתיד מתוק,27.0,223800,0,7iE2GZiF0xVFzlA2E93m3g,1985-11-01,0.654,0.8420,4,0.0,-7.279,0.0360,0.381000,0.011300,0.1300,0.7650,142.966,4
2,4AyUgUtX0nMsBjzZFDWVgM,スピード,28.0,287831,0,2DiecQcRbDuSJuSPKtirrX,1991-02-21,0.452,0.9610,9,1.0,-3.379,0.0704,0.083100,0.000322,0.0851,0.4680,132.790,4
3,4gxYRhp7DeB11eC5VQOJ7w,Wir ham' noch lange nicht genug,57.0,292000,0,,1991,0.555,0.7220,0,1.0,-11.298,0.0271,0.000767,0.000027,0.0813,0.6590,148.105,4
4,78Eqonqp6yxXuYQtnffk0W,Faço Como,43.0,201253,0,7gsMzXapXdZ5XzVPJp4rBd,2018-05-19,0.715,0.4020,0,0.0,-11.527,0.1490,0.350000,0.345000,0.1060,0.2460,183.948,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21603,5FWbKOPrGinhW9TNiyD3Qz,Lorelay,5.0,224760,0,4zaslnpNr10iUFJGx0XqpM,1978,0.264,0.4050,0,,-7.925,0.0295,0.844000,0.000006,0.1150,0.0765,72.999,4
21604,1XiyW1CAoo5jfsiq941XMe,סעידה סולטנה,13.0,280560,0,03IVYRlAC972SOHPb86Usr,1998-01-01,0.849,0.6780,1,,-9.047,0.0515,0.001470,0.114000,0.1560,0.8010,116.081,4
21605,1mlzd3LMQXBTCGTzu24rcO,Zo klein als ik ben,21.0,80093,0,62lP9hgxTVphRNmeREeFdC,1986-10-01,0.680,0.0792,7,,-17.518,0.0815,0.933000,0.000000,0.1040,0.5490,94.748,4
21606,54jLw1x7AdfEhGF7JEcUKE,New Church,23.0,212200,0,5JEhWD9S2znCiQRiGj2OUk,1982-01-01,0.414,0.8930,9,,-5.465,0.0533,0.000183,0.003490,0.1530,0.4070,91.330,4


Because tracks don't have assigned genres I joined genres from artists to tracks
For tracks which have been skipped by user I checked if intersection of track genres and user favourite genres could tell something

In [62]:
track_genre = tracks.join(artists.set_index('id')['genres'], on='id_artist', rsuffix='_author').set_index('id')['genres']
skip_events = sessions[sessions['event_type']=='skip'][['user_id', 'track_id']]
like_events = sessions[sessions['event_type']=='like'][['user_id', 'track_id']]
play_events = sessions[sessions['event_type']=='play'][['user_id', 'track_id']]
play_events = pd.concat([play_events, skip_events]).drop_duplicates(keep=False)

pd.concat([play_events, skip_events]).drop_duplicates(keep=False)

def skip_by_genre(df):
    return df.join(track_genre, on='track_id').dropna().set_index('user_id').join(users.set_index('user_id')['favourite_genres']).dropna()

def n_common_genres(x):
    track_g, fav_g = x['genres'], x['favourite_genres']
    return len(set(fav_g).intersection(set(track_g)))
{
    'skip': skip_by_genre(skip_events).apply(n_common_genres, axis=1).groupby('user_id').sum().agg(['mean', 'std']),

'like': skip_by_genre(like_events).apply(n_common_genres, axis=1).groupby('user_id').sum().agg(['mean', 'std']),
    'play': skip_by_genre(play_events).apply(n_common_genres, axis=1).groupby('user_id').sum().agg(['mean', 'std'])
}



{'skip': mean    0.102564
 std     0.307355
 dtype: float64,
 'like': mean    0.205882
 std     0.478597
 dtype: float64,
 'play': mean    0.428571
 std     0.667827
 dtype: float64}

## Skip and popularity by user

In [72]:
skip_events = sessions[sessions['event_type']=='skip'][['user_id', 'track_id']]
skip_events.join(tracks.set_index('id')['popularity'], on='track_id', how="inner").groupby('user_id').agg({'popularity':['mean', 'std', 'count']})


Unnamed: 0_level_0,popularity,popularity,popularity
Unnamed: 0_level_1,mean,std,count
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
101,34.0,,1
102,14.5,12.020815,2
103,38.333333,18.903263,3
104,39.0,10.392305,5
105,17.0,,1
106,40.0,0.0,2
107,14.0,12.727922,2
108,16.0,12.727922,2
110,26.5,2.12132,2
112,42.5,28.991378,2
