In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

sns.set_theme()
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")
sessions = pd.read_json("data/sessions.jsonl", lines=True).dropna().astype({'user_id': 'int32'})
artists = pd.read_json("data/artists.jsonl", lines=True).dropna()
tracks = pd.read_json("data/tracks.jsonl", lines=True).dropna()
track_storage = pd.read_json("data/track_storage.jsonl", lines=True)
users = pd.read_json("data/users.jsonl", lines=True)

In [2]:
tracks

Unnamed: 0,id,name,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,mode,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,7sj6ynHVC4yVa7xJUYoD97,เพียงก่อนนั้น,12,236583,0,5pLJF8oFCs1YpPZaARy1Cz,1986-01-01,0.514,0.272,4,0.0,-14.678,0.0249,0.732000,0.544000,0.1130,0.126,90.077,4
1,7GDczVNdqle26Ff2LrzsLr,עתיד מתוק,27,223800,0,7iE2GZiF0xVFzlA2E93m3g,1985-11-01,0.654,0.842,4,0.0,-7.279,0.0360,0.381000,0.011300,0.1300,0.765,142.966,4
2,4AyUgUtX0nMsBjzZFDWVgM,スピード,28,287831,0,2DiecQcRbDuSJuSPKtirrX,1991-02-21,0.452,0.961,9,1.0,-3.379,0.0704,0.083100,0.000322,0.0851,0.468,132.790,4
3,4gxYRhp7DeB11eC5VQOJ7w,Wir ham' noch lange nicht genug,57,292000,0,6x9jRPDmA8Ihpw3A9FBowD,1991,0.555,0.722,0,1.0,-11.298,0.0271,0.000767,0.000027,0.0813,0.659,148.105,4
4,78Eqonqp6yxXuYQtnffk0W,Faço Como,43,201253,0,7gsMzXapXdZ5XzVPJp4rBd,2018-05-19,0.715,0.402,0,0.0,-11.527,0.1490,0.350000,0.345000,0.1060,0.246,183.948,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4316,54DeCpYmRbhoiAORhd6r6Q,"當愛變成習慣 - 電影 ""非常偵探"" 歌曲",26,299773,0,1Hu58yHg2CXNfDhlPd7Tdd,1994-01-01,0.618,0.164,4,0.0,-17.472,0.0340,0.426000,0.000001,0.1110,0.157,77.932,4
4317,62SaQu39s3Ce5OJ80i3XXl,Lanternen - Live Version,8,292933,0,6jvJrJNRIBqin5CYaT2cup,1987-11-12,0.346,0.849,8,1.0,-10.527,0.1360,0.040800,0.068600,0.6940,0.193,127.659,4
4318,7aBjLA5CsRufTkt0hGmCA3,El Necio,53,273499,0,4rUyBlggM5tZUH5QZn9ZuO,1992,0.564,0.194,4,0.0,-16.030,0.2320,0.684000,0.000002,0.0857,0.151,161.533,4
4319,6UCUXn2mmXL5i3JUJb5vFE,La Piña Colada,24,210707,0,0R7hVTyBZQ9ApxMtDEAwyL,2002-01-10,0.802,0.876,0,0.0,-5.835,0.0921,0.170000,0.000000,0.0279,0.960,92.999,4


Because tracks don't have assigned genres I joined genres from artists to tracks
For tracks which have been skipped by user I checked if intersection of track genres and user favourite genres could tell something

In [3]:
track_genre = tracks.join(artists.set_index('id')['genres'], on='id_artist', rsuffix='_author').set_index('id')['genres']
skip_events = sessions[sessions['event_type']=='skip'][['user_id', 'track_id']]
like_events = sessions[sessions['event_type']=='like'][['user_id', 'track_id']]
play_events = sessions[sessions['event_type']=='play'][['user_id', 'track_id']]
play_events = pd.concat([play_events, skip_events]).drop_duplicates(keep=False)

pd.concat([play_events, skip_events]).drop_duplicates(keep=False)

def skip_by_genre(df):
    return df.join(track_genre, on='track_id').dropna().set_index('user_id').join(users.set_index('user_id')['favourite_genres']).dropna()

def n_common_genres(x):
    track_g, fav_g = x['genres'], x['favourite_genres']
    return len(set(fav_g).intersection(set(track_g)))
{
    'skip': skip_by_genre(skip_events).apply(n_common_genres, axis=1).groupby('user_id').sum().agg(['mean', 'std']),

'like': skip_by_genre(like_events).apply(n_common_genres, axis=1).groupby('user_id').sum().agg(['mean', 'std']),
    'play': skip_by_genre(play_events).apply(n_common_genres, axis=1).groupby('user_id').sum().agg(['mean', 'std'])
}



{'skip': mean     3.880000
 std     13.225887
 dtype: float64,
 'like': mean    2.520000
 std     7.251291
 dtype: float64,
 'play': mean    4.387755
 std     6.815474
 dtype: float64}

## Skip and popularity by user

In [4]:
skip_events = sessions[sessions['event_type']=='skip'][['user_id', 'track_id']]
skip_events.join(tracks.set_index('id')['popularity'], on='track_id', how="inner").groupby('user_id').agg({'popularity':['mean', 'std', 'count']})


Unnamed: 0_level_0,popularity,popularity,popularity
Unnamed: 0_level_1,mean,std,count
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
101,39.125,14.065535,8
102,26.111111,11.709445,9
103,32.136364,10.534729,22
104,29.189189,17.478301,37
105,76.6,1.074968,10
106,77.384615,2.567931,52
107,40.777778,20.209163,27
108,28.2,16.146207,5
109,76.666667,1.61433,12
110,77.75,3.058945,8
