In [1]:
from evaluation.EvalRSRunner import ChallengeDataset

import gc
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [2]:
# Instantiate the challenge dataset wrapper. Per the recorded output below, construction
# loads the (already downloaded) LFM dataset, generates the train/test split and dataset hashes.
dataset = ChallengeDataset()

LFM dataset already downloaded. Skipping download.
Loading dataset.
Generating Train/Test Split.
Generating dataset hashes.


In [3]:
# Fetch the train and test frames from the dataset, then display both shapes as the cell output.
train, test = dataset.get_sample_train_test()
train.shape, test.shape

((6869679, 6), (29722, 3))

In [4]:
# Overview of the training frame: index range, column dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6869679 entries, 0 to 37926426
Data columns (total 6 columns):
 #   Column            Dtype
---  ------            -----
 0   user_id           int64
 1   artist_id         int64
 2   album_id          int64
 3   track_id          int64
 4   timestamp         int64
 5   user_track_count  int64
dtypes: int64(6)
memory usage: 366.9 MB


In [5]:
# Overview of the test frame: index range, non-null counts, column dtypes and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29722 entries, 37926134 to 211
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   user_id   29722 non-null  int64
 1   track_id  29722 non-null  int64
 2   fold      29722 non-null  int64
dtypes: int64(3)
memory usage: 928.8 KB


In [6]:
# Row-level working frame: the four needed `train` columns, ordered by timestamp,
# with the per-user attributes from `dataset.df_users` attached via a left join on user_id.
df = (
    train[['user_id', 'track_id', 'timestamp', 'user_track_count']]
    .sort_values('timestamp')
    .join(dataset.df_users, on='user_id', how='left')
)

In [7]:
# Preview the first rows of the timestamp-sorted frame with the joined user attributes.
df.head()

Unnamed: 0,user_id,track_id,timestamp,user_track_count,country,age,gender,playcount,registered_unixtime,country_id,...,relative_le_per_hour14,relative_le_per_hour15,relative_le_per_hour16,relative_le_per_hour17,relative_le_per_hour18,relative_le_per_hour19,relative_le_per_hour20,relative_le_per_hour21,relative_le_per_hour22,relative_le_per_hour23
37508031,2405701,1147113,1113164704,1,,-1,,230,1113151438,-1,...,0.0296,0.0329,0.0197,,0.0428,0.0461,0.0691,0.0329,0.1283,0.0921
37508030,2405701,10722,1113164995,2,,-1,,230,1113151438,-1,...,0.0296,0.0329,0.0197,,0.0428,0.0461,0.0691,0.0329,0.1283,0.0921
37508029,2405701,171719,1113165276,3,,-1,,230,1113151438,-1,...,0.0296,0.0329,0.0197,,0.0428,0.0461,0.0691,0.0329,0.1283,0.0921
37508028,2405701,93031,1113166830,2,,-1,,230,1113151438,-1,...,0.0296,0.0329,0.0197,,0.0428,0.0461,0.0691,0.0329,0.1283,0.0921
37508027,2405701,12590028,1113167111,1,,-1,,230,1113151438,-1,...,0.0296,0.0329,0.0197,,0.0428,0.0461,0.0691,0.0329,0.1283,0.0921


User segments by playcount, gender, and user_track_count

In [8]:
def _tracks_by_user(rows):
    """Collect the track ids of `rows` into one list per user, keeping the rows' order."""
    return rows.groupby(['user_id'], sort=False)['track_id'].agg(list)


# Playcount separation: <= 10, (10, 100], (100, 1000], > 1000.
# Rows whose playcount is missing match none of these masks.
_playcount = df['playcount']
pc_1 = _tracks_by_user(df[_playcount <= 10])
pc_2 = _tracks_by_user(df[(_playcount > 10) & (_playcount <= 100)])
pc_3 = _tracks_by_user(df[(_playcount > 100) & (_playcount <= 1000)])
pc_4 = _tracks_by_user(df[_playcount > 1000])


# Gender separation: 'm', 'f', and everything else (missing values included).
_is_male = df['gender'] == 'm'
_is_female = df['gender'] == 'f'
p_m = _tracks_by_user(df[_is_male])
p_f = _tracks_by_user(df[_is_female])
p_n = _tracks_by_user(df[~(_is_male | _is_female)])


# user_track_count separation.
# NOTE: from here on `df` is rebound from the row-level frame to a per-user frame
# (index: user_id; columns: track_id list, summed user_track_count, user attributes).
# Re-running this cell without first rebuilding the row-level `df` would therefore
# operate on the aggregated frame instead.
df_trackid = _tracks_by_user(df)
_count_per_user = df.groupby('user_id', as_index=True, sort=False)[['user_track_count']].sum()
df = (
    df_trackid.to_frame()
    .join(_count_per_user, on='user_id', how='left')
    .join(dataset.df_users, on='user_id', how='left')
)

# Segments on the per-user summed count: <= 100, (100, 1000], > 1000.
_user_total = df['user_track_count']
tc_1 = df.loc[_user_total <= 100, 'track_id']
tc_2 = df.loc[(_user_total > 100) & (_user_total <= 1000), 'track_id']
tc_3 = df.loc[_user_total > 1000, 'track_id']

In [9]:
# Preview the per-user track-id lists in the lowest segment (summed user_track_count <= 100).
tc_1.head()

user_id
2787521    [175341, 29447816, 220796, 29447932, 28047, 53...
2687521    [161560, 21770, 35285, 16972, 30128456, 208697...
3736127    [162691, 10627, 2309458, 115972, 4192625, 8903...
3734112    [23819431, 23649011, 1371578, 24339267, 449417...
5788356    [1633241, 11131, 10114, 18333, 17802570, 17388...
Name: track_id, dtype: object

In [10]:
def calculate_distinct_tracks(playcount_groups):
    """
    Count the unique track IDs found in every group.

    :param playcount_groups: Dictionary mapping a group name to a list of track ID lists
    :return: Dictionary mapping each group name to its number of distinct track IDs
    """
    counts = {}
    for name, per_user_lists in playcount_groups.items():
        # Accumulate every track ID of the group into one set; duplicates collapse.
        unique_ids = set()
        for ids in per_user_lists:
            unique_ids.update(ids)
        counts[name] = len(unique_ids)
    return counts

# Segment name -> per-user Series of track-id lists, in the order the counts are reported.
_segments = {
    "pc_1": pc_1,
    "pc_2": pc_2,
    "pc_3": pc_3,
    "pc_4": pc_4,
    "tc_1": tc_1,
    "tc_2": tc_2,
    "tc_3": tc_3,
    "p_m": p_m,
    "p_f": p_f,
    "p_n": p_n,
}

# Convert every Series into a plain list of track-id lists.
groups = {label: series.values.tolist() for label, series in _segments.items()}

distinct_track_counts = calculate_distinct_tracks(groups)
distinct_track_counts


{'pc_1': 2217,
 'pc_2': 42889,
 'pc_3': 213735,
 'pc_4': 227095,
 'tc_1': 67572,
 'tc_2': 228044,
 'tc_3': 195559,
 'p_m': 222219,
 'p_f': 193310,
 'p_n': 226649}