In [1]:
import pandas as pd
import json
from tqdm.notebook import tqdm
from pandas import DataFrame
import scipy.sparse as sp
from collections import Counter

In [4]:
def get_artists_data(path: str) -> DataFrame:
    df = pd.read_csv(path, sep='\t', header=None)
    
    df['artist_name'] = pd.io.json.json_normalize(df[3].apply(json.loads))['name']  # parse names of artists
    df['artist_name'] = df['artist_name'].str.replace('+', ' ', regex=True)
    df.drop([0, 2, 3, 4], axis=1, inplace=True)  # drop columns with useless data
    df.rename(columns={1: "artist_id"}, inplace=True)
    df.to_pickle('artists_df.pkl')
    
    return df

In [5]:
get_artists_data('ThirtyMusic/entities/persons.idomaar')

  


Unnamed: 0,artist_id,artist_name
0,145148,Everything Is Illuminated
1,297899,Robin O%27Brien
2,250429,Nicholas Gunn (2012)
3,32765,Aspasia Stratigou
4,18689,Allison Veltz
...,...,...
595044,544215,Sanaa Kariakoo
595045,298403,Rock-a-teens
595046,450896,Jennifer Lopez Ft. DJ Mustard
595047,53831,Bobby Sanabria Conducting The Manhattan School...


In [4]:
def get_tracks_data(path: str) -> DataFrame:
    df = pd.read_csv(path, sep='\t', header=None)
    
    df['track_duration'] = pd.io.json.json_normalize(df[3].apply(json.loads))['duration']
    df['playcount'] = pd.io.json.json_normalize(df[3].apply(json.loads))['playcount']
    df['track_name'] = pd.io.json.json_normalize(df[3].apply(json.loads))['name']
    
    artist_ids = []
    for i in range(len(df)):
        row = df.iloc[i, 4]
        a_id = json.loads(row)['artists'][0]['id']
        artist_ids.append(a_id)
    
    df['artist_id'] = artist_ids

    df.rename(columns={1: "track_id"}, inplace=True)
    df = df[['track_id', 'artist_id']]
    df.to_pickle('tracks_df.pkl')
    
    return df

In [6]:
get_tracks_data('ThirtyMusic/entities/tracks.idomaar')

  after removing the cwd from sys.path.
  """
  


Unnamed: 0,track_id,artist_id
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
5675138,5023105,187223
5675139,4588451,28556
5675140,5023106,595140
5675141,5023107,549669


In [7]:
def get_sessions_data(path: str) -> DataFrame:
    df = pd.read_csv(path, sep='\t', header=None)
    
    users = []
    numtracks = []
    playtime_session = []
    track_list = []
    playtime_track_list = []
    
    for i in range(len(df)):
        row = df.iloc[i, 3].split()
        user = json.loads(row[1])['subjects'][0]['id']
        users.append(user)
        nt = json.loads(row[0])['numtracks']
        numtracks.append(nt)
        pt = json.loads(row[0])['playtime']
        playtime_session.append(pt)
        
        tracks = []
        track_playtime = []
        objects = json.loads(row[1])['objects']
        
        for obj in objects:  # O(n^2)
            if obj['playtime'] > 60:
                tr = obj['id']
                tracks.append(tr)
                tr_pt = obj['playtime']
                track_playtime.append(tr_pt)
            
        track_list.append(tracks)
        playtime_track_list.append(track_playtime)
        

    df['user_id'] = users
    df['numtracks'] = numtracks
    df['playtime_session'] = playtime_session
    df['track_list'] = track_list
    df['playtime_list'] = playtime_track_list
    
    df.drop([0, 1, 2, 3], axis=1, inplace=True)
    
    df = (
        df[
            (df.numtracks > 5)
            & (df.playtime_session > 900)
        ]
        .reset_index(drop=True)
        .groupby('user_id').agg({'track_list': 'sum'})
    )
    
    df.to_pickle('sessions_df.pkl')
    
    return df

In [8]:
get_sessions_data('ThirtyMusic/relations/sessions.idomaar')

Unnamed: 0_level_0,track_list
user_id,Unnamed: 1_level_1
1,"[82832, 82973, 82731, 82967, 82782, 82912, 827..."
2,"[3691382, 3691359, 3691357, 3691411, 3691312, ..."
3,"[179531, 3549670, 950182, 2557656, 2557723, 94..."
4,"[2618677, 163458, 2022511, 3701273, 2967724, 2..."
5,"[2819368, 630812, 3681135, 2971568, 2850438, 2..."
...,...
45171,"[2790987, 3527319, 3737643, 1795988, 3007755, ..."
45172,"[3424843, 3098731, 3296368, 969116, 3608040, 1..."
45173,"[3779408, 2637220, 2637279, 2637127, 2637054, ..."
45174,"[3482931, 1772548, 2762092, 3866638, 3866644, ..."


In [9]:
sessions_df = pd.read_pickle('sessions_df.pkl')

In [10]:
tracks_df = pd.read_csv('tracks.csv')
tracks = tracks_df['track_id']
artists = tracks_df['artist_id']
d = dict(zip(tracks, artists))

def track_id_to_artist_id_mapping(track_list):
    '''
    Track list to artist list mapping
    
    Returns dict with artist listining counts
    '''
    
    artist_list = []
    for track in track_list:
        try:
            artist = d[track]
            artist_list.append(artist)
        except KeyError:
            pass
    
    artist_counter = dict(Counter(artist_list))
    
    return artist_counter

In [11]:
sessions_df['artist_counts'] = sessions_df['track_list'].apply(lambda x: track_id_to_artist_id_mapping(x))

In [12]:
sessions_df

Unnamed: 0_level_0,track_list,artist_counts
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[82832, 82973, 82731, 82967, 82782, 82912, 827...","{11467: 53, 370733: 13, 303270: 53, 236243: 1,..."
2,"[3691382, 3691359, 3691357, 3691411, 3691312, ...","{459046: 118, 294948: 34, 375103: 1, 11606: 1,..."
3,"[179531, 3549670, 950182, 2557656, 2557723, 94...","{23280: 13, 443880: 3, 121274: 39, 320798: 50,..."
4,"[2618677, 163458, 2022511, 3701273, 2967724, 2...","{328047: 1, 21422: 1, 253041: 1, 460501: 1, 36..."
5,"[2819368, 630812, 3681135, 2971568, 2850438, 2...","{352379: 1, 76355: 4, 458023: 2, 368160: 3, 35..."
...,...,...
45171,"[2790987, 3527319, 3737643, 1795988, 3007755, ...","{349824: 1, 440873: 1, 465400: 1, 225435: 1, 3..."
45172,"[3424843, 3098731, 3296368, 969116, 3608040, 1...","{427743: 1, 247367: 1, 408513: 1, 123799: 1, 4..."
45173,"[3779408, 2637220, 2637279, 2637127, 2637054, ...","{83045: 2, 330575: 16, 27224: 2, 231700: 2, 32..."
45174,"[3482931, 1772548, 2762092, 3866638, 3866644, ...","{435185: 1, 222396: 1, 346433: 2, 273829: 2, 4..."


In [13]:
sessions_df.to_pickle('sessions_counts.pkl')