In [1]:
import pandas as pd
import os
import json
import re
from tqdm import tqdm, trange
import logging

from pprint import PrettyPrinter
from collections import defaultdict

## Steps to reproduction
1. Download all 5 GB of data, or clone the repository at https://github.com/gilpasternak35/SongRecommender.git


2. **If Downloaded**:\
    a. keep only first 81,000 playlists  
    b. Place these in a directory called data, one step below current root
    
    
3. Run this notebook from repository root

In [2]:
# Listing directory
files = os.listdir("./data")

# Regular expression for desired filenames
desired_filename = re.compile("mpd.*")

# Pretty printer instantiation
pp = PrettyPrinter(width=25)

In [3]:
def dataloader_pipeline(file_list: list) -> list:
    """
    Pipeline for loading in data
    
    @param file_list: A list of files to load in
    @returns data: A list of playlists from these files
    """
    # Resulting data (hopefully to be stored in list)
    data = []
    
    # Traversing through available datafiles
    print("Starting Dataloading...")
    for file in tqdm(file_list):
        
        # Ensuring filename valid
        if desired_filename.match(file):
            
            # Opening and preprocessing
            with open("./data/" + file, 'r') as file_reader:
                data += json.load(file_reader)["playlists"]
    
    print("Finished Dataloading...")
   
    return data

In [4]:
def build_relevant_ds(data: list) -> (list, dict, dict, dict):
    """
    Preprocesses data, simultaneously building relevant data structures
    
    @param data - a data list of playlist dictionaries to preprocess
    @returns a list of tracks per user, users per track, watered down data list
    """
    
    def process_uri(uri:str):
        """URI Processing method"""
        return uri.split(":")[2]
        
    
    print("Preprocessing started...")
    tracks_per_user, users_per_track, artists_per_user  = defaultdict(list), defaultdict(list), defaultdict(list)
    
    # Traversing through data and preprocessing
    for playlist in data:       
        user = playlist['pid']
        for track in playlist['tracks']:
            # obtaining necessary data
            track, artist, album = track['track_name'], track['artist_name'], track['album_name']
            
            # Appending data to data structures
            tracks_per_user[user].append(track)
            users_per_track[track].append(user)
            artists_per_user[user].append(artist)
            
    return tracks_per_user, users_per_track, artists_per_user
            

In [5]:
# Loading in data
data = dataloader_pipeline(files)

Starting Dataloading...


100%|███████████████████████████████████████████| 81/81 [00:17<00:00,  4.52it/s]

Finished Dataloading...





## Our Data:

In [10]:
pp.pprint(data[2])

{'collaborative': 'false',
 'duration_ms': 15021695,
 'modified_at': 1505433600,
 'name': 'offline ',
 'num_albums': 48,
 'num_artists': 45,
 'num_edits': 16,
 'num_followers': 1,
 'num_tracks': 63,
 'pid': 7002,
 'tracks': [{'album_name': 'Greatest '
                           'Hits '
                           '- '
                           '18 '
                           'Kids',
             'album_uri': 'spotify:album:0xBQEVKKlFKaetqike1qXK',
             'artist_name': 'Keith '
                            'Urban',
             'artist_uri': 'spotify:artist:0u2FHSq3ln94y5Q57xazwf',
             'duration_ms': 231293,
             'pos': 0,
             'track_name': "You'll "
                           'Think '
                           'Of '
                           'Me',
             'track_uri': 'spotify:track:0lZxd99ZIjA0zUdQAY3FXr'},
            {'album_name': 'Me '
                           'And '
                           'My '
                           'Gang',
     

In [7]:
# Building relevant data structures
tracks_per_user, users_per_track, artists_per_user = build_relevant_ds(data)

Preprocessing started...


In [None]:
tracks_per_user[7002]