In [None]:
import pandas as pd
import os
import json
import re
from tqdm import tqdm
import logging
from pprint import PrettyPrinter
from collections import defaultdict
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

## Steps to Reproduce
1. Download all 5 GB of data, or clone the repository at https://github.com/gilpasternak35/SongRecommender.git


2. **If Downloaded**:\
    a. Keep only the first 50,000 playlists  
    b. Place these in a directory called `data`, one level below the repository root
    
    
3. Run this notebook from repository root

# Preprocessing and Data Structure Creation

In [None]:
# Listing directory. os.listdir returns entries in arbitrary order, so the
# listing is sorted to make the load order (and therefore data[i]) reproducible.
files = sorted(os.listdir("./data"))

# Regular expression for desired filenames: anything starting with "mpd"
desired_filename = re.compile("mpd.*")

# Pretty printer instantiation (narrow width so nested records wrap)
pp = PrettyPrinter(width=25)

In [None]:
def dataloader_pipeline(file_list: list, data_dir: str = "./data") -> list:
    """
    Pipeline for loading in data

    Every name in file_list that matches the module-level `desired_filename`
    pattern is opened as JSON and the contents of its "playlists" key are
    appended to the result; names that do not match are skipped.

    @param file_list: A list of file names (relative to data_dir) to load in
    @param data_dir: Directory that holds the files, "./data" by default
    @returns data: A list of playlists from these files
    """
    # Resulting data, one flat list across all files
    data = []

    # Traversing through available datafiles
    print("Starting Dataloading...")
    for file in tqdm(file_list):

        # Skipping anything that is not a desired data file
        if not desired_filename.match(file):
            continue

        # Explicit encoding so decoding does not depend on the platform default
        # (assumes the data files are UTF-8, the JSON interchange encoding)
        with open(os.path.join(data_dir, file), 'r', encoding="utf-8") as file_reader:
            data.extend(json.load(file_reader)["playlists"])

    print("Finished Dataloading...")

    return data

In [None]:
def build_relevant_ds(data: list) -> (dict, dict, dict):
    """
    Preprocesses data, simultaneously building relevant data structures

    Each playlist is treated as one "user", identified by its 'pid'. Every
    track entry of a playlist contributes one item to each structure, so a
    track that occurs twice in the same playlist is recorded twice.

    @param data - a data list of playlist dictionaries to preprocess; each needs
                  a 'pid' and a 'tracks' list whose entries have 'track_name'
                  and 'artist_name'
    @returns tracks per user, users per track, users per artist
             (three defaultdicts mapping a key to a list)
    """
    print("Preprocessing started...")
    tracks_per_user = defaultdict(list)
    users_per_track = defaultdict(list)
    users_per_artist = defaultdict(list)

    # Traversing through data and preprocessing
    for playlist in data:
        user = playlist['pid']
        for entry in playlist['tracks']:
            # Obtaining necessary data
            track_name, artist_name = entry['track_name'], entry['artist_name']

            # Appending data to data structures
            tracks_per_user[user].append(track_name)
            users_per_track[track_name].append(user)
            users_per_artist[artist_name].append(user)

    return tracks_per_user, users_per_track, users_per_artist
            

In [None]:
# Loading in data: one flat list holding the playlists of every matching file in `files`
data = dataloader_pipeline(files)

## Our Data:

In [None]:
# Pretty-printing the playlist at index 1 as an example of a raw record
pp.pprint(data[1])

In [None]:
# Building relevant data structures: track names per playlist id ("user"),
# playlist ids per track name, and playlist ids per artist name
tracks_per_user, users_per_track, users_per_artist = build_relevant_ds(data)

# Exploratory Analysis

In [None]:
# One key per user in tracks_per_user, one key per distinct track in users_per_track
total_users = len(tracks_per_user)
total_tracks = len(users_per_track)
print(
    f"Total Users: {total_users}",
    f"Unique Tracks: {total_tracks}",
    sep='\n',
)

In [None]:
# Summed length of every user's track list gives the number of user-track pairs;
# dividing by the number of users gives the average tracks per user
tracks_nonunique = sum(len(entry) for entry in tracks_per_user.values())

print(
    f"Total User-Track Pairs: {tracks_nonunique}",
    "Average Tracks Per User: " + "%.2f" % (tracks_nonunique/total_users),
    sep='\n',
)

**Meaning**: We have a LOT of data, because users make long playlists. 

We can easily split the data into train, validation, and test sets, and use a smaller portion of it so that models train in a reasonable timeframe.

## Most Popular Songs

In [None]:
# Most popular tracks and artists, ranked by the number of listener entries
popularities = []  # not used in this cell; kept in case another cell relies on it

# Computing number of listeners per track / artist as (count, name) pairs.
# The count is the length of the listener list, so repeated entries count each time.
listeners_per_track = [(len(listeners), track) for track, listeners in users_per_track.items()]
listeners_per_artist = [(len(listeners), artist) for artist, listeners in users_per_artist.items()]

# Top 30 songs (ties on count are broken by name, in reverse order)
print("\033[1m" + "Top 30 songs, by number of listeners, in the dataset: " + "\033[0m" + "\n")
sorted_listeners_per_track = sorted(listeners_per_track, reverse = True)

for counter, (listeners, track) in enumerate(sorted_listeners_per_track[:30], start=1):
    print(f"{counter}. {track}, {listeners} Playlists")

# Top 30 artists
print("\n\n" +  "\033[1m" + "Top 30 artists, by number of listeners, in the dataset: " + "\033[0m" +"\n")
sorted_listeners_per_artist = sorted(listeners_per_artist, reverse = True)

for counter, (listeners, artist) in enumerate(sorted_listeners_per_artist[:30], start=1):
    print(f"{counter}. {artist}, {listeners} Playlists")

## Listeners Distribution

In [None]:
# Keep just the counts from the (count, track) pairs, still in descending order
listeners_only = [count for count, _ in sorted_listeners_per_track]

In [None]:
# Mean, median and standard deviation of the per-track listener counts
print(
    f"Average Listeners Per Track: {np.mean(listeners_only): 4.2f} \n"
    f"Median Listeners Per Track: {np.median(listeners_only): 4.0f} \n"
    f"Deviation: {np.std(listeners_only): 4.2f}"
)


**Note**: Our distribution is heavily skewed to the right. 

Our median song has a single listener, and yet our standard deviation is massive and the average number of listeners per track is significantly above the median. 

It seems most songs have only one listener, and a few songs have thousands. This could present a cold start issue, as well as cause problems with only the most popular songs being recommended. 

We'll have to find a weighting scheme so that the more popular songs don't simply dominate.

**Further evidence of this can be seen below:**

In [None]:
# Bin edges double in width: 0, 10, 20, 40, ..., 2560.
# Tracks with more listeners than the last edge fall outside the bins and are not drawn.
plt.hist(listeners_only, bins = [0] + [5 * (2**i) for i in range(1,10)])
plt.title("Listeners per track")
plt.xlabel("Number of listeners")
plt.ylabel("Number of tracks")
plt.show()

Let's zoom in!

In [None]:
# Unit-width bins with edges 0..9, showing only tracks with at most 9 listeners
plt.hist(listeners_only, bins = [i for i in range(10)])
plt.title("Listeners per track (at most 9 listeners)")
plt.xlabel("Number of listeners")
plt.ylabel("Number of tracks")
plt.show()

It is now clear that a majority of our songs have only a single listener. In fact, approximately 175k of our roughly 330k unique tracks have only a single listener. Fortunately, we have enough data that we can leave out these rare tracks, for which there is only a single entry, if they turn out to be unnecessary.

#### Something must be done about the rare song issue, however. Let's check how many of our songs "Aren't rare"

In [None]:
# A track counts as "non-obscure" when it has at least 5 listeners
non_obscure_song_listens = [count for count in listeners_only if count >= 5]


In [None]:
# Summed listener counts = user-song pairs; list length = number of distinct songs
non_obscure_song_entries = sum(non_obscure_song_listens)
non_obscure_song_num = len(non_obscure_song_listens)
print(
    f"Number of user-song pairs for non-obscure songs: {non_obscure_song_entries}",
    f"Number of unique non-obscure songs: {non_obscure_song_num}",
    sep='\n',
)

As can be seen, we have sufficient non-obscure songs to cast out the obscure ones and train a model considering only reasonably "popular" music. This would fundamentally assume that people do not want to be recommended completely obscure songs, which is an assumption that needs to be considered.

## Are User Distributions Similarly Skewed?

In [None]:
# Playlist length for every user, then a copy ordered from longest to shortest
track_num_per_user = [len(tracks) for tracks in tracks_per_user.values()]
sorted_track_num_per_user = list(track_num_per_user)
sorted_track_num_per_user.sort(reverse=True)

print(
    f"Longest Playlist Length: {max(sorted_track_num_per_user): 4.2f}",
    f"Mean Playlist Length: {np.mean(sorted_track_num_per_user): 4.2f}",
    f"Median Playlist Length {np.median(sorted_track_num_per_user): 4.2f}",
    sep='\n',
)

In [None]:
# Distribution of playlist lengths over all playlists (default binning)
plt.hist(sorted_track_num_per_user)
plt.title("Tracks per playlist")
plt.xlabel("Number of tracks in playlist")
plt.ylabel("Number of playlists")
plt.show()

Let's Zoom in Again!

In [None]:
# Bins of width 3 with edges 3, 6, ..., 57; playlists outside that range are not drawn
plt.hist(sorted_track_num_per_user, bins = [i*3 for i in range(1,20)])
plt.title("Tracks per playlist (3 to 57 tracks)")
plt.xlabel("Number of tracks in playlist")
plt.ylabel("Number of playlists")
plt.show()

Our data per user is also skewed to the right, but it is significantly less skewed than our song data and might be acceptable. It is likely that Spotify cast out playlists longer than 250 songs in the making of this dataset.

## A Quick Nullity Check

Let's Perform a quick check to ensure none of our artists or tracks are null!

In [None]:
def _has_null_key(mapping) -> bool:
    """Return True if any key of mapping is None or a float NaN."""
    # `np.nan in mapping` only finds the np.nan object itself because NaN != NaN,
    # so every key is inspected; `key != key` holds only for NaN values.
    return any(key is None or (isinstance(key, float) and key != key) for key in mapping)


# Ensuring we have no null users, tracks or artists (each entry should be False)
_has_null_key(tracks_per_user), _has_null_key(users_per_track), _has_null_key(users_per_artist)

## Correlational Investigation, Playlist Statistics

Playlist Metadata may be a powerful modeling tool - let's check if there are any factors that correlate with popularity, or number of followers!

In [None]:
# One list per metadata field, each holding that field's value for every playlist
# in the order the playlists appear in data
playlist_pop = [playlist['num_followers'] for playlist in data]
playlist_durations = [playlist['duration_ms'] for playlist in data]
playlist_last_mod = [playlist['modified_at'] for playlist in data]
playlist_edits = [playlist['num_edits'] for playlist in data]
playlist_artist_num = [playlist['num_artists'] for playlist in data]
playlist_track_num = [playlist['num_tracks'] for playlist in data]

In [None]:
# Follower counts of all playlists, ordered from most to least followed
popularities_sorted = sorted(playlist_pop, reverse = True)

In [None]:
# Follower counts of the five most followed playlists, header line first
print(
    "Number of followers for five most popular playlists, ranked: \n",
    popularities_sorted[:5],
    sep="\n",
)

In [None]:
# One row per playlist, one column per metadata field collected above
df = pd.DataFrame({
    "num_followers": playlist_pop,
    "duration_ms": playlist_durations,
    "last_modified": playlist_last_mod,
    "num_edits": playlist_edits,
    "num_artists": playlist_artist_num,
    "num_tracks": playlist_track_num,
})

In [None]:
# Checking for correlations: pairwise correlation matrix of the playlist metadata columns
df.corr()

There don't seem to be many interesting correlations, beyond the fact that longer playlists tend to be edited more and modified more recently. 

It is also possible that there is a nonlinear association within the data that we haven't captured with Pearson correlation. 

**For Better visualization of the correlation, A Heatmap**

In [None]:
# Correlational heatmap of the pairwise correlation matrix, using seaborn's "rocket" colormap
sns.heatmap(df.corr(), cmap = sns.color_palette("rocket", as_cmap = True))

## Do the most Popular Songs Appear more often on the Most Popular Playlists?

In [None]:
tracks_per_playlist = defaultdict(list)

# Appending the track names of every playlist under its pid. Plain loops are used
# because a comprehension run only for its side effects would build a throwaway
# list holding one None per track.
for playlist in data:
    for track in playlist['tracks']:
        tracks_per_playlist[playlist['pid']].append(track['track_name'])
        

In [173]:
num_most_popular = []
most_popular_songs = sorted_listeners_per_track[:50]

# Names of the 50 most popular tracks, as a set for constant-time membership
most_popular_names = {song[1] for song in most_popular_songs}

for playlist in data:
    # Number of distinct top-50 tracks in this playlist: one set intersection
    # replaces 50 linear scans of the playlist's track list
    playlist_tracks = set(tracks_per_playlist[playlist['pid']])
    num_most_popular.append(len(most_popular_names & playlist_tracks))
    

In [177]:
# Checking correlations
# Adds the per-playlist count of most popular tracks as a new column (mutates df in place)
df['num_most_popular'] = num_most_popular
# Correlation of every column with the follower count
df.corr()['num_followers']

num_followers       1.000000
duration_ms         0.003919
last_modified       0.001880
num_edits          -0.001402
num_artists        -0.000225
num_tracks          0.004287
num_most_popular    0.000541
Name: num_followers, dtype: float64

#### The Answer, unfortunately, is that the correlation here is slim. Hence it is likely that playlist popularity is not a particularly good predictor of song popularity.

## Do the most Popular Artists Appear more often on the Most Popular Playlists?

In [178]:
artists_per_playlist = defaultdict(list)

# Appending the artist name of every track under the playlist's pid. Plain loops
# are used because a comprehension run only for its side effects would build a
# throwaway list holding one None per track.
for playlist in data:
    for track in playlist['tracks']:
        artists_per_playlist[playlist['pid']].append(track['artist_name'])
  

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,