In [89]:
# import standard libraries
import os
import pandas as pd
import sys
import numpy as np
import scipy as sp
import sklearn as sk
import itertools
import math

# import model tools
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# import models objects
from sklearn import tree
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# import viz libraries
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# get slice names
path_stem = "/Users/danielagarcia/OneDrive - Harvard University/Sophomore Year/CS109a/jupyter/cs109a-final-project/Data/data/"
file_names = os.listdir(path_stem)
assert len(file_names) == 1000

In [3]:
# create empty data frame to store concatenated slices
playlist_data_raw = pd.DataFrame()

# combine each slice into one data frame 
for file in file_names:
    data_temp = pd.read_csv(path_stem + file)
    playlist_data_raw = playlist_data_raw.append(data_temp, ignore_index = True)    

In [4]:
# view first five rows of full data frame
display(playlist_data_raw.head())

Unnamed: 0.1,Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,0,2014,False,292000,1509235200,88,58,16,"[{'pos': 0, 'artist_name': 'Ty Dolla $ign', 't...",40,18032849,55,
1,1,Backyard BBQ,False,292001,1502582400,116,109,1,"[{'pos': 0, 'artist_name': 'Matt and Kim', 'tr...",11,25842835,82,
2,2,old school,False,292002,1442534400,103,60,2,"[{'pos': 0, 'artist_name': 'Lil Wayne', 'track...",12,26331174,41,
3,3,Broken,False,292003,1502236800,172,138,1,"[{'pos': 0, 'artist_name': 'Heather Headley', ...",23,46857118,106,
4,4,travel,False,292004,1473811200,37,30,1,"[{'pos': 0, 'artist_name': 'Of Monsters and Me...",4,8241213,28,


In [97]:
# sample data 
playlist_data_sample = playlist_data_raw.sample(10000, random_state = 109, ignore_index = True)
playlist_data_sample.head()

Unnamed: 0.1,Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,798,good songs,False,327798,1508457600,17,16,1,"[{'pos': 0, 'artist_name': 'Rihanna', 'track_u...",5,3825874,15,
1,936,edm,False,626936,1434067200,104,81,1,"[{'pos': 0, 'artist_name': 'deadmau5', 'track_...",2,25658396,59,
2,331,Dance Tunes,False,661331,1450742400,110,98,2,"[{'pos': 0, 'artist_name': 'Rihanna', 'track_u...",65,26698687,81,
3,820,Work,False,626820,1497225600,5,5,1,"[{'pos': 0, 'artist_name': 'Jeezy', 'track_uri...",4,1315638,4,
4,575,throwbacks,False,766575,1485388800,120,94,4,"[{'pos': 0, 'artist_name': 'Rihanna', 'track_u...",5,27613869,58,


In [98]:
# clean data

# initialize popular names, artists, songs
popular_names = ["country", "chill", "rap", "workout", "oldies", "christmas", "rock", "party", "throwback", 
                 "jams", "worship", "summer", "feels", "new", "disney", "lit", "throwbacks", "music", "sleep", "vibes"]

popular_artists = ["Drake", "Kanye West", "Kendrick Lamar", "Rihanna", "The Weeknd", "Eminem", "Ed Sheeran", "Future", 
                   "Justin Bieber", "J. Cole", "Beyoncé", "The Chainsmokers", "Chris Brown", "Calvin Harris", 
                   "Twenty One Pilots", "Lil Uzi Vert", "Post Malone", "Big Sean", "Maroon 5", "JAY Z"]

popular_tracks = ["HUMBLE. by Kendrick Lamar", "One Dance by Drake", "Broccoli (feat. Lil Yachty) by DRAM", 
                  "Closer by The Chainsmokers", "Congratulations by Post Malone", "Caroline by Aminé", 
                  "iSpy (feat. Lil Yachty) by KYLE", "Bad and Boujee (feat. Lil Uzi Vert) by Migos", "Location by Khalid",
                  "XO TOUR Llif3 by Lil Uzi Vert", "Bounce Back by Big Sean", "Ignition - Remix by R. Kelly", 
                  "No Role Modelz by J. Cole", "Mask Off by Future", "No Problem (feat. Lil Wayne & 2 Chainz) by Chance The Rapper",
                  "I'm the One by DJ Khaled", "Jumpman by Drake", "goosebumps by Travis Scott", "Fake Love by Drake",
                  "Despacito - Remix by Luis Fonsi"]

# extract predictor variables that don't need cleaning
playlist_data = playlist_data_sample[["num_tracks", "num_albums", "num_followers", "duration_ms", "num_artists"]].copy()

# initialize binary variables
playlist_data[["collaborative"]] = np.where(playlist_data_sample[["collaborative"]] == True, 1, 0)
playlist_data[["has_description"]] = np.where(playlist_data_sample[["description"]].isna(), 0, 1)

# create popular_name variable
popular_name = []
for playlist_name in playlist_data_sample["name"]:
    popular_name.append(np.where(any([name in playlist_name.lower() for name in popular_names]), 1, 0))
playlist_data[["popular_name"]] = pd.DataFrame(popular_name)

# extract track information
total_popular_tracks = []
total_popular_artists = []
playlist_tracks = playlist_data_sample["tracks"]

# iterate through each playlist
for track_data in playlist_tracks:
    # convert string to code
    track_data = eval(track_data)
    
    # initialize empty data frames to store data for each song in playlist
    pop_tracks = []
    pop_artists = []
    
    # iterate through each song in playlist
    for track_dict in track_data:
        track_name = track_dict["track_name"]
        artist_name = track_dict["artist_name"]
        
        # check if track name is popular
        if track_name + " by " + artist_name in popular_tracks:
            pop_tracks.append(1)
        else:
            pop_tracks.append(0)
        
        # check if artist name is popular
        if artist_name in popular_artists:
            pop_artists.append(1)
        else:
            pop_artists.append(0)
        
    
    # compute sum of popular songs and artists for that playlist
    total_popular_tracks.append(sum(pop_tracks))
    total_popular_artists.append(sum(pop_artists))

# add sums as new columns in data frame  
playlist_data[["total_popular_tracks"]] = pd.DataFrame(total_popular_tracks)
playlist_data[["total_popular_artists"]] = pd.DataFrame(total_popular_artists)


In [99]:
# display cleaned data
display(playlist_data.head())
display(playlist_data.describe())

Unnamed: 0,num_tracks,num_albums,num_followers,duration_ms,num_artists,collaborative,has_description,popular_name,total_popular_tracks,total_popular_artists
0,17,16,1,3825874,15,0,0,0,0,2
1,104,81,1,25658396,59,0,0,0,0,4
2,110,98,2,26698687,81,0,0,0,0,8
3,5,5,1,1315638,4,0,0,0,0,1
4,120,94,4,27613869,58,0,0,1,0,18


Unnamed: 0,num_tracks,num_albums,num_followers,duration_ms,num_artists,collaborative,has_description,popular_name,total_popular_tracks,total_popular_artists
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,66.0394,49.2246,7.2222,15535670.0,37.7263,0.0245,0.0161,0.2661,0.7087,5.6236
std,53.418336,39.731522,464.454494,12818370.0,29.89725,0.154603,0.125866,0.441939,1.873178,11.172216
min,5.0,2.0,1.0,740718.0,3.0,0.0,0.0,0.0,0.0,0.0
25%,26.0,20.0,1.0,5940957.0,16.0,0.0,0.0,0.0,0.0,0.0
50%,49.0,37.0,1.0,11443610.0,29.0,0.0,0.0,0.0,0.0,1.0
75%,91.0,67.0,1.0,21274370.0,51.0,0.0,0.0,1.0,0.0,6.0
max,250.0,233.0,45942.0,119649000.0,213.0,1.0,1.0,1.0,18.0,187.0


In [100]:
# split data into training and test data
X_data = playlist_data[["popular_name", "has_description", "num_artists", "num_albums", "num_tracks", 
                        "duration_ms", "collaborative", "total_popular_tracks", "total_popular_artists"]].copy()
y_data = playlist_data["num_followers"].copy()

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size = 0.8, random_state = 109)