### Creating a Neural Network to Predict Songs 

The goal of this notebook is to show an entire process of creting a variety of neural networks to 
predict the playlist a song should belong to 

The libraries we're using - the spotipy library is the most important, though it is very posible, and fairly easy 
to make a library using requests to do the exact same process 

In [1]:
import spotipy 
import spotipy.util as util 
import time 
import json 
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
import requests
from spotipy.oauth2 import SpotifyClientCredentials 

You should save your parameters in your own json file for ease of use - this will make it so that your entire project is somewhat abstracted and also to protect you from privacy concerns; if you are uploading this to github you can just the parameter file to the .gitignore and don't have to worry about anyone seeing your keys

In [2]:
with open('./parameters.json') as f: 
    parameters = json.load(f)

In [3]:
print("The Various Keys in the Parameter Json")
for a in parameters.keys(): 
    print(a)

The Various Keys in the Parameter Json
username
spotify_CLIENT_ID
spotify_CLIENT_SECRET
genius_CLIENT_ID
genius_CLIENT_SECRET
genius_TOKEN
scope
redirect_uri


This function generate the spotipy token for us, and allows us to call it whenever we need so that it will refresh 

In [4]:
def generate_host():
    token = util.prompt_for_user_token(parameters["username"], parameters["scope"], parameters["spotify_CLIENT_ID"], parameters["spotify_CLIENT_SECRET"], parameters["redirect_uri"])
    if token:
        sp = spotipy.Spotify(auth=token)
        print('Generated a Spotify Class Instance')
    else:
        raise ValueError('enter valid credentials')
    return sp


In [5]:
sp = generate_host()

Generated a Spotify Class Instance


#### These are not mine
I adapted these from an online source and it uses the requests module to send requests to the Genius API 

In [6]:
def request_song_info(song_title, artist_name, token):
    #the first part of starting the Genius API search form (modified from William Soares article)
    #Arguments:
        #song_title = String, title of the song as close as it is written in general context
        #artist_name = String, name of the artist as close as it is written in general context
        #token = Genius Token for access to its  API
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + token}
    search_url = base_url + '/search'
    data = {'q': song_title + ' ' + artist_name}
    response = requests.get(search_url, data=data, headers=headers)
    return response

def helper_lyric_retrieval(song_title, artist_name, token):
    #helper function to get the lyrics for a song of of Genius
    #Arguments:
        #song_title = String, title of the song as close as it is written in general context
        #artist_name = String, name of the artist as close as it is written in general context
        #token = Genius Token for access to its API
    response = request_song_info(song_title, artist_name, token)
    json = response.json()
    for hit in json['response']['hits']:
        if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
            remote_song_info = hit
            return remote_song_info

def lyric_retrieval(song_title, artist_name, token):
    #function to get the lyrics for a song off of Genius utilizing a search query (needs internet connection)
    song_info = helper_lyric_retrieval(song_title, artist_name, token)
    if song_info:
        url = song_info['result']['url']
        page = requests.get(url)
        html_object = BeautifulSoup(page.text, 'html.parser')
        lyrics = html_object.find('div', class_ = 'lyrics').get_text()
        return lyrics
    return None



In [7]:
user = sp.current_user()
playlists = sp.user_playlists(user["id"])

This code block below gives us all the playlists we have access to - I have quite a bit which is why this makes it interesting to work with, but it doesn't necessarily mean we have to use all of these. 

In [8]:
all_playlists = {}

while True: 
    items = playlists['items']
    for a in items: 
        all_playlists[a['name']] = a['id']
    if not playlists['next']: 
        break
    playlists = sp.next(playlists)

In [9]:
all_playlists

{'End of Year Video': '5VRmO8IN5QvTPsIfHgmdoP',
 '𝚃𝙾𝙿 𝟷𝟻': '4n8tbjdSaAut9uKysy1tH2',
 'Close Songs Khairiyat': '4uLw9OR2FrOJkh5xlt8sIw',
 'Unsorted': '3RwdN9FQ4ibejGQw7uuB9j',
 '1.0 ': '3KxKBoU0GricuPaXgqngHr',
 '1.5 ': '6nayGqX29DlL1t1RDs7L1p',
 '2.0 ': '6MCpOTOkSVa4IRm5UgMCfR',
 '3.0': '4Vum8sy9ZX6tad7yrSCxZl',
 '3.5': '52KfcqyvkLcpryXEVVmWA7',
 '4.0': '10sKUQMFwjlQifuLpsewBm',
 '5.0 ': '04W6n8MzTvAQpFY3Ba3Ai4',
 '6.0 ': '4vwCEFVoreoLEIuXJZCVn8',
 '6.9 ': '56O3FRaHtG3cN2rAL6CXtp',
 '7.0': '6ge5MUGdCg4js0cXAE6l4Q',
 '8.0': '3xjgqEUm4UDiwhNQJRXWg0',
 '9.0': '1k1iL5jtT3B2sjWa92H8Dv',
 '10.0': '1B5N9daaStOuqzdm4ZxOk9',
 '11.0': '7xk3j830boikaQ2Vhhqz4Z',
 '12.0': '5FoejgcDu6LQGa08jGI1TK',
 '13.0': '3KwZfKYcR5tzAeZgdt5vjy',
 '14.0': '10UoDhrksLkQ68kkm3LYx5',
 '15.0': '7kuTQNJnePXZ9pQaI4Fheq',
 '16.0': '5bwYi9DQOoTQTLa82IzVUx',
 '17.0': '7gPp8q9qhQWelXhL9MN6IK',
 '18.0 ': '22szdfMmNXfiypWduxPRDc',
 '19.0': '6q7mrfToND7mw348ROJrh6',
 '20.0': '2bVSr0azDr3gbXhsWHAeQE',
 'Songs that make ｄｅｓｉｓ 

In [10]:
def add_playlist(playlist_name, playlist_id): 
    tracks = sp.user_playlist(user['id'],playlist_id )['tracks']
    all_tracks = []
    while True: 
        for song in tracks["items"]:
            song_id = song["track"]["id"]
            if song_id: 
                new_list = []
                new_list.append(song_id)
                new_list.append(song["track"]["name"])
                temp = sp.track(song_id)
                new_list.append(temp["popularity"])
                new_list.append(temp["artists"][0]["name"])
                new_list.append(temp["explicit"])
                new_list.append(temp["album"]["release_date"])
                audio_features = sp.audio_features(song["track"]["id"])[0]
                new_list.append(audio_features["danceability"])
                new_list.append(audio_features["energy"])
                new_list.append(audio_features["loudness"])
                new_list.append(audio_features["mode"])
                new_list.append(audio_features["speechiness"])
                new_list.append(audio_features["acousticness"])
                new_list.append(audio_features["instrumentalness"])
                new_list.append(audio_features["liveness"])
                new_list.append(audio_features["valence"])
                new_list.append(audio_features["tempo"])
                new_list.append(sp.artist(temp["artists"][0]["uri"])["genres"])
                new_list.append(playlist_name)
            all_tracks.append(new_list)
        if not tracks['next']: 
            break 
        else: 
            tracks = sp.next(tracks)
    print(f'Finished{playlist_name}')
    return all_tracks


In [11]:
listsongs = [[
    "id",
    "name",
    "popularity",
    "artist_name",
    "explicit",
    "release date",
    "danceability",
    "energy",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "genres", 
    "playlist name"
]]

for name, ids in all_playlists.items(): 
    listsongs.extend(add_playlist(name, ids))
    print(len(listsongs))

FinishedEnd of Year Video
9
Finished𝚃𝙾𝙿 𝟷𝟻
25
FinishedClose Songs Khairiyat
42
FinishedUnsorted
181
Finished1.0 
239
Finished1.5 
257
Finished2.0 
311
Finished3.0
344
Finished3.5
368
Finished4.0
422
Finished5.0 
471
Finished6.0 
529
Finished6.9 
567
Finished7.0
631
Finished8.0
670
Finished9.0
716
Finished10.0
739
Finished11.0
768
Finished12.0
797
Finished13.0
800
Finished14.0
824
Finished15.0
839
Finished16.0
854
Finished17.0
886
Finished18.0 
937
Finished19.0
1168
Finished20.0
1197
FinishedSongs that make ｄｅｓｉｓ go crazy
1334
Finished𝒟𝒾𝓈𝓃ℯ𝓎
1398
Finished𝚘𝚕𝚍𝚒𝚎𝚜 𝚋𝚞𝚝 𝚐𝚘𝚕𝚍𝚒𝚎𝚜
1498
FinishedSoft Throwbacks
1499
FinishedＥＴＥＲＮＡＬ  Bops
1650
Finished𝔽𝕦𝕟𝕟𝕪 𝕒𝕤 𝔽𝕦𝕔𝕜
1695
Finished🅑🅡🅞🅐🅓🅦🅐🅨
1728
Finishedˡᵒʷᵏᵉʸ RAP 
1786
Finished𝓕𝓤𝓝 𝓽𝓲𝓶𝓮𝓼 
1853
Finished𝒟𝓇𝒶𝓂𝒶𝓉𝒾𝒸  Sing-Alongs
1961
FinishedLiked from Radio
2007
FinishedƎuןıƃɥʇǝuıuƃ music
2131
Finished𝐇𝐘𝐏𝐄
2368
Finishedᵘᵖˡᶦᶠᵗᶦⁿᵍ
2452
Finisheds͛t͛a͛t͛e͛ ͛o͛f͛ ͛m͛i͛n͛d͛ ͛
2532
Finishedᴍᴇʟʟᴏᴡ ᴠɪʙᴇs
2763
Finished𝔹𝕆ℙℙ𝕐 𝓋𝒾𝒷ℯ𝓈
3021
Finished𝑣𝑖𝑏𝑒
3238
FinishedＭＥＬＯＤ

ReadTimeout: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)

In [12]:
song_df = pd.DataFrame(listsongs[1:], columns = listsongs[0])
song_df.to_csv("outputs.csv")

### Need to Perform Some Data Cleaning

In [13]:
song_df["explicit"] = song_df["explicit"].astype(int)

In [14]:
all_genres = {} 
for genre_entry in song_df["genres"]: 
    for entry in genre_entry: 
        if entry not in all_genres.keys(): 
            all_genres[entry] = 0 
        else: 
            all_genres[entry] += 1

In [15]:
sorted_genres = sorted(all_genres.keys(), key = lambda x: all_genres[x], reverse = True)
print(f"Total Number of Genres is {len(sorted_genres)}")
for a in sorted_genres[: 50]:
    print(f"Genre {a} has  {all_genres[a]} songs that belong to it.")

Total Number of Genres is 324
Genre pop rap has  1732 songs that belong to it.
Genre rap has  1551 songs that belong to it.
Genre pop has  1341 songs that belong to it.
Genre hip hop has  1081 songs that belong to it.
Genre dance pop has  620 songs that belong to it.
Genre trap has  407 songs that belong to it.
Genre conscious hip hop has  345 songs that belong to it.
Genre southern hip hop has  325 songs that belong to it.
Genre indie pop rap has  325 songs that belong to it.
Genre post-teen pop has  304 songs that belong to it.
Genre pop rock has  224 songs that belong to it.
Genre modern rock has  214 songs that belong to it.
Genre desi pop has  210 songs that belong to it.
Genre atl hip hop has  201 songs that belong to it.
Genre modern bollywood has  198 songs that belong to it.
Genre rock has  197 songs that belong to it.
Genre desi hip hop has  184 songs that belong to it.
Genre r&b has  184 songs that belong to it.
Genre melodic rap has  178 songs that belong to it.
Genre deep 

In [16]:
def in_or_nah(genres, entry): 
    if entry in genres: 
        return 1
    else: 
        return 0 

In [17]:
for entry in sorted_genres: 
    song_df[entry] = song_df["genres"].apply(lambda x: in_or_nah(x, entry))

In [18]:
song_df

Unnamed: 0,id,name,popularity,artist_name,explicit,release date,danceability,energy,loudness,mode,...,basshall,dutch hip hop,dutch pop,dutch trap pop,deep southern trap,deep new americana,new americana,melancholia,puerto rican pop,melodipop
0,3MXCnmak0GBBduWbahRY8G,This Is For Rachel,47,Gxrrixon Productions,1,2020-01-16,0.900,0.461,-14.453,1,...,0,0,0,0,0,0,0,0,0,0
1,5gQcxYXqnofyocKgEbGYKt,Bounce Out With That,72,YBN Nahmir,1,2018-09-07,0.864,0.664,-7.315,1,...,0,0,0,0,0,0,0,0,0,0
2,5DI9jxTHrEiFAhStG7VA8E,Started From the Bottom,66,Drake,1,2013-01-01,0.794,0.522,-7.829,1,...,0,0,0,0,0,0,0,0,0,0
3,1R7ChEm1x3mGhDWXKnPSXn,"Daaru Desi (From ""Cocktail"")",38,Benny Dayal,0,2012-06-12,0.703,0.786,-6.779,0,...,0,0,0,0,0,0,0,0,0,0
4,1R7ChEm1x3mGhDWXKnPSXn,"Daaru Desi (From ""Cocktail"")",38,Benny Dayal,0,2012-06-12,0.703,0.786,-6.779,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4301,26h6qDR3KtieqIWgnveqmC,Over And Over Again,56,Nathan Sykes,0,2016-11-04,0.463,0.245,-9.438,1,...,0,0,0,0,0,0,0,0,0,0
4302,6FRLCMO5TUHTexlWo8ym1W,Girls Like You (feat. Cardi B),78,Maroon 5,1,2018-05-30,0.851,0.541,-6.825,1,...,0,0,0,0,0,0,0,0,0,0
4303,6Umac95Mt46VcwAM9s9mOa,To the Moon,65,Phora,0,2017-08-18,0.540,0.572,-8.665,1,...,0,0,0,0,0,0,0,0,0,0
4304,2v8YyHvDPBfydhVOTvuHl9,I Think I Love You,0,Phora,1,2016-02-15,0.544,0.446,-11.735,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
import torch

In [20]:
non_genres_df = song_df.iloc[:, :18]
non_genres_floats = non_genres_df.drop(["id", "name", "artist_name", "release date", "genres"], axis = 1)

In [21]:
playlist_mapping = {}
index = 0
for a in non_genres_floats["playlist name"]: 
    if a not in playlist_mapping.keys(): 
        playlist_mapping[a] = index 
        index += 1
        
reverse_playlist_mapping = {value: key for key, value in playlist_mapping.items()}
non_genres_floats['playlist name'] = non_genres_floats["playlist name"].replace(playlist_mapping) 
non_genres_floats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

def clean_give_data(dataframe): 
    X = np.array(dataframe.iloc[ : , :-1])
    Y = np.array(dataframe.iloc[:, -1: ])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = .1)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = .1)
    return x_train, x_test, x_val, y_train, y_test, y_val 
    
x_train, x_test, x_val, y_train, y_test, y_val = clean_give_data(non_genres_floats)


FEATURES_NUM = len(x_train[0])
CLASS_NUM = len(playlist_mapping.keys())


def multi_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    
    
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    
    acc = torch.round(acc) * 100
    
    return acc



In [22]:
y_train.shape

(3487, 1)

### First Model 
For this model we're just going to be using a few of the basic features of the songs 

In [23]:
class ClassifierFirstDataset(torch.utils.data.Dataset): 
    
    def __init__(self, X_data, y_data): 
        self.X = torch.tensor(X_data, dtype = torch.float)
        self.Y = torch.reshape(torch.tensor(y_data, dtype = torch.long), (-1,))
        
    def __getitem__(self, index): 
        return self.X[index], self.Y[index]
    
    def __len__(self): 
        return len(self.X) 
    





class SongClassifierBeg(torch.nn.Module): 
    
    def __init__(self, num_feature, num_class): 
        super(SongClassifierBeg, self).__init__() 
        
        self.layer1 = torch.nn.Linear(num_feature, 64) 
        self.layer2 = torch.nn.Linear(64, num_class) 
        
        self.relu = torch.nn.ReLU() 
        
    def forward(self, x): 
        x = self.layer1(x) 
        x = self.relu(x) 
        x = self.layer2(x) 
        
        return x 

train_data = ClassifierFirstDataset(x_train, y_train) 
val_data = ClassifierFirstDataset(x_val, y_val) 
test_data = ClassifierFirstDataset(x_test, y_test) 

train_loader = torch.utils.data.DataLoader(dataset = train_data, batch_size = 60)
val_loader = torch.utils.data.DataLoader(dataset = val_data, batch_size = 1) 
test_loader = torch.utils.data.DataLoader(dataset = test_data, batch_size = 1)


    
parameters_beg = {
    "FEATURES_NUM": len(x_train[0]), 
    "CLASS_NUM": len(playlist_mapping.keys()), 
    "EPOCHS": 600, 
    "lr": .0001
}
    
    
model = SongClassifierBeg(parameters_beg["FEATURES_NUM"], parameters_beg["CLASS_NUM"])


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = parameters_beg["lr"])



### Ready to Train 

for e in range(1, parameters_beg["EPOCHS"]): 
    
    model.train() ## This is to indicate that we are in training mode - this doesn't matter for this model 
    
    for x_train_batch, y_train_batch in train_loader:
        optimizer.zero_grad() 
        
        y_train_pred = model(x_train_batch) 
        train_loss = criterion(y_train_pred, y_train_batch) 
        train_acc = multi_acc(y_train_pred, y_train_batch) 
        
        train_loss.backward()
        optimizer.step() 
        
    if (e % 100 == 99): 
        print(f"At epoch {e} the loss is {train_loss} and the accuracy is {train_acc}. ")





At epoch 99 the loss is 3.3015358448028564 and the accuracy is 0.0. 
At epoch 199 the loss is 3.2176172733306885 and the accuracy is 0.0. 
At epoch 299 the loss is 3.1290409564971924 and the accuracy is 0.0. 
At epoch 399 the loss is 3.0526106357574463 and the accuracy is 0.0. 
At epoch 499 the loss is 2.9941940307617188 and the accuracy is 0.0. 
At epoch 599 the loss is 2.934602737426758 and the accuracy is 0.0. 


In [24]:
a =non_genres_floats.iloc[0:1, :-1]
model(torch.tensor(np.array(a), dtype = torch.float))

tensor([[ 4.7137e-01,  3.2705e-03,  8.3952e-02,  4.6191e+00,  1.3156e+00,
          6.5947e-01,  3.2702e+00,  3.1045e+00,  1.0445e+00,  1.1427e+00,
          9.4095e-01,  1.3816e+00,  2.1353e+00,  3.2103e+00,  1.9326e+00,
          1.7444e+00,  2.2414e+00,  7.5233e-01,  1.7327e+00, -3.5179e+00,
          1.6299e+00,  1.6918e+00, -1.4416e+00,  2.0032e+00,  2.3833e+00,
          4.0063e+00,  1.5949e+00, -4.3315e+00,  2.4868e+00,  4.5492e-01,
         -3.3098e+01, -1.3587e+00,  4.5423e+00,  2.7861e+00,  1.6983e+00,
          3.0420e+00,  1.6333e+00,  3.6291e+00,  3.8996e+00,  3.8604e+00,
          3.0293e+00,  3.2732e+00,  3.9801e+00,  4.4064e+00,  2.9356e+00,
          1.1120e+00,  8.2049e-04,  4.4180e-01, -5.0107e-01,  1.9879e+00,
          4.3679e+00,  4.5688e+00,  1.8373e-01,  2.3552e+00, -1.8149e+00,
          2.5251e+00,  1.1640e+00, -7.0723e-01,  2.4785e+00]],
       grad_fn=<AddmmBackward>)

### Things to Note 

There are too many classes and not nearly enough labels for there to be any type of accuracy - instead it might make more sense to pick a few playlist and try to sort based on those instead of shotting for all of them 
    We also have to factor in the fact that songs are part of many different playlists and this is confusing the model's fitting. 

### First Model, Second Round 

In order to address the fact that there are too many playlists, I'm going to do some data engineering and rerun the model on the same stuff 





In [25]:
non_duplicates = non_genres_df.groupby("id").first()
counts = non_duplicates.groupby("playlist name").count()["name"]
relevant_playlists = counts[counts > 40].drop(labels = ["Unsorted", "Queen Best Of"])
relevant_playlist_df = non_genres_df[non_genres_df["playlist name"].isin(relevant_playlists.index)]
print(relevant_playlist_df.shape, non_genres_df.shape) 
previous = len(non_genres_df["playlist name"].unique())
current = len(relevant_playlist_df["playlist name"].unique())
print(f"We had {previous} playlists and now we have {current} playlists")

(2216, 18) (4306, 18)
We had 59 playlists and now we have 20 playlists


In [26]:
relevant_playlist_df.columns
with_names_relevant_playlist = relevant_playlist_df.copy()
relevant_playlist_df.drop(["id", "name", "artist_name", "release date", "genres"], axis = 1, inplace = True)

playlist_mapping = {}
index = 0
for a in relevant_playlist_df["playlist name"]: 
    if a not in playlist_mapping.keys(): 
        playlist_mapping[a] = index 
        index += 1
        
reverse_playlist_mapping = {value: key for key, value in playlist_mapping.items()}
relevant_playlist_df["playlist name"].replace(playlist_mapping, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [27]:
x_train, x_test, x_val, y_train, y_test, y_val = clean_give_data(relevant_playlist_df)
FEATURES_NUM = len(x_train[0])
CLASS_NUM = len(playlist_mapping.keys())


In [28]:

train_data = ClassifierFirstDataset(x_train, y_train) 
val_data = ClassifierFirstDataset(x_val, y_val) 
test_data = ClassifierFirstDataset(x_test, y_test) 

train_loader = torch.utils.data.DataLoader(dataset = train_data, batch_size = 60)
val_loader = torch.utils.data.DataLoader(dataset = val_data, batch_size = 1) 
test_loader = torch.utils.data.DataLoader(dataset = test_data, batch_size = 1)


    
parameters_beg = {
    "FEATURES_NUM": len(x_train[0]), 
    "CLASS_NUM": len(playlist_mapping.keys()), 
    "EPOCHS": 600, 
    "lr": .0001
}
    
    
model = SongClassifierBeg(parameters_beg["FEATURES_NUM"], parameters_beg["CLASS_NUM"])


criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = parameters_beg["lr"])


### Ready to Train 

for e in range(1, parameters_beg["EPOCHS"]): 
    
    model.train() ## This is to indicate that we are in training mode - this doesn't matter for this model 
    
    for x_train_batch, y_train_batch in train_loader:
        optimizer.zero_grad() 
        
        y_train_pred = model(x_train_batch) 
        train_loss = criterion(y_train_pred, y_train_batch) 
        train_acc = multi_acc(y_train_pred, y_train_batch) 
        
        train_loss.backward()
        optimizer.step() 
        
    if (e % 100 == 99): 
        print(f"At epoch {e} the loss is {train_loss} and the accuracy is {train_acc}. ")



At epoch 99 the loss is 2.5459704399108887 and the accuracy is 0.0. 
At epoch 199 the loss is 2.4367964267730713 and the accuracy is 0.0. 
At epoch 299 the loss is 2.3307957649230957 and the accuracy is 0.0. 
At epoch 399 the loss is 2.2349696159362793 and the accuracy is 0.0. 
At epoch 499 the loss is 2.167545795440674 and the accuracy is 0.0. 
At epoch 599 the loss is 2.1246747970581055 and the accuracy is 0.0. 


It's pretty apparent that the accuracy is off for some reason so I've done an assessment below on the whole set

In [29]:
y_pred = model(torch.tensor(np.array(relevant_playlist_df.iloc[:, :-1]), dtype = torch.float))
y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
_, y_pred_tags = torch.max(y_pred_softmax, dim = 1) 
with_names_relevant_playlist["predicted"] = y_pred_tags.numpy()
with_names_relevant_playlist["predicted"] = with_names_relevant_playlist["predicted"].replace(reverse_playlist_mapping)
with_names_relevant_playlist

np.mean(with_names_relevant_playlist["playlist name"] == with_names_relevant_playlist["predicted"])

0.24368231046931407

After I removed the duplicate songs and we gt rid of the irrelevant playlists (smaller and duplicates) we are down to about half as many songs with about 25% accuracy - the next step is to run the same model but this time include the genres with it as well 

I see a variety of drawbacks in using a sparesly populated one hot encoding for the genres on the nodes for the neural network so it might be smart to just utilize the top genres to reduce the sparseness at least - we can run both versions and see what happens 

In [47]:
with_genres_df = song_df.drop(["name", "artist_name", "release date", "genres"], axis = 1)

#with_top_genres = with_genres_df[[]]

#sorted_genres
relevant_columns = with_genres_df.columns[:14]
total_columns = list(relevant_columns) + sorted_genres[:50]
print(total_columns)
with_top_genres = with_genres_df[total_columns]
with_top_genres

['id', 'popularity', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'playlist name', 'pop rap', 'rap', 'pop', 'hip hop', 'dance pop', 'trap', 'conscious hip hop', 'southern hip hop', 'indie pop rap', 'post-teen pop', 'pop rock', 'modern rock', 'desi pop', 'atl hip hop', 'modern bollywood', 'rock', 'desi hip hop', 'r&b', 'melodic rap', 'deep underground hip hop', 'dmv rap', 'filmi', 'sufi', 'urban contemporary', 'chicago rap', 'viral pop', 'alternative r&b', 'canadian pop', 'underground hip hop', 'indian folk', 'glam rock', 'neo mellow', 'canadian hip hop', 'toronto rap', 'miami hip hop', 'gangster rap', 'deep pop r&b', 'uk pop', 'soundtrack', 'movie tunes', 'nc hip hop', 'emo rap', 'hollywood', 'dfw rap', 'baroque pop', 'vapor trap', 'piano rock', 'show tunes', 'soft rock', 'detroit hip hop']


Unnamed: 0,id,popularity,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,nc hip hop,emo rap,hollywood,dfw rap,baroque pop,vapor trap,piano rock,show tunes,soft rock,detroit hip hop
0,3MXCnmak0GBBduWbahRY8G,47,1,0.900,0.461,-14.453,1,0.5150,0.1610,0.000002,...,0,0,0,0,0,0,0,0,0,0
1,5gQcxYXqnofyocKgEbGYKt,72,1,0.864,0.664,-7.315,1,0.1040,0.0476,0.000000,...,0,0,0,0,0,1,0,0,0,0
2,5DI9jxTHrEiFAhStG7VA8E,66,1,0.794,0.522,-7.829,1,0.1590,0.0328,0.000000,...,0,0,0,0,0,0,0,0,0,0
3,1R7ChEm1x3mGhDWXKnPSXn,38,0,0.703,0.786,-6.779,0,0.0483,0.1980,0.000712,...,0,0,0,0,0,0,0,0,0,0
4,1R7ChEm1x3mGhDWXKnPSXn,38,0,0.703,0.786,-6.779,0,0.0483,0.1980,0.000712,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4301,26h6qDR3KtieqIWgnveqmC,56,0,0.463,0.245,-9.438,1,0.0387,0.9570,0.000000,...,0,0,0,0,0,0,0,0,0,0
4302,6FRLCMO5TUHTexlWo8ym1W,78,1,0.851,0.541,-6.825,1,0.0505,0.5680,0.000000,...,0,0,0,0,0,0,0,0,0,0
4303,6Umac95Mt46VcwAM9s9mOa,65,0,0.540,0.572,-8.665,1,0.1150,0.6240,0.000000,...,0,0,0,0,0,0,0,0,0,0
4304,2v8YyHvDPBfydhVOTvuHl9,0,1,0.544,0.446,-11.735,1,0.5360,0.6070,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [50]:
non_duplicates = with_top_genres.groupby("id").first()
counts = non_duplicates.groupby("playlist name").count()["popularity"]
relevant_playlists = counts[counts > 40].drop(labels = ["Unsorted", "Queen Best Of"])
relevant_playlist_genres_df = with_top_genres[with_top_genres["playlist name"].isin(relevant_playlists.index)]
print(relevant_playlist_genres_df.shape, with_top_genres.shape) 
previous = len(non_genres_df["playlist name"].unique())
current = len(relevant_playlist_df["playlist name"].unique())
print(f"We had {previous} playlists and now we have {current} playlists")

(2216, 64) (4306, 64)
We had 59 playlists and now we have 20 playlists


#### First Model with Top Genres

Also Defining a couple functions to make live easier

In [None]:
def make_playlist_mapping(df): 
    playlist_mapping = {}
    index = 0
    for a in df["playlist name"]: 
        if a not in playlist_mapping.keys(): 
            playlist_mapping[a] = index 
            index += 1

    reverse_playlist_mapping = {value: key for key, value in playlist_mapping.items()}
    df["playlist name"].replace(playlist_mapping, inplace = True)
    return df, playlist_mapping, reverse_mapping 




In [None]:
df, playlist_mapping, reverse_mapping  = make_playlist_mapping(relevant_playlist_genres_df)
x_train, x_test, x_val, y_train, y_test, y_val = clean_give_data(relevant_playlist_genres_df)
FEATURES_NUM = len(x_train[0])
CLASS_NUM = len(playlist_mapping.keys())


In [None]:
class SongClassifer(torch.nn.Module): 
    
    def __init__ (self, num_feature, num_class): 
        super(SongClassifier, self).__init__()
        
        self.layer1 = torch.nn.Linear(num_feature, 512) 
        self.layer2 = torch.nn.Linear(512, 128) 
        self.layer3 = torch.nn.Linear(128, 64) 
        self.layer_out = torch.nn.Linear(64, num_class) 
        
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p = .2)
        self.batchnorm1 = torch.nn.BatchNorm1d(512)  ## applies batch normalization across the batch 
        self.batchnorm2 = torch.nn.BatchNorm1d(128) 
        self.batchnorm3 = torch.nn.BatchNorm1d(64) 
    
    def forward(self, x): 
        x = self.layer1(x) 
        x = self.batchnorm1(x) 
        x = self.relu(x) 
        x = self.dropout(x) 
        
        x = self.layer2(x) 
        x = self.batchnorm2(x) 
        x = self.relu(x) 
        x = self.dropout(x) 
        
        x = self.layer3(x) 
        x = self.batchnorm3(x) 
        x = self.relu(x) 
        x = self.dropout(x) 
        
        x = self.layer_out(x) 
        
        return x 
        