In [2]:
import importlib
import random

import mysklearn.myutils as myutils
import mysklearn.plotutils as plotutils
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers as myclassifiers
from mysklearn.myclassifiers import MyKNeighborsClassifier, MyNaiveBayesClassifier

import mysklearn.myevaluation as myevaluation

In [None]:
# Load the raw Spotify tracks file into a fresh table.
spotify_table = MyPyTable()
spotify_table.load_from_file("./input_data/spotify_tracks.csv")
print("Number of instances:", len(spotify_table.data))

# Show the distinct genre labels so the filter below can be sanity-checked.
genre_row = spotify_table.get_column("track_genre")
genres = myutils.get_unique_values(genre_row)
print(genres)

# Resolve column positions once, by name, instead of on every iteration
# (and instead of assuming the genre is the last column of each row).
speechiness_index = spotify_table.column_names.index("speechiness")
genre_index = spotify_table.column_names.index("track_genre")

# Rows at or above this speechiness in one of these genres are treated as
# spoken-word "stories" rather than songs and are removed.
STORY_SPEECHINESS_THRESHOLD = 0.85
STORY_GENRES = ["kids", "children", "comedy"]

rows_to_remove = [
    i
    for i, row in enumerate(spotify_table.data)
    if row[speechiness_index] >= STORY_SPEECHINESS_THRESHOLD
    and row[genre_index] in STORY_GENRES
]
spotify_table.drop_rows(rows_to_remove)
print("number of instances with stories removed:", len(spotify_table.data))

In [None]:
# Remove repeated tracks, keeping only the first row seen for each id.
# Column 1 is used as the track identifier (same position the original cell read).
TRACK_ID_INDEX = 1

rows_to_remove = []
track_ids = []          # ids in first-seen order (kept as a list, as before)
seen_track_ids = set()  # same ids, for constant-time membership checks

for i, row in enumerate(spotify_table.data):
    track_id = row[TRACK_ID_INDEX]
    if track_id in seen_track_ids:
        # Already encountered: this row is a duplicate of an earlier one.
        rows_to_remove.append(i)
    else:
        seen_track_ids.add(track_id)
        track_ids.append(track_id)

spotify_table.drop_rows(rows_to_remove)
print("Number of instances with duplicates removed:", len(spotify_table.data))

In [None]:
# Keep only the TOP_N_POPULAR rows with the highest "popularity" value.
TOP_N_POPULAR = 10000

# Pair each popularity value with its row index, then sort ascending so the
# least popular rows come first (ties are ordered by row index).
popularity_column = spotify_table.get_column("popularity")
popularity_index_list = sorted(
    [popularity, i] for i, popularity in enumerate(popularity_column)
)

# Number of rows beyond the top N. Clamp at zero: with fewer than N rows a
# negative slice bound would otherwise drop rows that should be kept.
surplus_count = max(0, len(popularity_index_list) - TOP_N_POPULAR)
rows_to_remove = [instance[1] for instance in popularity_index_list[:surplus_count]]
spotify_table.drop_rows(rows_to_remove)

# NOTE(review): no visible cell writes "cleaned_spotify_tracks.csv", which a
# later cell loads - confirm where that file is produced.
print("Final number of instances:", len(spotify_table.data))
print(spotify_table.data[:10])

In [3]:
# Start from a fresh table and load the previously cleaned dataset from disk,
# replacing whatever `spotify_table` held before this cell ran.
spotify_table = MyPyTable()
# As the last expression in the cell, the call's return value is displayed as output.
spotify_table.load_from_file("./input_data/cleaned_spotify_tracks.csv")

<mysklearn.mypytable.MyPyTable at 0x7fbf9019da00>

Once we narrowed our data down to the 10,000 most popular songs, we looked at the ways it needed to be cleaned before classification. Track_genre was the only column that seemed to need cleaning, so we tried a few ways to clean track_genre to make it usable for the classifiers. There were so many unique values, and some had so many more instances than others, that it was difficult to find a logical way to regroup them while maintaining a reasonable distribution. Ultimately, we decided that track_genre probably wouldn't be a good enough predictor to be worth the effort it would take to properly clean the data.

We ran our kNN classifier on our 10,000-instance dataset, and it took about 30 minutes to come up with a model. Because it took so long with so many instances, we decided to cut our data down to only 1,000 instances using random sampling. We then got rid of the index column and the track id column because they no longer added any information to our data.

In [4]:
# Down-sample the table to at most SAMPLE_SIZE rows, then drop the first two
# columns of every row (described in the narrative as the index and track id).
SAMPLE_SIZE = 1000
SAMPLE_SEED = 0

# A dedicated, seeded generator makes the sample reproducible across runs
# without touching the global `random` state used elsewhere.
sampler = random.Random(SAMPLE_SEED)
# Clamp k so a table smaller than SAMPLE_SIZE does not raise ValueError.
sample_count = min(SAMPLE_SIZE, len(spotify_table.data))
spotify_table.data = sampler.sample(spotify_table.data, k=sample_count)

# NOTE: this slicing is not idempotent - re-running the cell without reloading
# the table strips two more columns each time.
spotify_table.data = [row[2:] for row in spotify_table.data]
spotify_table.column_names = spotify_table.column_names[2:]
print(spotify_table.column_names, spotify_table.data[:10])

['artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre'] [['Deftones', 'Saturday Night Wrist', 'Cherry Waves', 75.0, 317706.0, 'False', 0.47, 0.859, 2.0, -3.663, 1.0, 0.0537, 0.000362, 0.00691, 0.142, 0.17, 124.01, 4.0, 'metal'], ['Bethel Music;Kristene DiMarco', 'You Make Me Brave (Live)', 'It Is Well - Live', 62.0, 385053.0, 'False', 0.35, 0.25, 4.0, -10.648, 0.0, 0.0314, 0.698, 1.46e-05, 0.11, 0.11, 129.82, 4.0, 'ambient'], ['Dominic Fike;Zendaya', 'Elliot\'s Song (From "Euphoria" An HBO Original Series)', 'Elliot\'s Song - From "Euphoria" An HBO Original Series', 70.0, 150320.0, 'False', 0.394, 0.327, 4.0, -14.291, 1.0, 0.114, 0.849, 0.0, 0.125, 0.411, 93.358, 4.0, 'alt-rock'], ['Disturbed', 'Unstoppable', 'Unstoppable', 68.0, 238109.0, 'False', 0.529, 0.988, 1.0, -1.914, 1.0, 0.0768, 7.15e-0

Once we had cut our data down to a more reasonable size, we saved it to a new file so we could easily reload it without running our whole data-cleaning notebook again.

In [5]:
# Write the sampled, column-trimmed table to disk so it can be reloaded
# later without re-running the cleaning cells above.
spotify_table.save_to_file("input_data/sampled_cleaned_spotify_tracks.csv")