In [1]:
import os
import numpy as np
import pandas as pd
get_ipython().run_line_magic('matplotlib', 'inline')
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition

import matplotlib.patches as mpatches
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import scatter_matrix

import random
from tqdm import tqdm # Progress Bar

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Read information of tracks

In [2]:
# read in tracks data
tracks = pd.read_csv("../Data/tracks.csv")
tracks.shape

(106576, 53)

In [3]:
# reformatting tracks data and removes duplicates
tracks = tracks.drop(1)
labels = tracks.iloc[0].values
labels[0] = "track_id"
tracks.columns = labels
print(labels)
tracks = tracks.drop(0)
tracks = tracks.drop_duplicates()

['track_id' 'comments' 'date_created' 'date_released' 'engineer'
 'favorites' 'id' 'information' 'listens' 'producer' 'tags' 'title'
 'tracks' 'type' 'active_year_begin' 'active_year_end' 'associated_labels'
 'bio' 'comments' 'date_created' 'favorites' 'id' 'latitude' 'location'
 'longitude' 'members' 'name' 'related_projects' 'tags' 'website'
 'wikipedia_page' 'split' 'subset' 'bit_rate' 'comments' 'composer'
 'date_created' 'date_recorded' 'duration' 'favorites' 'genre_top'
 'genres' 'genres_all' 'information' 'interest' 'language_code' 'license'
 'listens' 'lyricist' 'number' 'publisher' 'tags' 'title']


Tracks may be assigned to multiple genres or no genre at all.

In [4]:
tracks.iloc[5]["genres_all"]

'[17, 10, 76, 103]'

In [5]:
# keeping only columns with useful features
tracks = tracks[['track_id', 'title', 'location', 'name', 'bit_rate', 'genres_all']].copy()
tracks.head(20)

Unnamed: 0,track_id,title,title.1,location,name,bit_rate,genres_all
2,2,AWOL - A Way Of Life,Food,New Jersey,AWOL,256000,[21]
3,3,AWOL - A Way Of Life,Electric Ave,New Jersey,AWOL,256000,[21]
4,5,AWOL - A Way Of Life,This World,New Jersey,AWOL,256000,[21]
5,10,Constant Hitmaker,Freeway,,Kurt Vile,192000,[10]
6,20,Niris,Spiritual Level,Colchester England,Nicky Cook,256000,"[17, 10, 76, 103]"
7,26,Niris,Where is your Love?,Colchester England,Nicky Cook,256000,"[17, 10, 76, 103]"
8,30,Niris,Too Happy,Colchester England,Nicky Cook,256000,"[17, 10, 76, 103]"
9,46,Niris,Yosemite,Colchester England,Nicky Cook,256000,"[17, 10, 76, 103]"
10,48,Niris,Light of Light,Colchester England,Nicky Cook,256000,"[17, 10, 76, 103]"
11,134,AWOL - A Way Of Life,Street Music,New Jersey,AWOL,256000,[21]


In [6]:
# removing instances that do not have a value for genres_all
tracks.drop(tracks.index[tracks["genres_all"].isnull()], inplace=True)
tracks.drop(tracks.index[tracks["genres_all"].equals("['']")], inplace=True)
tracks.head(10)
tracks.shape

(106574, 7)

In [7]:
#get list of track ids to help group sync up
track_ids = tracks.iloc[:]["track_id"].values

In [8]:
MyFile=open('../Data/tracks.txt','w')
for element in track_ids:
    MyFile.write(str(element))
    MyFile.write('\n')
MyFile.close()

In [9]:
# replace list of genre's with the top genre's id and rename column
genres_formatted = []
for index in range(len(tracks)):
    genres = tracks.iloc[index]["genres_all"]
    if(type(genres) is str):
        genres2 = genres[1:-1]
        if("," in genres):
            genres_formatted=list(genres2.split(", "))
            genres_formatted=[int(x) for x in genres_formatted]
            genre = genres_formatted[0]
        else:
            genre=genres2
    else:
        genre = genres2
 
    tracks.iloc[index]["genres_all"] = genre
    genres_formatted=[]


In [10]:
tracks.head(10)

Unnamed: 0,track_id,title,title.1,location,name,bit_rate,genres_all
2,2,AWOL - A Way Of Life,Food,New Jersey,AWOL,256000,21
3,3,AWOL - A Way Of Life,Electric Ave,New Jersey,AWOL,256000,21
4,5,AWOL - A Way Of Life,This World,New Jersey,AWOL,256000,21
5,10,Constant Hitmaker,Freeway,,Kurt Vile,192000,10
6,20,Niris,Spiritual Level,Colchester England,Nicky Cook,256000,17
7,26,Niris,Where is your Love?,Colchester England,Nicky Cook,256000,17
8,30,Niris,Too Happy,Colchester England,Nicky Cook,256000,17
9,46,Niris,Yosemite,Colchester England,Nicky Cook,256000,17
10,48,Niris,Light of Light,Colchester England,Nicky Cook,256000,17
11,134,AWOL - A Way Of Life,Street Music,New Jersey,AWOL,256000,21


In [11]:
#rename genres_all column to genre_id
tracks = tracks.rename(columns={'genres_all': 'genre_id'})
tracks.head(10)

Unnamed: 0,track_id,title,title.1,location,name,bit_rate,genre_id
2,2,AWOL - A Way Of Life,Food,New Jersey,AWOL,256000,21
3,3,AWOL - A Way Of Life,Electric Ave,New Jersey,AWOL,256000,21
4,5,AWOL - A Way Of Life,This World,New Jersey,AWOL,256000,21
5,10,Constant Hitmaker,Freeway,,Kurt Vile,192000,10
6,20,Niris,Spiritual Level,Colchester England,Nicky Cook,256000,17
7,26,Niris,Where is your Love?,Colchester England,Nicky Cook,256000,17
8,30,Niris,Too Happy,Colchester England,Nicky Cook,256000,17
9,46,Niris,Yosemite,Colchester England,Nicky Cook,256000,17
10,48,Niris,Light of Light,Colchester England,Nicky Cook,256000,17
11,134,AWOL - A Way Of Life,Street Music,New Jersey,AWOL,256000,21


In [12]:
tracks.shape

(106574, 7)

### Read features

In [13]:
features = pd.read_csv("../Data/features.csv")

In [14]:
# reformatting features data
features = features.drop(1)
features = features.drop(2)
features = features.drop(0)

columnNames = features.columns.values
columnNames[0] = "track_id"
features.columns = columnNames
features.head(10)

Unnamed: 0,track_id,chroma_cens,chroma_cens.1,chroma_cens.2,chroma_cens.3,chroma_cens.4,chroma_cens.5,chroma_cens.6,chroma_cens.7,chroma_cens.8,...,tonnetz.39,tonnetz.40,tonnetz.41,zcr,zcr.1,zcr.2,zcr.3,zcr.4,zcr.5,zcr.6
3,2,7.1806526184,5.2303090096,0.24932080507,1.3476201296,1.4824777842,0.53137123585,1.4815930128,2.691454649,0.86686819792,...,0.054125156254,0.012225749902,0.012110591866,5.758890152,0.45947265625,0.085629448295,0.0712890625,0.0,2.0898721218,0.061448108405
4,3,1.8889633417,0.76053929329,0.34529656172,2.2952005863,1.6540306807,0.067592434585,1.3668476343,1.0540937185,0.10810308903,...,0.063831120729,0.014211839065,0.017740072682,2.8246941566,0.46630859375,0.084578499198,0.06396484375,0.0,1.7167237997,0.0693301633
5,5,0.52756297588,-0.077654317021,-0.27961030602,0.6858831048,1.9375696182,0.880838871,-0.92319184542,-0.92723226547,0.66661673784,...,0.040730185807,0.012690781616,0.014759079553,6.8084154129,0.375,0.05311408639,0.04150390625,0.0,2.1933031082,0.044860601425
6,10,3.7022454739,-0.29119303823,2.1967420578,-0.234449476,1.3673638105,0.9984113574,1.7706941366,1.6045658588,0.52121698856,...,0.074357867241,0.01795193553,0.013921394013,21.434211731,0.4521484375,0.077514506876,0.07177734375,0.0,3.542324543,0.040800448507
7,20,-0.19383698702,-0.19852678478,0.20154602826,0.25855624676,0.77520370483,0.084794059396,-0.28929358721,-0.81641042233,0.043850939721,...,0.095002755523,0.022492416203,0.021355332807,16.669036865,0.4697265625,0.047224905342,0.0400390625,0.0009765625,3.18983078,0.030992921442
8,26,-0.69953453541,-0.68415790796,0.048824872822,0.042658798397,-0.81896692514,-0.91712284088,-0.9018342495,-0.066844828427,-0.29103723168,...,0.10371652246,0.025541320443,0.023846302181,41.645809174,0.25048828125,0.018387714401,0.015625,0.0,4.6905956268,0.014598459937
9,30,-0.7214871645,-0.84855991602,0.89090377092,0.088619679213,-0.44551330805,-1.2711701393,-1.2401897907,-1.3437650204,-0.90560036898,...,0.14169253409,0.02042612806,0.025417611003,8.1665945053,0.546875,0.054416511208,0.0361328125,0.00244140625,2.2447082996,0.052673552185
10,46,-0.11970755458,-0.85881441832,2.3625464439,0.10658428818,-1.3159115314,-1.2203541994,-1.668161869,-0.51603251696,-0.53639507294,...,0.13263167441,0.033211655915,0.021309997886,14.731082916,0.2236328125,0.036600999534,0.0322265625,0.0,2.8487360477,0.020713411272
11,48,-1.0540534258,0.93233942986,0.52806353569,-1.0353376865,-1.0006815195,-1.1193039417,1.1669902802,-1.0026028156,-1.0949990749,...,0.1419545114,0.024653503671,0.025202710181,24.550788879,0.37158203125,0.033015336841,0.0224609375,0.0,4.4067325592,0.039016269147
12,134,0.91844475269,0.67414724827,0.5778182745,1.2811170816,0.93374562263,0.078176945448,1.1992042065,-0.17522314191,0.92548191547,...,0.05876616016,0.016322381794,0.015819497406,4.731086731,0.41943359375,0.064369551837,0.05078125,0.0,1.8061059713,0.054622855037


In [15]:
features.columns

Index(['track_id', 'chroma_cens', 'chroma_cens.1', 'chroma_cens.2',
       'chroma_cens.3', 'chroma_cens.4', 'chroma_cens.5', 'chroma_cens.6',
       'chroma_cens.7', 'chroma_cens.8',
       ...
       'tonnetz.39', 'tonnetz.40', 'tonnetz.41', 'zcr', 'zcr.1', 'zcr.2',
       'zcr.3', 'zcr.4', 'zcr.5', 'zcr.6'],
      dtype='object', length=519)

In [16]:
features.shape

(106574, 519)

In [17]:
# combine features with tracks by track_id
final_dataset = pd.merge(tracks, features)
final_dataset.shape

(91213, 525)

In [18]:
final_dataset.to_csv(r'..\Data\preprocessed_data.csv', index=False)