# Tidy

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Read the raw song table and preview its first rows.
raw_csv_path = '/data/danield2255/music.csv'
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,artist.hotttnesss,artist.id,artist.name,artist_mbtags,artist_mbtags_count,bars_confidence,bars_start,beats_confidence,beats_start,duration,...,start_of_fade_out,tatums_confidence,tatums_start,tempo,terms,terms_freq,time_signature,time_signature_confidence,title,year
0,0.401998,ARD7TVE1187B99BFB1,Casual,,0.0,0.643,0.58521,0.834,0.58521,218.93179,...,218.932,0.779,0.28519,92.198,hip hop,1.0,4.0,0.778,I Didn't Mean To,0
1,0.4175,ARMJAGH1187FB546F3,The Box Tops,classic pop and rock,1.0,0.007,0.71054,1.0,0.20627,148.03546,...,137.915,0.969,0.20627,121.274,blue-eyed soul,1.0,4.0,0.384,Soul Deep,1969
2,0.343428,ARKRRTF1187B9984DA,Sonora Santanera,,0.0,0.98,0.73152,0.98,0.73152,177.47546,...,172.304,0.482,0.42132,100.07,salsa,1.0,1.0,0.0,Amor De Cabaret,0
3,0.454231,AR7G5I41187FB4CE6C,Adam Ant,uk,1.0,0.017,1.30621,0.809,0.81002,233.40363,...,217.124,0.601,0.56254,119.293,pop rock,0.988584,4.0,0.0,Something Girls,1982
4,0.401724,ARXR32B1187FB57099,Gob,,0.0,0.175,1.06368,0.883,0.13576,209.60608,...,198.699,1.0,0.13576,129.738,pop punk,0.887288,4.0,0.562,Face the Ashes,2007


# Drop Extraneous Columns
The latitude and longitude columns were not needed since there was a categorical artist location variable. We also drop `similar` because it is a recommendation of similar artists and does not relate to the actual song observations. In addition, we drop the tatums_confidence and tatums_start columns because we will analyze rhythm through the beat-related variables and do not need to worry about tatums. Finally, we drop artist_mbtags and artist_mbtags_count because we will use the hotness levels as a measure of popularity. 

In [3]:
# Columns that are not needed for the song-level analysis.
extraneous_columns = [
    'latitude',
    'longitude',
    'similar',
    'tatums_confidence',
    'tatums_start',
    'artist_mbtags',
    'artist_mbtags_count',
]
data = data.drop(columns=extraneous_columns)

# Appropriate Categorical Values

The Echonest software encoded the song keys, modes, and time signatures using integers, but because they will be used as categories in the analysis they were put back into more appropriate terms for observation. 

In [4]:
# Translate the integer codes used for mode, key and time signature into
# readable category labels. A code of -1 is mapped to NaN (missing).
new_vals = {
    'mode': {0: "Minor", 1: "Major", -1: np.nan},
    'key': {
        0: "C", 1: "C#", 2: "D", 3: "D#", 4: "E", 5: "F",
        6: "F#", 7: "G", 8: "G#", 9: "A", 10: "A#", 11: "B",
    },
    # 3 ("3/4") was previously missing, which left waltz-time songs as a bare
    # number in a column whose other values are string labels.
    'time_signature': {
        -1: np.nan,
        1: "Complex or Changing",
        3: "3/4",
        4: "4/4",
        5: "5/4",
        6: "6/4",
        7: "7/4",
    },
}
# Reassign instead of mutating in place so re-running the cell is predictable.
data = data.replace(new_vals)

## Missing Values

In [5]:
# Count the songs that have no recorded song hotness.
hotness_is_missing = data['song.hotttnesss'].isnull()
null_data = int(hotness_is_missing.sum())
null_data

4351

In [6]:
# Placeholder values that stand for "unknown" in each column; turn them into NaN.
placeholder_by_column = [
    ('year', 0),
    ('artist.id', -1),
    ('release.id', -1),
    ('location', 'Not available'),
]
for column_name, placeholder in placeholder_by_column:
    data[column_name] = data[column_name].replace(placeholder, np.nan)

In [7]:
# Impute missing familiarity with the column mean (computed over non-missing rows).
mean_familiarity = data['familiarity'].mean(skipna=True)
data['familiarity'] = data['familiarity'].fillna(mean_familiarity)

# Songs without a measured hotness are given a hotness of 0.
data['song.hotttnesss'] = data['song.hotttnesss'].fillna(0)

Here we needed to fill in the familiarity values because some were missing and it is a quantitative variable; the missing entries were replaced with the column mean. I set missing values of song hotness to 0 because they were songs not popular enough to be measured by the scaling program, and there were so many missing values that using the average would likely not have been appropriately representative. 

In [8]:
# Quantitative columns that must not contain any missing values at this point.
columns_required_complete = [
    'artist.hotttnesss',
    'bars_confidence',
    'bars_start',
    'beats_confidence',
    'beats_start',
    'duration',
    'end_of_fade_in',
    'familiarity',
    'key_confidence',
    'loudness',
    'mode_confidence',
    'song.hotttnesss',
    'start_of_fade_out',
    'tempo',
    'terms_freq',
    'time_signature_confidence',
]
# Inspect `columns_with_nulls` after a failure to see which columns are at fault.
columns_with_nulls = [
    column for column in columns_required_complete
    if data[column].isnull().any()
]
assert not columns_with_nulls


# Rename Columns
Here I renamed some of the columns whose names did not make clear which variable was being described. In the original names, if there is a period in the variable name, the variable refers to what comes before the period. Example: 'artist.hotttnesss' is a hotness (popularity) rating of the artist, not of the song that is the row observation. 

In [9]:
# Old column name -> clearer name that states which entity the value describes.
clearer_names = {
    'artist.id': 'artist_id',
    'artist.name': 'artist_name',
    'artist.hotttnesss': 'artist_hotness',
    'familiarity': 'artist_familiarity',
    'location': 'artist_location',
    'terms': 'genre',
    'release.id': 'release_id',
    'release.name': 'release_name',
    'song.id': 'song_id',
    'terms_freq': 'artist_genre_freq',
    'song.hotttnesss': 'song_hotness',
    'terms_weight': 'artist_terms_weight',
    'year': 'release_year',
    'title': 'song_title',
}
data = data.rename(columns=clearer_names)

## Checking Type Continuity
First I had to change some float values to integers for discrete quantitative variables.
After this, the following tests ensure that the data types are what we expect, and are the same through each column. 

In [10]:
# Cast the categorical columns to object and the release identifier to integer.
target_dtypes = {
    'key': 'object',
    'mode': 'object',
    'release_id': 'int',
    'time_signature': 'object',
    'release_year': 'object',
}
data = data.astype(target_dtypes)

In [11]:
# Expected dtype of every column in the tidy table.
expected_dtypes = {
    'artist_hotness': 'float',
    'artist_id': 'object',
    'artist_name': 'object',
    'bars_confidence': 'float',
    'bars_start': 'float',
    'beats_confidence': 'float',
    'beats_start': 'float',
    'duration': 'float',
    'end_of_fade_in': 'float',
    'artist_familiarity': 'float',
    'key': 'object',
    'key_confidence': 'float',
    'artist_location': 'object',
    'loudness': 'float',
    'mode': 'object',
    'mode_confidence': 'float',
    'release_id': 'int',
    'release_name': 'object',
    'song_hotness': 'float',
    'song_id': 'object',
    'start_of_fade_out': 'float',
    'tempo': 'float',
    'genre': 'object',
    'artist_genre_freq': 'float',
    'time_signature': 'object',
    'time_signature_confidence': 'float',
    'song_title': 'object',
    'release_year': 'object',
}
# Inspect `columns_with_wrong_dtype` after a failure to see which columns are at fault.
columns_with_wrong_dtype = [
    column for column, expected in expected_dtypes.items()
    if not (np.dtype(data[column]) == expected)
]
assert not columns_with_wrong_dtype


# One to One Relationships
Here there are no one-to-one relationships, as the only thing unique to each song is the song ID (`song_id`). Every other feature can potentially apply to multiple observations.

In [12]:
# There must be exactly 10000 distinct song ids (a missing id would count as one value).
distinct_song_ids = data['song_id'].nunique(dropna=False)
assert distinct_song_ids == 10000

In [13]:
# Summary of the song_id column: count, number of unique values, most frequent value.
song_id_summary = data['song_id'].describe(include='all')
song_id_summary

count                  10000
unique                 10000
top       SOIMREK12A58A792CD
freq                       1
Name: song_id, dtype: object

# One to Many Relationships

In this data, since there were so many columns to consider, there were a great number of one to many relationships. The following subsets are a few of these one to many relationships which will be used to answer questions of interest. 


In [14]:
def _song_feature(feature):
    """Return a two-column frame pairing song_id with one feature column of `data`."""
    return data[['song_id', feature]]


# One subset per song-level feature of interest.
song_popularity = _song_feature('song_hotness')
song_year = _song_feature('release_year')
song_genres = _song_feature('genre')
song_tempo = _song_feature('tempo')
song_key = _song_feature('key')
song_mode = _song_feature('mode')
song_length = _song_feature('duration')

# Many to Many Relationships

In the music dataset, there are repeats for genre across observations, and the other features also have repeat values. This can create a many to many relationship which can be useful in predicting relationships between features.

In [15]:
# Pairs of features whose values both repeat across songs.
genre_loudness = data.loc[:, ['genre', 'loudness']]
genre_mode = data.loc[:, ['genre', 'mode']]
songpop_artistpop = data.loc[:, ['song_hotness', 'artist_hotness']]

# Save Final Sets

In [16]:
# Write the tidy table and every subset as a JSON list of records.
output_dir = '/data/danield2255'
frames_to_export = [
    ('tidy_music', data),
    ('song_popularity', song_popularity),
    ('song_year', song_year),
    ('song_genres', song_genres),
    ('song_tempo', song_tempo),
    ('song_key', song_key),
    ('song_mode', song_mode),
    ('song_length', song_length),
    ('genre_loudness', genre_loudness),
    ('genre_mode', genre_mode),
    ('songpop_artistpop', songpop_artistpop),
]
for file_stem, frame in frames_to_export:
    frame.to_json('{}/{}.json'.format(output_dir, file_stem), orient='records')