# Pre-processing and Training for Capstone Two: Music & Happiness

### Table of Contents

* [Introduction](#start)
    * [Import relevant libraries](#import)
* [Pre-processing](#preprocess)
    * [Encode dummy variables for countries](#dummies)
    * [Scale data using StandardScale](#scaling)
* [Training](#train)
    * [Split the data](#split)

## 1 - Introduction <a name="start"></a>

### 1.1 - Import relevant libraries <a name="import"></a>

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Retrieve dataframes stored in the EDA phase
%store -r songs_data_df
%store -r songs_data_stats_country
%store -r songs_data_stats_region

## 2 - Pre-processing <a name="import"></a>

### 2.1 - Encode dummy variables for countries <a name="dummies"></a>

In [3]:
songs_data_stats_country.head()

Unnamed: 0,country,popularity,is_explicit,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Argentina,79.683897,30.206966,183280.889854,0.731863,0.686005,625.03786,-5.248106,0.488339,0.088648,0.238702,0.004518,0.189616,0.644262,114.017433,4
1,Australia,88.36665,36.016469,201002.420998,0.637842,0.622122,473.669583,-6.857,0.702213,0.071042,0.256684,0.008286,0.172767,0.497093,125.523188,4
2,Austria,81.920905,30.942392,180443.48368,0.655136,0.653844,501.647396,-6.832973,0.654763,0.096732,0.258236,0.019095,0.172822,0.516225,128.266714,4
3,Belarus,63.649356,53.788417,159353.026372,0.685833,0.641679,549.700781,-6.880253,0.47165,0.117817,0.215263,0.064024,0.212752,0.50115,124.893331,4
4,Belgium,84.965859,26.959596,191042.98899,0.633213,0.648327,489.050505,-6.599179,0.562828,0.078712,0.303627,0.02727,0.167556,0.521314,125.448159,4


In [4]:
df_encoded = pd.get_dummies(songs_data_stats_country.drop(['key', 'time_signature'], axis=1), columns=['country'], prefix=['country'])

In [5]:
df_encoded.head()

Unnamed: 0,popularity,is_explicit,duration_ms,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,country_Taiwan,country_Thailand,country_Turkey,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Uruguay,country_Venezuela,country_Vietnam
0,79.683897,30.206966,183280.889854,0.731863,0.686005,-5.248106,0.488339,0.088648,0.238702,0.004518,...,False,False,False,False,False,False,False,False,False,False
1,88.36665,36.016469,201002.420998,0.637842,0.622122,-6.857,0.702213,0.071042,0.256684,0.008286,...,False,False,False,False,False,False,False,False,False,False
2,81.920905,30.942392,180443.48368,0.655136,0.653844,-6.832973,0.654763,0.096732,0.258236,0.019095,...,False,False,False,False,False,False,False,False,False,False
3,63.649356,53.788417,159353.026372,0.685833,0.641679,-6.880253,0.47165,0.117817,0.215263,0.064024,...,False,False,False,False,False,False,False,False,False,False
4,84.965859,26.959596,191042.98899,0.633213,0.648327,-6.599179,0.562828,0.078712,0.303627,0.02727,...,False,False,False,False,False,False,False,False,False,False


In [6]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 85 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   popularity                    72 non-null     float64
 1   is_explicit                   72 non-null     float64
 2   duration_ms                   72 non-null     float64
 3   danceability                  72 non-null     float64
 4   energy                        72 non-null     float64
 5   loudness                      72 non-null     float64
 6   mode                          72 non-null     float64
 7   speechiness                   72 non-null     float64
 8   acousticness                  72 non-null     float64
 9   instrumentalness              72 non-null     float64
 10  liveness                      72 non-null     float64
 11  valence                       72 non-null     float64
 12  tempo                         72 non-null     float64
 13  country

### 2.2 - Scale data using StandardScaler <a name="scaling"></a>

In [7]:
# Since we already encoded the dummy variables, let's exclude Booleans from our DataFrame so we can apply the 
# StandardScaler without losing our country data.
df_no_bools = df_encoded.select_dtypes(exclude='bool')
df_bools = df_encoded.select_dtypes(include='bool')

# Make scaler object
scaler = StandardScaler()

# Fit the model to the data and transform it
scaled_df = scaler.fit_transform(df_no_bools)
scaled_df = pd.DataFrame(df_no_bools, columns=df_no_bools.columns)

# Combine the scaled data with df_bools to complete the DataFrame
df = pd.concat([scaled_df, df_bools], axis=1)

In [8]:
df.head()

Unnamed: 0,popularity,is_explicit,duration_ms,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,country_Taiwan,country_Thailand,country_Turkey,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Uruguay,country_Venezuela,country_Vietnam
0,79.683897,30.206966,183280.889854,0.731863,0.686005,-5.248106,0.488339,0.088648,0.238702,0.004518,...,False,False,False,False,False,False,False,False,False,False
1,88.36665,36.016469,201002.420998,0.637842,0.622122,-6.857,0.702213,0.071042,0.256684,0.008286,...,False,False,False,False,False,False,False,False,False,False
2,81.920905,30.942392,180443.48368,0.655136,0.653844,-6.832973,0.654763,0.096732,0.258236,0.019095,...,False,False,False,False,False,False,False,False,False,False
3,63.649356,53.788417,159353.026372,0.685833,0.641679,-6.880253,0.47165,0.117817,0.215263,0.064024,...,False,False,False,False,False,False,False,False,False,False
4,84.965859,26.959596,191042.98899,0.633213,0.648327,-6.599179,0.562828,0.078712,0.303627,0.02727,...,False,False,False,False,False,False,False,False,False,False


## 3 - Training <a name="train"></a>

### 3.1 - Split data <a name="split"></a>