# Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from collections import Counter

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler


In [2]:
# import function from local cleaning.py for column renaming
from cleaning import col_mapper

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Timestamp                     736 non-null    object 
 1   Age                           735 non-null    float64
 2   Primary streaming service     735 non-null    object 
 3   Hours per day                 736 non-null    float64
 4   While working                 733 non-null    object 
 5   Instrumentalist               732 non-null    object 
 6   Composer                      735 non-null    object 
 7   Fav genre                     736 non-null    object 
 8   Exploratory                   736 non-null    object 
 9   Foreign languages             732 non-null    object 
 10  BPM                           629 non-null    float64
 11  Frequency [Classical]         736 non-null    object 
 12  Frequency [Country]           736 non-null    object 
 13  Frequ

  df = df.replace({


In [3]:
# Lift the column cap so wide frames are displayed without truncation
pd.set_option('display.max_columns', None)

# SQLAlchemy engine pointing at the local SQLite file
engine = create_engine('sqlite:///resources/cleaned.db')

# Read the complete `main` table and key the rows by their uuid
query = 'SELECT * FROM main'
df = pd.read_sql_query(query, con=engine).set_index('uuid')

# Preview the first two rows
df.head(2)

Unnamed: 0_level_0,age,primary_streaming_service,hours_per_day,while_working,instrumentalist,composer,fav_genre,exploratory,foreign_languages,frequency_classical,frequency_country,frequency_edm,frequency_folk,frequency_gospel,frequency_hip_hop,frequency_jazz,frequency_k_pop,frequency_latin,frequency_lofi,frequency_metal,frequency_pop,frequency_r&b,frequency_rap,frequency_rock,frequency_video_game_music,anxiety,depression,insomnia,ocd,music_effects
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
d42f116b-9c55-4ca1-afd5-d0cf88b2a8b7,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,Never,Never,Very frequently,Never,Never,Rarely,Rarely,Very frequently,Never,Sometimes,Sometimes,Rarely,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,0.0
7202a658-ab9e-4c13-b489-86ca976ec793,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,Sometimes,Very frequently,Sometimes,Never,Sometimes,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,1.0


In [4]:
# Work on an independent (deep) copy so the frame loaded from SQL stays untouched
df_encoded = df.copy(deep=True)

## Encoding
Convert the object (string) columns to numeric values so they can be used by machine-learning models.

In [5]:
# Columns whose answers are a plain 'Yes' / 'No'
columns_for_conversion = [
    'instrumentalist',
    'composer',
    'while_working',
    'exploratory',
    'foreign_languages',
]
yes_no_codes = {'Yes': 1, 'No': 0}

# Re-code every listed column in one pass. Series.map turns any value that is
# not a key of the mapping into NaN.
df_encoded = df_encoded.assign(
    **{name: df_encoded[name].map(yes_no_codes) for name in columns_for_conversion}
)

df_encoded.head(2)

Unnamed: 0_level_0,age,primary_streaming_service,hours_per_day,while_working,instrumentalist,composer,fav_genre,exploratory,foreign_languages,frequency_classical,frequency_country,frequency_edm,frequency_folk,frequency_gospel,frequency_hip_hop,frequency_jazz,frequency_k_pop,frequency_latin,frequency_lofi,frequency_metal,frequency_pop,frequency_r&b,frequency_rap,frequency_rock,frequency_video_game_music,anxiety,depression,insomnia,ocd,music_effects
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
d42f116b-9c55-4ca1-afd5-d0cf88b2a8b7,18.0,Spotify,4.0,0,0,0,Video game music,0,1,Never,Never,Very frequently,Never,Never,Rarely,Rarely,Very frequently,Never,Sometimes,Sometimes,Rarely,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,0.0
7202a658-ab9e-4c13-b489-86ca976ec793,61.0,YouTube Music,2.5,1,0,1,Jazz,1,1,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,Sometimes,Very frequently,Sometimes,Never,Sometimes,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,1.0


In [6]:
# Ordinal codes for the listening-frequency answers (higher = more often)
frequency_mapping = {
    'Never': 0,
    'Rarely': 1,
    'Sometimes': 2,
    'Very frequently': 3
}

# Every frequency_<genre> column is selected by its shared prefix
genre_frequency_cols = [
    name for name in df_encoded.columns if name.startswith('frequency_')
]

# Map all of them at once. Values missing from the mapping become NaN, so
# running this cell a second time on already-encoded data would blank the columns.
df_encoded[genre_frequency_cols] = df_encoded[genre_frequency_cols].apply(
    lambda answers: answers.map(frequency_mapping)
)

df_encoded.head(2)

Unnamed: 0_level_0,age,primary_streaming_service,hours_per_day,while_working,instrumentalist,composer,fav_genre,exploratory,foreign_languages,frequency_classical,frequency_country,frequency_edm,frequency_folk,frequency_gospel,frequency_hip_hop,frequency_jazz,frequency_k_pop,frequency_latin,frequency_lofi,frequency_metal,frequency_pop,frequency_r&b,frequency_rap,frequency_rock,frequency_video_game_music,anxiety,depression,insomnia,ocd,music_effects
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
d42f116b-9c55-4ca1-afd5-d0cf88b2a8b7,18.0,Spotify,4.0,0,0,0,Video game music,0,1,0,0,3,0,0,1,1,3,0,2,2,1,0,1,1,3,7.0,7.0,10.0,2.0,0.0
7202a658-ab9e-4c13-b489-86ca976ec793,61.0,YouTube Music,2.5,1,0,1,Jazz,1,1,2,0,0,1,2,0,3,2,3,2,0,2,2,0,0,0,9.0,7.0,3.0,3.0,1.0


In [7]:
# One-hot encode the remaining categorical columns. In the displayed result
# these are primary_streaming_service and fav_genre, and the generated
# indicator columns hold True/False values.
df_encoded = pd.get_dummies(data=df_encoded)
df_encoded.head(2)


Unnamed: 0_level_0,age,hours_per_day,while_working,instrumentalist,composer,exploratory,foreign_languages,frequency_classical,frequency_country,frequency_edm,frequency_folk,frequency_gospel,frequency_hip_hop,frequency_jazz,frequency_k_pop,frequency_latin,frequency_lofi,frequency_metal,frequency_pop,frequency_r&b,frequency_rap,frequency_rock,frequency_video_game_music,anxiety,depression,insomnia,ocd,music_effects,primary_streaming_service_Apple Music,primary_streaming_service_I do not use a streaming service.,primary_streaming_service_Other streaming service,primary_streaming_service_Pandora,primary_streaming_service_Spotify,primary_streaming_service_YouTube Music,fav_genre_Classical,fav_genre_Country,fav_genre_EDM,fav_genre_Folk,fav_genre_Gospel,fav_genre_Hip hop,fav_genre_Jazz,fav_genre_K pop,fav_genre_Latin,fav_genre_Lofi,fav_genre_Metal,fav_genre_Pop,fav_genre_R&B,fav_genre_Rap,fav_genre_Rock,fav_genre_Video game music
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
d42f116b-9c55-4ca1-afd5-d0cf88b2a8b7,18.0,4.0,0,0,0,0,1,0,0,3,0,0,1,1,3,0,2,2,1,0,1,1,3,7.0,7.0,10.0,2.0,0.0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
7202a658-ab9e-4c13-b489-86ca976ec793,61.0,2.5,1,0,1,1,1,2,0,0,1,2,0,3,2,3,2,0,2,2,0,0,0,9.0,7.0,3.0,3.0,1.0,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False


In [8]:
# Pass every column label through the project's col_mapper helper
# (imported from cleaning.py) and rebind the result instead of mutating in place.
df_encoded = df_encoded.rename(columns=col_mapper)
print(df_encoded.columns)

Index(['age', 'hours_per_day', 'while_working', 'instrumentalist', 'composer',
       'exploratory', 'foreign_languages', 'frequency_classical',
       'frequency_country', 'frequency_edm', 'frequency_folk',
       'frequency_gospel', 'frequency_hip_hop', 'frequency_jazz',
       'frequency_k_pop', 'frequency_latin', 'frequency_lofi',
       'frequency_metal', 'frequency_pop', 'frequency_r&b', 'frequency_rap',
       'frequency_rock', 'frequency_video_game_music', 'anxiety', 'depression',
       'insomnia', 'ocd', 'music_effects',
       'primary_streaming_service_apple_music',
       'primary_streaming_service_i_do_not_use_a_streaming_service.',
       'primary_streaming_service_other_streaming_service',
       'primary_streaming_service_pandora',
       'primary_streaming_service_spotify',
       'primary_streaming_service_youtube_music', 'fav_genre_classical',
       'fav_genre_country', 'fav_genre_edm', 'fav_genre_folk',
       'fav_genre_gospel', 'fav_genre_hip_hop', 'fav_genre_ja

## Define testing and training data
Split, scale, and resample.

### I. Splitting encoded data into training and test data

In [9]:
# define target and feature data
target_col = 'music_effects'

# Labels as a 1-D array
y = df_encoded[target_col].to_numpy()

# The encoded frame mixes float, int and bool (one-hot) columns; `.values` on
# such a frame produces an object-dtype array. Request a float matrix explicitly
# so the feature array is purely numeric and any leftover non-numeric column
# fails here instead of further down the pipeline.
X = df_encoded.drop(columns=target_col).to_numpy(dtype=float)

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# NOTE(review): the classes are imbalanced (535 vs 183 in the value counts shown
# further down) - consider passing stratify=y so both splits keep that ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

# Report the dimensions of each piece of the split
print(f"Training features shape: {X_train.shape}")
print(f"Training targets shape: {y_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Testing targets shape: {y_test.shape}")

Training features shape: (574, 49)
Training targets shape: (574,)
Testing features shape: (144, 49)
Testing targets shape: (144,)


### II. Scale

In [11]:
# Create the scaler, then fit it on the training features only so that
# no statistics from the test rows enter the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
scaler

In [12]:
# Apply the already-fitted scaler to both splits (the test split is only transformed, never fitted)
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)

In [13]:
# Number of training rows after scaling (should match the split above)
X_train_scaled.shape[0]

574

### III. Resample with imbalanced-learn

In [14]:
# Class balance of the target over the full encoded dataset (before the split).
# Uses target_col rather than repeating the column name, so this check cannot
# drift from the target chosen when X and y were defined.
df_encoded[target_col].value_counts()

music_effects
1.0    535
0.0    183
Name: count, dtype: int64

In [15]:
# Randomly oversample the scaled training data so each class ends up with the
# same number of rows; the test split is left untouched.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

# Confirm the class balance after resampling
class_counts = Counter(y_resampled)
print(sorted(class_counts.items()))

[(0.0, 423), (1.0, 423)]


-