# Preprocessing

## Imports

In [4]:
import pandas as pd
from sqlalchemy import create_engine
from cleaning import col_mapper

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Show every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Open a connection to the SQLite database file resources/cleaned.db
engine = create_engine('sqlite:///resources/cleaned.db')

# Read the full `main` table into a DataFrame, using uuid as the index
query = 'SELECT * FROM main'
df = pd.read_sql_query(query, con=engine, index_col='uuid')

df.head(2)

## Encoding
Converting object-typed columns into numeric values so they can be used for machine learning.

In [None]:
# encode on an independent (deep) copy so the DataFrame read from SQL stays unchanged
df_encoded = df.copy(deep=True)
df_encoded.info()

In [None]:
# binary-encode the yes/no columns: 'Yes' -> 1, 'No' -> 0
# (any value that is not a key of the mapping becomes NaN)
yes_no_codes = {'Yes': 1, 'No': 0}

columns_for_conversion = [
    'instrumentalist',
    'composer',
    'while_working',
    'exploratory',
    'foreign_languages',
]

for col in columns_for_conversion:
    df_encoded[col] = df_encoded[col].map(yes_no_codes)

df_encoded.head(2)

In [None]:
# ordinal codes for the frequency_<genre> answers
frequency_mapping = {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Very frequently': 3}

# collect every frequency_<genre> column first, then convert each one to its numeric code
frequency_cols = [name for name in df_encoded.columns if name.startswith('frequency_')]
for col in frequency_cols:
    df_encoded[col] = df_encoded[col].map(frequency_mapping)

df_encoded.head(2)

In [None]:
# one-hot encode the remaining non-numeric columns;
# uuid is the index rather than a column, so nothing has to be excluded here
df_encoded = pd.get_dummies(data=df_encoded)
df_encoded.head(2)


In [None]:
# rename columns via col_mapper (imported from cleaning), then list the resulting names
df_encoded = df_encoded.rename(columns=col_mapper)
print(df_encoded.columns)

## Define testing and training data

### I. Splitting encoded data into training and test data

In [None]:
# column holding the label to predict
target_col = 'music_effects'

# features: every column except the target, as a NumPy array
X = df_encoded.drop(columns=[target_col]).values
# target: the label column on its own, as a NumPy array
y = df_encoded[target_col].values

### II. Imbalanced-learn

In [None]:
# number of rows per music_effects value, to check how balanced the target is
df_encoded.loc[:, 'music_effects'].value_counts()

The `music_effects` classes are quite skewed, so imbalanced-learn needs to be introduced to rebalance them.