# 3. Transformation
We will address splitting the data, as well as the main goal of Transformation: reshaping the data to fit our algorithms. Specifically, we focus on correct data types and scaling.

In [2]:
import sys, site, platform, pandas as pd
import os
from utils_io import load_step, save_step
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Show the working directory the notebook is running from
print(os.getcwd())

# Load the two preprocessed dataset variants saved by an earlier step:
# the regular one and the "no zero" one (presumably rows with zero values
# removed -- confirm against the preprocessing notebook).
df, df_no_zero = (
    load_step(step_name)
    for step_name in ("df_preprocessed", "df_no_zero_preprocessed")
)

/home/joes-data/DataMiningProject/datamining_group12


### Splitting
We first separate the target from the features, and then split each dataset into a training set and a test set.

In [3]:
# Column we want to predict
target = "popularity"

# Separate the predictors (X) from the target (y) for both dataset variants
X, y = df.drop(columns=target), df[target]
X_no_zero, y_no_zero = df_no_zero.drop(columns=target), df_no_zero[target]

# 80/20 train/test split of the regular data; the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Same split settings for the variant without zeros (suffix "nz" = no zeros)
X_train_nz, X_test_nz, y_train_nz, y_test_nz = train_test_split(
    X_no_zero,
    y_no_zero,
    test_size=0.2,
    random_state=42,
)

### Scaling
Some of our numerical values are way larger than others. We solve this issue by scaling our data. As mentioned, we will use StandardScaler. It is important to do this only AFTER splitting our data: the scaler is fitted on the training set alone and then applied to the test set, so that no information from the test set leaks into training. We choose to scale the "duration_ms", "tempo" and "loudness" features.

In [4]:
# Numeric columns we want to scale
cols_to_scale = ['duration_ms', 'tempo', 'loudness']


def scale_split(train, test, cols):
    """Standardize `cols` of a train/test pair without leaking test data.

    A fresh StandardScaler is fitted on the training frame only and then
    applied to both frames. The inputs are not modified; scaled copies are
    returned instead.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames that both contain every column listed in `cols`.
    cols : list of str
        Names of the columns to standardize.

    Returns
    -------
    (train_scaled, test_scaled, fitted_scaler)
        Copies of the inputs with `cols` replaced by their standardized
        values, plus the scaler that was fitted on `train[cols]`.
    """
    fitted_scaler = StandardScaler()
    train_scaled = train.copy()
    test_scaled = test.copy()

    # Learn the scaling statistics from the training data only ...
    train_scaled[cols] = fitted_scaler.fit_transform(train[cols])
    # ... and reuse them unchanged on the test data.
    test_scaled[cols] = fitted_scaler.transform(test[cols])
    return train_scaled, test_scaled, fitted_scaler


# Each dataset variant gets its own scaler. Re-fitting one shared instance
# would overwrite the statistics of the first fit, leaving no scaler that
# matches the regular (with zeros) data.
X_train_scaled, X_test_scaled, scaler = scale_split(X_train, X_test, cols_to_scale)
X_train_nz_scaled, X_test_nz_scaled, scaler_nz = scale_split(
    X_train_nz, X_test_nz, cols_to_scale
)

## Save Transformation

In [5]:
# Everything produced by this step, in the order it is written to disk.
# Each entry is (object to save, name passed to save_step).
outputs_to_save = [
    # normal (with zeros)
    (X_train, "X_train"),
    (X_test, "X_test"),
    (y_train, "y_train"),
    (y_test, "y_test"),
    # without zeros
    (X_train_nz, "X_train_nz"),
    (X_test_nz, "X_test_nz"),
    (y_train_nz, "y_train_nz"),
    (y_test_nz, "y_test_nz"),
    # scaled normal (with zeros)
    (X_train_scaled, "X_train_scaled"),
    (X_test_scaled, "X_test_scaled"),
    # scaled without zeros
    (X_train_nz_scaled, "X_train_nz_scaled"),
    (X_test_nz_scaled, "X_test_nz_scaled"),
]

for step_data, step_name in outputs_to_save:
    save_step(step_data, step_name)

Saved X_train.csv
Saved X_test.csv
Saved y_train.csv
Saved y_test.csv
Saved X_train_nz.csv
Saved X_test_nz.csv
Saved y_train_nz.csv
Saved y_test_nz.csv
Saved X_train_scaled.csv
Saved X_test_scaled.csv
Saved X_train_nz_scaled.csv
Saved X_test_nz_scaled.csv
