# 3. Transformation
We will address splitting the data, but also the main goal of Transformation: reshaping the data to fit our algorithms. Specifically, we should focus on correct data types and scaling.

In [1]:
import sys, site, platform, pandas as pd
import os
from utils_io import load_step, save_step
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Load the dataset
# Print the working directory first; presumably load_step resolves the step
# name relative to it -- TODO confirm against utils_io.
print(os.getcwd())
# Result of the previous pipeline stage, identified by its step name.
df = load_step("step2_preprocessing")

/home/joes-data/DataMiningProject/datamining_group12


### Boolean Values
Since "explicit" is boolean, we need to convert it to an integer

In [2]:
# Cast the boolean "explicit" flag to integers (False -> 0, True -> 1).
df = df.assign(explicit=df["explicit"].astype(int))

# To inspect the resulting dtypes, run `df.info()` in a separate cell.

### Object Attributes
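We are now ready to get rid of the "track_id" and "track_name" features. The original "key" and "time_signature" features, which we converted using One-Hot Encoding, are no longer part of the dataset at this point (only their encoded "key_*" and "time_signature_*" columns remain), so they do not need to be dropped here.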

In [3]:
# Drop the two identifier columns. Only track_id and track_name are removed
# here; the raw key / time_signature columns are not touched by this cell.
df = df.drop(['track_id', 'track_name'], axis="columns")

### Splitting
We are splitting our Target and our Features

In [4]:
# Name of the column we want to predict
target = "popularity"

# y holds the target column, X every remaining column
y = df[target]
X = df.drop(columns=target)

# Hold out 20% of the rows as the test set; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Scaling
Some of our numerical values are way larger than others. We should solve this issue by scaling our data. As mentioned, we will use StandardScaler. It is important to do this AFTER splitting our data, fitting the scaler on the training set only, so as to avoid data leakage. We choose to scale the "duration_ms", "tempo" and "loudness" features.

In [5]:
# Columns to standardise
cols_to_scale = ['duration_ms', 'tempo', 'loudness']

# Learn mean / standard deviation from the training rows only
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])

# Work on copies so the unscaled X_train / X_test stay available
X_train_scaled = X_train.copy()
X_train_scaled[cols_to_scale] = scaler.transform(X_train[cols_to_scale])

# The test rows are transformed with the statistics learned on the training rows
X_test_scaled = X_test.copy()
X_test_scaled[cols_to_scale] = scaler.transform(X_test[cols_to_scale])


## Save Transformation

In [6]:
# Persist every artefact of this step under its step name, unscaled splits
# first, then the scaled feature matrices.
for step_name, step_data in [
    ("step3_X_train", X_train),
    ("step3_X_test", X_test),
    ("step3_y_train", y_train),
    ("step3_y_test", y_test),
    ("step3_X_train_scaled", X_train_scaled),
    ("step3_X_test_scaled", X_test_scaled),
]:
    save_step(step_data, step_name)

Saved step3_X_train.csv
Saved step3_X_test.csv
Saved step3_y_train.csv
Saved step3_y_test.csv
Saved step3_X_train_scaled.csv
Saved step3_X_test_scaled.csv


## Drop Values with Popularity Score = 0

In [8]:
# Look at the training target before filtering out zero-popularity rows.
print(y_train)

7920     71
82251    63
68729    32
75656    64
24803    18
         ..
6265     40
54886    27
76820    27
860      44
15795    25
Name: popularity, Length: 70612, dtype: int64


In [None]:
# Re-attach the target column to the features of each split (train and test
# stay separate) so that rows can be filtered on their popularity value.
zero_train = pd.concat([X_train, y_train], axis=1)
zero_test = pd.concat([X_test, y_test], axis=1)

# Keep only the rows whose popularity score is not 0.
# A boolean mask filters row by row, so it stays correct even if index
# labels are not unique (a label-based drop would remove every row sharing
# a label with a zero-popularity row).
zero_train = zero_train[zero_train['popularity'] != 0]
zero_test = zero_test[zero_test['popularity'] != 0]

# Checking that 0s are removed. A bare expression in the middle of a cell is
# neither displayed nor evaluated as a check, so assert explicitly.
assert (zero_train['popularity'] != 0).all(), "zero-popularity rows left in train"
assert (zero_test['popularity'] != 0).all(), "zero-popularity rows left in test"

# Re-splitting the popularity feature from the remaining columns
zero_y_train = zero_train[['popularity']]  # Double [] to make it a dataframe
zero_X_train = zero_train.drop(columns=['popularity'])

zero_y_test = zero_test[['popularity']]  # Double [] to make it a dataframe
zero_X_test = zero_test.drop(columns=['popularity'])

In [13]:
# Sanity check: list the columns of the filtered training frame; "popularity"
# should still be present because zero_train keeps features and target together.
print(zero_train.columns)

Index(['duration_ms', 'explicit', 'danceability', 'energy', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'genre__Classical_Opera',
       'genre__Country_Americana', 'genre__Electronic_Dance',
       'genre__Folk_Acoustic_Singer-Songwriter', 'genre__Hip-Hop_Rap',
       'genre__Jazz_Blues', 'genre__Latin', 'genre__Metal',
       'genre__Mood_Functional_Other', 'genre__Pop', 'genre__R&B_Soul_Funk',
       'genre__Reggae_Ska_Dub', 'genre__Rock', 'genre__Soundtrack_Showtunes',
       'genre__World_International', 'amount_genres', 'key_0', 'key_1',
       'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9',
       'key_10', 'key_11', 'time_signature_1', 'time_signature_3',
       'time_signature_4', 'time_signature_5', 'length', 'word_count',
       'sentiment_neutral', 'sentiment_positive', 'popularity'],
      dtype='object')


### Scaling

In [14]:
# Initiate a fresh scaler so its statistics come from the filtered training
# rows only (this rebinds `scaler`, replacing the one fitted on the full split).
scaler = StandardScaler()

# Choosing Numeric Columns we want to Scale
cols_to_scale = ['duration_ms', 'tempo', 'loudness']

# Start from the zero-filtered features, NOT from X_train / X_test: copying
# the unfiltered splits would leave the popularity == 0 rows in the scaled
# data, which would then no longer line up with zero_y_train / zero_y_test.
zero_X_train_scaled = zero_X_train.copy()
zero_X_test_scaled = zero_X_test.copy()

# Scale: fit on the filtered training rows, then apply the same statistics
# to the filtered test rows.
zero_X_train_scaled[cols_to_scale] = scaler.fit_transform(zero_X_train[cols_to_scale])
zero_X_test_scaled[cols_to_scale] = scaler.transform(zero_X_test[cols_to_scale])

# Features and targets must describe exactly the same rows.
assert zero_X_train_scaled.index.equals(zero_y_train.index), "train features/target misaligned"
assert zero_X_test_scaled.index.equals(zero_y_test.index), "test features/target misaligned"

In [15]:
# Persist the zero-filtered targets and scaled features under their step names.
for step_name, step_data in [
    ("zero_y_train", zero_y_train),
    ("zero_X_train_scaled", zero_X_train_scaled),
    ("zero_y_test", zero_y_test),
    ("zero_X_test_scaled", zero_X_test_scaled),
]:
    save_step(step_data, step_name)

Saved zero_y_train.csv
Saved zero_X_train_scaled.csv
Saved zero_y_test.csv
Saved zero_X_test_scaled.csv
