# Project Phase 3: Data Preprocessing
In this stage, the dataset will be formatted so that it can be used by a machine learning model. That is, all data must be encoded numerically. One-hot encoding and ordinal encoding will be used to encode the non-numeric data. In this notebook, we will also carry out the feature selection phase.

## Loading the Cleaned Dataset
Now, let's start by loading our cleaned dataset into a `pandas.DataFrame` object.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the feature names stored in ATTRS_NUM.csv (presumably the numeric
# attributes -- TODO confirm). The CSV is read as a table, so its values are
# converted to a NumPy array and flattened into one dimension.
ATTRS_NUM = (
    pd.read_csv("dataset/constants/ATTRS_NUM.csv", index_col=0)
    .to_numpy()
    .flatten()
)

# Load the cleaned dataset, using the EmployeeNumber column as the row index.
DATASET = pd.read_csv(
    "dataset/cleaned/Dataset.csv",
    index_col="EmployeeNumber",
)

# Preview the first rows of the dataset.
DATASET.head()

In [None]:
# Define the feature matrix (every column except the last) and the target
# vector (the last column) to be fed to the machine learning models.
X = DATASET.iloc[:, :-1]
y = DATASET.iloc[:, -1]

# One-hot encode the nominal columns of the feature matrix, dropping the first
# level of each encoded column. dtype=int keeps the indicator columns numeric:
# recent pandas versions default to bool, and mixing bool with numeric columns
# makes `X.values` fall back to an object array further down the notebook.
X = pd.get_dummies(data=X, drop_first=True, dtype=int)

# Save the column names of the newly encoded dataset.
ATTRS_ENCODED = X.columns

In [None]:
# Preview the first five rows of the encoded feature matrix.
X.head(n=5)

In [None]:
# Preview the first five entries of the target vector.
y.head(n=5)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Convert the X and y pandas objects into NumPy arrays.
X = X.values
y = y.values

# Positional indices of the columns to be scaled: the first len(ATTRS_NUM)
# columns of X.
# NOTE(review): assumes the attributes listed in ATTRS_NUM are exactly the
# leading columns of the encoded feature matrix -- TODO confirm.
COLUMNS_TO_BE_SCALED = list(range(len(ATTRS_NUM)))

# Define the column transformer: apply a standard scaler to the columns in
# COLUMNS_TO_BE_SCALED and pass every other column through unchanged.
column_transformer = ColumnTransformer(
    [("Standard Scaler", StandardScaler(), COLUMNS_TO_BE_SCALED)],
    remainder="passthrough",
)

# Scale the whole feature set.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in this notebook, so test rows influence the scaling
# statistics -- consider fitting on the training split only.
X_scaled = column_transformer.fit_transform(X)

In [None]:
# Feature Selection
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

# Fit the classifier to all of the dataset. The fixed seed (the same value
# this notebook passes to SMOTE) makes the feature importances, and therefore
# the set of selected features, reproducible between runs.
classifier = ExtraTreesClassifier(n_estimators=50, random_state=22)
classifier.fit(X_scaled, y)

# Select only the important features. prefit=True reuses the classifier
# fitted above instead of fitting a new one.
model = SelectFromModel(classifier, prefit=True)
X_scaled_feature_selected = model.transform(X_scaled)

# Save the names of the selected features by applying the selector's boolean
# support mask to the encoded column names. This uses the public API rather
# than overwriting the selector's fitted attribute `feature_names_in_`.
ATTRS_SELECTED = ATTRS_ENCODED[model.get_support()].to_numpy()

# Preview the selected features.
pd.DataFrame({
    "Selected Features": ATTRS_SELECTED
})

In [None]:
# Balancing the Dataset using SMOTE
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed so the resampling is reproducible.
smote = SMOTE(random_state=22)

# Resample the feature-selected matrix together with the target vector.
# To resample without feature selection, pass X_scaled instead of
# X_scaled_feature_selected.
# NOTE(review): resampling happens before the train/test split in the next
# cell, so synthetic samples can end up in the test set -- consider
# resampling the training split only.
X_scaled_resampled, y_resampled = smote.fit_resample(
    X=X_scaled_feature_selected,
    y=y,
)

In [None]:
# Split the features and targets into a training set (80%) and a test set
# (20%). The fixed seed (the same value this notebook passes to SMOTE) makes
# the split, and therefore the exported CSV files, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_resampled,
    y_resampled,
    test_size=0.20,
    random_state=22,
)

In [None]:
# Export the training and testing sets, one CSV file per split. Features are
# wrapped in a DataFrame and targets in a Series before being written.
for csv_path, split in (
    ("dataset/preprocessed/Features_Training_Set.csv", pd.DataFrame(X_train)),
    ("dataset/preprocessed/Features_Testing_Set.csv", pd.DataFrame(X_test)),
    ("dataset/preprocessed/Target_Training_Set.csv", pd.Series(y_train)),
    ("dataset/preprocessed/Target_Testing_Set.csv", pd.Series(y_test)),
):
    split.to_csv(csv_path)

# Export the names of the selected features.
selected_feature_names = pd.Series(ATTRS_SELECTED)
selected_feature_names.to_csv("./dataset/constants/ATTRS_SELECTED.csv")