# **Preprocessing**
---
##Author: Emmanuel Paalam
##Methods used: SMOTE, Pipeline of transformers
##Cleaned dataset has 13119 rows and 212 columns.

##Import dataset and needed libraries

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive

# Mount Google Drive so files under "My Drive" are reachable from the Colab runtime.
drive.mount("/content/drive")

## Insert pathway to dataset below
dataset_path = "/content/drive/My Drive/research/SURP 2024/colab/data/cleaned/epaalam-cleaned_els_02_12_byf3pststu_v1_0.csv"

# Load the cleaned dataset into a DataFrame; every later cell works from `df`.
df = pd.read_csv(dataset_path)

Mounted at /content/drive


In [None]:
# Display the (rows, columns) dimensions of the loaded dataset as a sanity check.
df.shape

(13119, 212)

##Split data, handle imbalances

In [None]:
# Separate the prediction target from the feature matrix.
target_column = 'F3EVERDO'

y = df[target_column]                # target vector
X = df.drop(columns=target_column)   # every remaining column is a feature

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is stratified on the target
# and seeded so it is repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=71,
    stratify=y,
)

### Perform oversampling

In [None]:
from sklearn.impute import SimpleImputer

# Missing values are filled before oversampling (the next cell resamples the
# imputed frame). strategy='constant' is used without an explicit fill_value,
# so SimpleImputer's default constant applies.
imputer = SimpleImputer(strategy='constant')

# The imputer is fitted on the training split only and then re-used on the test
# split, so nothing is learned from the test data.
#
# Both the column names AND the original row index are re-attached. Without
# `index=` the new frames would get a fresh 0..n-1 index and would no longer
# line up label-for-label with y_train / y_test, which still carry the index
# produced by train_test_split.
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
# NOTE(review): sampling_strategy=0.5 is presumably the minority/majority ratio
# targeted after resampling -- confirm against the imbalanced-learn docs.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
resampled_features, y_train_balanced = smote.fit_resample(X_train_imputed, y_train)

# Convert back to DataFrame to keep column names
X_train_balanced = pd.DataFrame(resampled_features, columns=X_train.columns)

In [None]:
# Display the (rows, columns) dimensions of the training features after SMOTE.
X_train_balanced.shape

(14125, 211)

##Identify column type groups

### **Binary**
### *Do not transform binary columns*, already in useable format

In [None]:
# Candidate binary features: every column of X with exactly two distinct values.
binary_columns = [name for name in X.columns if X[name].nunique() == 2]

# Some two-valued columns are treated as nominal ('BYL01', 'F1SEX') or ordinal
# ('BYL20') instead, so they are taken out of the binary group here.
# list.remove raises ValueError if a name is absent, which surfaces a stale list early.
for reclassified in ('BYL01', 'F1SEX', 'BYL20'):
    binary_columns.remove(reclassified)

binary_columns

['F1S62',
 'F1N03',
 'BYA11',
 'BYL03A',
 'BYL03B',
 'BYL03C',
 'BYL03D',
 'BYL03E',
 'BYL03F',
 'BYL04',
 'BYL05',
 'BYL06DA',
 'BYL07',
 'BYL09',
 'BYL11AA',
 'BYL11BA',
 'BYL11CA',
 'BYL11DA',
 'BYL11EA',
 'BYL11FA',
 'BYL11GA',
 'BYL11HA',
 'BYL11IA',
 'BYL11JA',
 'BYL11LA',
 'BYL11MA',
 'BYL11NA',
 'BYL11OA',
 'BYL11PA',
 'BYL11QA',
 'BYL11RA',
 'BYL11SA',
 'BYL11TA',
 'BYL12A',
 'BYL12B',
 'BYL12C',
 'BYL12D',
 'BYL12E',
 'BYL12F',
 'BYL12G',
 'BYL12H',
 'BYL12I',
 'BYL12J',
 'BYL13',
 'BYL14A',
 'BYL14B',
 'BYL14C',
 'BYL14D',
 'BYL14E',
 'BYL15',
 'BYL17',
 'BYL27G',
 'BYL31A',
 'BYL31B',
 'BYL31C',
 'BYL31D',
 'BYL32',
 'BYF01A',
 'BYF01B',
 'BYF01C',
 'BYF01D',
 'BYF01E',
 'BYF01F',
 'BYF01G',
 'BYF01H',
 'BYF01I',
 'BYF01J',
 'BYF03A',
 'BYF03B',
 'BYF03C',
 'BYF03D',
 'BYF04A',
 'BYF04B',
 'BYF04C',
 'BYF04D',
 'BYF04E',
 'BYF04F',
 'BYF04G',
 'BYF05A',
 'BYF05B',
 'BYF05C',
 'BYF05D',
 'BYF05E',
 'BYF05F',
 'BYF05G',
 'BYF05H',
 'BYF06A',
 'BYF06B',
 'BYF06C',
 'BYF06D',
 

### **Numerical (continuous)**
### *Scale* rather than transform

In [None]:
# Continuous features; these are the columns routed to the scaler later on.
numerical_columns = [
    'BYA07', 'BYA09', 'BYA14E', 'BYA20', 'BYA22B',
    'BYA24A', 'BYL02', 'BYL22B', 'BYL23', 'BYL25',
]

# Additional continuous features, appended after the base list.
numerical_columns.extend(['BYTXMSTD', 'BYTXRSTD', 'BYHMWRK', 'F1DOB_P'])

### **Ordinal**

In [None]:
# Ordinal features (ordered categories). The order of this list matters: it must
# stay in step with the per-column category lists built from it in a later cell.
ordinal_columns = [
    'BY10FLP', 'F1WRKHRS', 'F1MOTHED', 'F1FATHED', 'F1STEXP', 'F1HIMATH', 'F1RHTUNP', 'F1RGPP2',
    'BYS28', 'BYS37', 'BYS54A', 'BYS54B', 'BYS54C',
    'F1S16A', 'F1S16B', 'F1S16C', 'F1S16F', 'F1S16H',
    'F1S30E', 'F1S30F', 'F1S30H', 'F1S30I',
    'F1S33', 'F1S34A', 'F1S35A', 'F1S36B',
    'F1S37A', 'F1S37D', 'F1S37E', 'F1S37F',
    'F1S39A', 'F1S39B', 'F1S39C', 'F1S39D', 'F1S39E', 'F1S39F', 'F1S39G', 'F1S39H', 'F1S39I',
    'F1S40A', 'F1S40B', 'F1S40C', 'F1S40D', 'F1S40E', 'F1S40F', 'F1S40G', 'F1S40H', 'F1S40I',
    'F1S40J', 'F1S40K', 'F1S40L', 'F1S40M', 'F1S40N', 'F1S40O', 'F1S40P', 'F1S40Q', 'F1S40R',
    'F1S65B', 'F1S65D',
    'BYFTTP', 'BYA08',
    'BYL16A', 'BYL16B', 'BYL16C', 'BYL16D',
    'BYL19', 'BYL24', 'BYL26', 'BYL30A',
    'BYF02', 'BYF09A', 'BYF09B', 'BYF09C', 'BYF09D', 'BYF09E',
    'F1S65A',
]

# Additional ordinal features, including 'BYL20', which has only two values but
# is handled as ordinal rather than binary.
ordinal_columns.extend(['BYL20', 'BYINCOME', 'BYGRDRPT', 'BYXTRACU', 'BYG10EP'])

#### Record all ranges of ordinal columns in ordinal_categories
##### (Sorting is required because OrdinalEncoder expects numeric categories in sorted order)

In [None]:
# Build one sorted list of observed values per ordinal column, in the same order
# as ordinal_columns, for use as the encoder's `categories` argument.
# NOTE(review): the values are read from X_train_balanced, i.e. after SMOTE.
# SMOTE synthesises rows by interpolation, so these lists presumably contain
# non-integer values as extra "categories" -- confirm this is intended.
ordinal_categories = []
for col in ordinal_columns:
    observed_values = X_train_balanced[col].dropna().unique().tolist()
    observed_values.sort()
    ordinal_categories.append(observed_values)

### **Nominal**

In [None]:
# Nominal (unordered categorical) features.
nominal_columns = [
    'F1RACE', 'F1HOMLNG', 'F1FCOMP', 'F1OCCUM',
    'F1OCCUF', 'F1CTLPTN', 'F1RTRCC', 'BYL34',
]

# Additional nominal features, including the two-valued 'BYL01' and 'F1SEX'
# that were taken out of the binary group.
nominal_columns.extend(['BYREGURB', 'BYL01', 'F1SEX', 'BYSCHPRG'])

#####**FOR TESTING PURPOSES**

In [None]:
# Working list of every feature name. The following cells remove names from it
# group by group, so whatever is left at the end was never assigned to a group.
temp = list(X.columns)

In [None]:
# Tick each binary column off the outstanding list. A name that is not (or no
# longer) in `temp` is reported instead, which catches typos and duplicates.
for column in binary_columns:
    if column in temp:
        temp.remove(column)
        continue
    print(f"{column} does not exist as a column name.")

In [None]:
# Tick each numerical column off the outstanding list. A name that is not (or no
# longer) in `temp` is reported instead, which catches typos and duplicates.
for column in numerical_columns:
    if column in temp:
        temp.remove(column)
        continue
    print(f"{column} does not exist as a column name.")

In [None]:
# Tick each ordinal column off the outstanding list. A name that is not (or no
# longer) in `temp` is reported instead, which catches typos and duplicates.
for column in ordinal_columns:
    if column in temp:
        temp.remove(column)
        continue
    print(f"{column} does not exist as a column name.")

In [None]:
# Tick each nominal column off the outstanding list. A name that is not (or no
# longer) in `temp` is reported instead, which catches typos and duplicates.
for column in nominal_columns:
    if column in temp:
        temp.remove(column)
        continue
    print(f"{column} does not exist as a column name.")

In [None]:
# Display the feature names not claimed by any group; an empty list means every
# column of X was assigned to the binary, numerical, ordinal or nominal group.
temp

[]

## Make transformers

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



*   Continuous columns: missing values were already filled with SimpleImputer before oversampling; here all values are scaled using StandardScaler
*   Ordinal: transform with OrdinalEncoder
*   Nominal: passed through unchanged for now (*OneHotEncoder not working as an option; may return to this*)
*   Binary: Do not transform, data already in Boolean format



In [None]:
# Continuous features: standardise only. copy=False lets the scaler work in
# place on the data it is handed rather than on a copy.
continuous_transformer = Pipeline(steps=[('scaler', StandardScaler(copy=False))])

# Ordinal features: encode each column against its explicit category list.
# Values not present in that list are encoded as NaN instead of raising.
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
ordinal_transformer = Pipeline(steps=[('ordinal', ordinal_encoder)])

# Nominal and binary features are forwarded unchanged.
nominal_transformer = 'passthrough'
binary_transformer = 'passthrough'

### Concatenate transformers into a single versatile transformer, to use on the whole training set

In [None]:
# One (name, transformer, columns) entry per feature group. The transformed
# matrix is laid out group by group in exactly this order, not in the original
# column order of X.
column_groups = [
    ('numerical', continuous_transformer, numerical_columns),
    ('ordinal', ordinal_transformer, ordinal_columns),
    ('binary', binary_transformer, binary_columns),
    ('nominal', nominal_transformer, nominal_columns),
]

preprocessor = ColumnTransformer(transformers=column_groups)

In [None]:
# Fit the combined transformer on the SMOTE-balanced training features only;
# the test features are transformed later with these fitted parameters.
preprocessor.fit(X_train_balanced)

## Transform/scale data using ColumnTransformer

In [None]:
# Apply the already-fitted preprocessor to both splits (train first, then test).
X_train_transformed, X_test_transformed = (
    preprocessor.transform(split) for split in (X_train_balanced, X_test_imputed)
)

# Report the resulting matrix dimensions as a sanity check.
print("Shape of X_train_transformed:", X_train_transformed.shape)
print("Shape of X_test_transformed:", X_test_transformed.shape)

Shape of X_train_transformed: (14125, 211)
Shape of X_test_transformed: (2624, 211)


## Save training and testing features and targets as NumPy files

In [None]:
# Directory on the mounted Drive that receives every output file.
output_dir = '/content/drive/My Drive/research/SURP 2024/colab/data/cleaned'

# Persist the transformed feature matrices and their matching target vectors.
np.save(f'{output_dir}/X_train_SMOTE.5.npy', X_train_transformed)
np.save(f'{output_dir}/X_test_SMOTE.5.npy', X_test_transformed)
np.save(f'{output_dir}/y_train_SMOTE.5.npy', y_train_balanced)
np.save(f'{output_dir}/y_test_SMOTE.5.npy', y_test)

# Save feature names as a separate file.
# The ColumnTransformer emits its output group by group in the order the
# transformers were listed (numerical, ordinal, binary, nominal), NOT in the
# original X.columns order. The names are written in that same output order so
# that line i of the file describes column i of the saved arrays.
feature_names = numerical_columns + ordinal_columns + binary_columns + nominal_columns

# Guard against the name list drifting out of step with the saved matrix.
assert len(feature_names) == X_train_transformed.shape[1], (
    "feature name count does not match the number of transformed columns"
)

with open(f'{output_dir}/feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))