# **Preprocessing**
---
## Author: Emmanuel Paalam
## Methods used: MICE for imputation
## Cleaned dataset has 13119 rows and 212 columns.

## Import dataset

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount("/content/drive")

## Insert pathway to dataset below
cleaned_csv_path = "/content/drive/My Drive/research/SURP 2024/colab/data/cleaned/epaalam-cleaned_els_02_12_byf3pststu_v1_0.csv"

# Load the cleaned dataset into the frame every later cell works from.
df = pd.read_csv(cleaned_csv_path)

Mounted at /content/drive


### Split data

In [2]:
# Separate the target column (F3EVERDO) from the feature matrix.
target_name = 'F3EVERDO'
y = df[target_name]
# `columns=` already selects the axis, so no separate `axis` argument is needed.
X = df.drop(columns=[target_name])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `stratify=y` keeps the target's class
# proportions the same in both splits; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=71,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each piece as a quick sanity check.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

## Distinguish column types

### **Binary**
### *Do not transform binary columns*, already in useable format

In [None]:
# A column is treated as binary when it holds exactly two distinct values
# (nunique() ignores missing values by default).
binary_columns = [column for column in df.columns if df[column].nunique() == 2]

# Two-valued columns that must not be passed through as plain binary features:
#   F3EVERDO      - the prediction target
#   BYL01, F1SEX  - binary-nominal columns
#   BYL20         - binary-ordinal column
# list.remove raises ValueError if a name is not in the list, which flags a
# column that unexpectedly is not two-valued in this dataset.
for excluded_column in ('F3EVERDO', 'BYL01', 'F1SEX', 'BYL20'):
    binary_columns.remove(excluded_column)

binary_columns

### **Numerical (continuous)**
### *Scale* rather than transform

In [None]:
# Continuous (numerical) feature columns.
numerical_columns = [
    'BYA07', 'BYA09', 'BYA14E', 'BYA20', 'BYA22B',
    'BYA24A', 'BYL02', 'BYL22B', 'BYL23', 'BYL25',
]

# Further continuous columns, added after the initial group.
numerical_columns.extend(['BYTXMSTD', 'BYTXRSTD', 'BYHMWRK', 'F1DOB_P'])

### **Ordinal**

In [None]:
# Ordinal feature columns (order of this list fixes the order of ordinal_categories).
ordinal_columns = [
    'BY10FLP', 'F1WRKHRS', 'F1MOTHED', 'F1FATHED', 'F1STEXP', 'F1HIMATH', 'F1RHTUNP', 'F1RGPP2',
    'BYS28', 'BYS37', 'BYS54A', 'BYS54B', 'BYS54C',
    'F1S16A', 'F1S16B', 'F1S16C', 'F1S16F', 'F1S16H',
    'F1S30E', 'F1S30F', 'F1S30H', 'F1S30I',
    'F1S33', 'F1S34A', 'F1S35A', 'F1S36B',
    'F1S37A', 'F1S37D', 'F1S37E', 'F1S37F',
    'F1S39A', 'F1S39B', 'F1S39C', 'F1S39D', 'F1S39E', 'F1S39F', 'F1S39G', 'F1S39H', 'F1S39I',
    'F1S40A', 'F1S40B', 'F1S40C', 'F1S40D', 'F1S40E', 'F1S40F', 'F1S40G', 'F1S40H', 'F1S40I',
    'F1S40J', 'F1S40K', 'F1S40L', 'F1S40M', 'F1S40N', 'F1S40O', 'F1S40P', 'F1S40Q', 'F1S40R',
    'F1S65B', 'F1S65D',
    'BYFTTP', 'BYA08',
    'BYL16A', 'BYL16B', 'BYL16C', 'BYL16D',
    'BYL19', 'BYL24', 'BYL26', 'BYL30A',
    'BYF02', 'BYF09A', 'BYF09B', 'BYF09C', 'BYF09D', 'BYF09E',
    'F1S65A',
]

# Further ordinal columns, including BYL20 (two-valued, but treated as ordinal).
ordinal_columns.extend(['BYL20', 'BYINCOME', 'BYGRDRPT', 'BYXTRACU', 'BYG10EP'])

#### Record the full range of values of each ordinal column in ordinal_categories
##### (Each list is sorted because OrdinalEncoder rejects explicitly supplied numeric categories that are not in sorted order)

In [None]:
# For every ordinal column, collect its distinct non-missing values in ascending
# order. The lists are positionally aligned with ordinal_columns.
ordinal_categories = []
for ordinal_col in ordinal_columns:
    observed_values = df[ordinal_col].dropna().unique().tolist()
    ordinal_categories.append(sorted(observed_values))

### **Nominal**

In [None]:
# Nominal (unordered categorical) feature columns.
nominal_columns = [
    'F1RACE', 'F1HOMLNG', 'F1FCOMP', 'F1OCCUM',
    'F1OCCUF', 'F1CTLPTN', 'F1RTRCC', 'BYL34',
]

# Further nominal columns, including the two-valued BYL01 and F1SEX.
nominal_columns.extend(['BYREGURB', 'BYL01', 'F1SEX', 'BYSCHPRG'])

### **FOR TESTING PURPOSES**

In [None]:
# Working copy of every column name in df; the following cells cross names off
# this list as they are matched to a column-type group.
temp = list(df.columns)

In [None]:
# Cross every binary column off the list of not-yet-assigned columns and report
# any name that is not (or is no longer) in that list.
# Mutates `temp` in place, so re-running this cell reports every name as missing.
for column in binary_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [None]:
# Cross every numerical column off the list of not-yet-assigned columns and report
# any name that is not (or is no longer) in that list.
# Mutates `temp` in place, so re-running this cell reports every name as missing.
for column in numerical_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [None]:
# Cross every ordinal column off the list of not-yet-assigned columns and report
# any name that is not (or is no longer) in that list.
# Mutates `temp` in place, so re-running this cell reports every name as missing.
for column in ordinal_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [None]:
# Cross every nominal column off the list of not-yet-assigned columns and report
# any name that is not (or is no longer) in that list.
# Mutates `temp` in place, so re-running this cell reports every name as missing.
for column in nominal_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [None]:
# Show the column names that were not claimed by any of the four groups.
# The recorded output lists only 'F3EVERDO', the target column.
temp

['F3EVERDO']

## Encode data

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing pipelines for different column types

# Ordinal columns: map each value to its position in the matching sorted list in
# ordinal_categories. Values not present in that list are encoded as NaN rather
# than raising an error.
ordinal_pipeline = Pipeline(steps=[
    ('ordinal_encoder', OrdinalEncoder(
        categories=ordinal_categories,
        handle_unknown='use_encoded_value',
        unknown_value=np.nan,
    ))
])

# Nominal columns: one-hot encode into a dense array. Categories not seen during
# fit are ignored (all-zero indicator columns) at transform time.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (passing `sparse=` now raises TypeError).
nominal_pipeline = Pipeline(steps=[
    ('onehot_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

# Combined transformer: numerical and binary columns are forwarded unchanged,
# ordinal and nominal columns go through their encoders. Output columns follow
# the order of the transformers listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
        ('ord', ordinal_pipeline, ordinal_columns),
        ('nom', nominal_pipeline, nominal_columns),
        ('bin', 'passthrough', binary_columns)
    ]
)

In [None]:
# Fit the ColumnTransformer (and the encoders inside it) on the training split.
preprocessor.fit(X_train)



In [None]:
# Apply the fitted transformer to both splits.
# NOTE(review): in the cells visible here, the imputation and save steps read
# X_train / X_test rather than these encoded arrays -- confirm whether the
# encoded data was meant to be used downstream.
X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

## Use IterativeImputer

In [None]:
# IterativeImputer is experimental: the enabling import must run before the
# estimator itself can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np

# NOTE(review): the imputer is fit on the raw X_train, not on
# X_train_preprocessed -- confirm this is intended before relying on the
# encoding step above.

# Build the imputer with a fixed seed and learn the imputation models from the
# training split only (fit returns the fitted estimator itself).
imputer = IterativeImputer(max_iter=10, random_state=0).fit(X_train)

# Fill missing values in both splits using the models learned from X_train.
X_train_imputed, X_test_imputed = (
    imputer.transform(split) for split in (X_train, X_test)
)

In [None]:
# Confirm the imputed arrays have the expected (rows, columns) dimensions.
print(f"X_train shape: {X_train_imputed.shape}")
print(f"X_test shape: {X_test_imputed.shape}")

X_train shape: (10495, 211)
X_test shape: (2624, 211)


### Check for missing values

In [None]:
# For each dataset, build a per-row boolean mask that is True where the row
# still contains at least one missing value, then report whether ANY row is
# flagged.
#
# The mask itself is boolean and therefore never null, so calling .isnull() on
# it (as this cell did before) always yields False and hides real missing data.
# The correct reduction is .any() on the mask.

remaining_missing_mask = pd.DataFrame(X_train_imputed).isnull().any(axis=1)
print("Any remaining missing values after iterative imputation:", remaining_missing_mask.any())

remaining_missing_mask = pd.DataFrame(X_test_imputed).isnull().any(axis=1)
print("Any remaining missing values after iterative imputation:", remaining_missing_mask.any())

# The targets are not imputed; these two checks confirm they had no missing
# values to begin with.
remaining_missing_mask = pd.DataFrame(y_train).isnull().any(axis=1)
print("Any remaining missing values after iterative imputation:", remaining_missing_mask.any())

remaining_missing_mask = pd.DataFrame(y_test).isnull().any(axis=1)
print("Any remaining missing values after iterative imputation:", remaining_missing_mask.any())

Any remaining missing values after iterative imputation: False
Any remaining missing values after iterative imputation: False
Any remaining missing values after iterative imputation: False
Any remaining missing values after iterative imputation: False


## Save to Drive

In [None]:
# Wrap the imputed arrays in DataFrames, restoring the original feature names.
X_train_imputed_df = pd.DataFrame(data=X_train_imputed, columns=X_train.columns)
X_test_imputed_df = pd.DataFrame(data=X_test_imputed, columns=X_test.columns)

# Wrap the target splits as single-column frames named after the target.
y_train_df = pd.DataFrame(data=y_train, columns=['F3EVERDO'])
y_test_df = pd.DataFrame(data=y_test, columns=['F3EVERDO'])

In [None]:
# Write the four processed splits to Drive as CSV files without the index column.
# Any failure is reported by printing the exception; files written before the
# failure stay on disk.
try:
    export_plan = (
        (X_train_imputed_df, "/content/drive/My Drive/research/SURP 2024/colab/data/processed/X_train_MICEimputed.csv"),
        (X_test_imputed_df, "/content/drive/My Drive/research/SURP 2024/colab/data/processed/X_test_MICEimputed.csv"),
        (y_train_df, "/content/drive/My Drive/research/SURP 2024/colab/data/processed/y_MICEtrain.csv"),
        (y_test_df, "/content/drive/My Drive/research/SURP 2024/colab/data/processed/y_MICEtest.csv"),
    )
    for frame, destination in export_plan:
        frame.to_csv(destination, index=False)
except Exception as e:
    print("An error occurred:", e)