# **Preprocessing**
---
## Author: Emmanuel Paalam
## Methods used: SimpleImputer for imputation, StandardScaler for scaling continuous columns
## Cleaned dataset has 13119 rows and 212 columns.

## Import dataset

In [111]:
import pandas as pd
from google.colab import drive

# Make Google Drive visible to the Colab runtime before reading from it.
drive.mount("/content/drive")
## Insert pathway to dataset below
dataset_path = "/content/drive/My Drive/research/SURP 2024/colab/data/cleaned/epaalam-cleaned_els_02_12_byf3pststu_v1_0.csv"
df = pd.read_csv(dataset_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Split data

In [112]:
# Features are every column except the F3EVERDO target; the target is kept as its own Series.
X = df.drop(columns=['F3EVERDO'])
y = df['F3EVERDO']

## Distinguish column types

### **Binary**
### *Do not transform binary columns*; they are already in a usable format
### F3EVERDO (the target) is left out because only the feature columns in X are scanned

In [113]:
# Candidate binary features: feature columns with exactly two distinct values
# (nunique() skips missing values by default).
binary_columns = [name for name in X.columns if df[name].nunique() == 2]

# Two-valued columns that belong to the nominal group (BYL01, F1SEX) or the
# ordinal group (BYL20) are taken out of the binary list.
# list.remove raises ValueError if one of them is not present.
for reassigned in ('BYL01', 'F1SEX', 'BYL20'):
    binary_columns.remove(reassigned)

binary_columns

['F1S62',
 'F1N03',
 'BYA11',
 'BYL03A',
 'BYL03B',
 'BYL03C',
 'BYL03D',
 'BYL03E',
 'BYL03F',
 'BYL04',
 'BYL05',
 'BYL06DA',
 'BYL07',
 'BYL09',
 'BYL11AA',
 'BYL11BA',
 'BYL11CA',
 'BYL11DA',
 'BYL11EA',
 'BYL11FA',
 'BYL11GA',
 'BYL11HA',
 'BYL11IA',
 'BYL11JA',
 'BYL11LA',
 'BYL11MA',
 'BYL11NA',
 'BYL11OA',
 'BYL11PA',
 'BYL11QA',
 'BYL11RA',
 'BYL11SA',
 'BYL11TA',
 'BYL12A',
 'BYL12B',
 'BYL12C',
 'BYL12D',
 'BYL12E',
 'BYL12F',
 'BYL12G',
 'BYL12H',
 'BYL12I',
 'BYL12J',
 'BYL13',
 'BYL14A',
 'BYL14B',
 'BYL14C',
 'BYL14D',
 'BYL14E',
 'BYL15',
 'BYL17',
 'BYL27G',
 'BYL31A',
 'BYL31B',
 'BYL31C',
 'BYL31D',
 'BYL32',
 'BYF01A',
 'BYF01B',
 'BYF01C',
 'BYF01D',
 'BYF01E',
 'BYF01F',
 'BYF01G',
 'BYF01H',
 'BYF01I',
 'BYF01J',
 'BYF03A',
 'BYF03B',
 'BYF03C',
 'BYF03D',
 'BYF04A',
 'BYF04B',
 'BYF04C',
 'BYF04D',
 'BYF04E',
 'BYF04F',
 'BYF04G',
 'BYF05A',
 'BYF05B',
 'BYF05C',
 'BYF05D',
 'BYF05E',
 'BYF05F',
 'BYF05G',
 'BYF05H',
 'BYF06A',
 'BYF06B',
 'BYF06C',
 'BYF06D',
 

### **Numerical (continuous)**
### *Scale* rather than transform

In [114]:
# Continuous features, listed in the order they are handed to the preprocessor.
numerical_columns = [
    'BYA07', 'BYA09', 'BYA14E', 'BYA20', 'BYA22B', 'BYA24A',
    'BYL02', 'BYL22B', 'BYL23', 'BYL25',
    'BYTXMSTD', 'BYTXRSTD', 'BYHMWRK', 'F1DOB_P',
]

### **Ordinal**

In [115]:
# Ordinal features, grouped by variable prefix for readability (order unchanged).
ordinal_columns = [
    'BY10FLP', 'F1WRKHRS', 'F1MOTHED', 'F1FATHED', 'F1STEXP', 'F1HIMATH', 'F1RHTUNP', 'F1RGPP2',
    'BYS28', 'BYS37', 'BYS54A', 'BYS54B', 'BYS54C',
    'F1S16A', 'F1S16B', 'F1S16C', 'F1S16F', 'F1S16H',
    'F1S30E', 'F1S30F', 'F1S30H', 'F1S30I',
    'F1S33', 'F1S34A', 'F1S35A', 'F1S36B',
    'F1S37A', 'F1S37D', 'F1S37E', 'F1S37F',
    'F1S39A', 'F1S39B', 'F1S39C', 'F1S39D', 'F1S39E', 'F1S39F', 'F1S39G', 'F1S39H', 'F1S39I',
    'F1S40A', 'F1S40B', 'F1S40C', 'F1S40D', 'F1S40E', 'F1S40F', 'F1S40G', 'F1S40H', 'F1S40I',
    'F1S40J', 'F1S40K', 'F1S40L', 'F1S40M', 'F1S40N', 'F1S40O', 'F1S40P', 'F1S40Q', 'F1S40R',
    'F1S65B', 'F1S65D',
    'BYFTTP', 'BYA08',
    'BYL16A', 'BYL16B', 'BYL16C', 'BYL16D',
    'BYL19', 'BYL24', 'BYL26', 'BYL30A',
    'BYF02', 'BYF09A', 'BYF09B', 'BYF09C', 'BYF09D', 'BYF09E',
    'F1S65A',
]

# Later additions, appended at the end; BYL20 is the two-valued column that is
# treated as ordinal rather than binary.
ordinal_columns.extend(['BYL20', 'BYINCOME', 'BYGRDRPT', 'BYXTRACU', 'BYG10EP'])

#### Record all ranges of ordinal columns in ordinal_categories
##### (Sorting required to use ColumnTransformer)

In [116]:
# For each ordinal column, collect its distinct non-missing values in ascending order.
# NOTE(review): no later cell visible here reads ordinal_categories — confirm it is still needed.
ordinal_categories = []
for col in ordinal_columns:
    observed_values = df[col].dropna().unique().tolist()
    observed_values.sort()
    ordinal_categories.append(observed_values)

### **Nominal**

In [117]:
# Unordered categorical features; BYL01 and F1SEX are two-valued but treated as nominal.
nominal_columns = [
    'F1RACE', 'F1HOMLNG', 'F1FCOMP', 'F1OCCUM', 'F1OCCUF', 'F1CTLPTN', 'F1RTRCC', 'BYL34',
    'BYREGURB', 'BYL01', 'F1SEX', 'BYSCHPRG',
]

### **FOR TESTING PURPOSES**: check that every column belongs to one of the column-type lists

In [118]:
# Working copy of every column name in df; the checks below remove names from it
# as each column-type list claims them.
temp = list(df.columns)

In [119]:
# Tick off each binary column from temp; report any name that is not (or no longer) in it.
for column in binary_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [120]:
# Tick off each numerical column from temp; report any name that is not (or no longer) in it.
for column in numerical_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [121]:
# Tick off each ordinal column from temp; report any name that is not (or no longer) in it.
for column in ordinal_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [122]:
# Tick off each nominal column from temp; report any name that is not (or no longer) in it.
for column in nominal_columns:
    if column in temp:
        temp.remove(column)
    else:
        print(f"{column} does not exist as a column name.")

In [123]:
# Names still in temp were not claimed by any of the four column-type lists.
temp

['F3EVERDO']

## Create and use imputers

In [124]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [125]:
# Create transformers for each type of data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

binary_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

In [126]:
# Route each column group to its pipeline. The order of these entries
# (num, bin, ord, nom) is the order later used to rebuild the column labels.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_columns),
    ('bin', binary_transformer, binary_columns),
    ('ord', ordinal_transformer, ordinal_columns),
    ('nom', nominal_transformer, nominal_columns),
])

In [127]:
# Fit the imputers/scaler on X and transform it in a single pass.
# NOTE(review): the statistics are learned from every row of X; if a train/test
# split is made downstream, confirm that fitting on the full dataset is intended.
X_processed = preprocessor.fit_transform(X)

In [128]:
# Column labels for the transformed array: the column lists of the first four
# fitted transformers (num, bin, ord, nom), concatenated in that order.
all_columns = [
    label
    for position in range(4)
    for label in preprocessor.transformers_[position][2]
]

### Rebuild a labelled DataFrame from the transformed data (the fitted preprocessor is not persisted by the cells below)

In [129]:
# Wrap the transformed array in a DataFrame labelled with the feature names, then
# attach the target. y's index is reset so it lines up row-by-row with the new frame.
transformed_df = pd.DataFrame(X_processed, columns=all_columns).assign(
    F3EVERDO=y.reset_index(drop=True)
)

## Final check

In [130]:
# Report every column that still contains missing values after preprocessing;
# nothing is printed when the frame is fully populated.
for column in transformed_df.columns:
    n_missing = transformed_df[column].isnull().sum()
    if n_missing > 0:
        print(f"{column} has {n_missing} missing values.")

In [131]:
# (rows, columns) of the processed frame; the column count includes the F3EVERDO target.
transformed_df.shape

(13119, 212)

## Save to Drive

In [133]:
# Write the processed dataset to Drive as CSV (without the index column).
# A failure is reported and then re-raised, so the cell is marked as failed
# instead of finishing quietly without a saved file.
try:
  transformed_df.to_csv('/content/drive/My Drive/research/SURP 2024/colab/data/processed/epaalam-SIMPLEprocessed_els_02_12_byf3pststu_v1_0.csv', index=False)
except Exception as e:
  print("An error occurred:", e)
  raise
else:
  # Only reached when to_csv completed without raising.
  print("DataFrame saved")

DataFrame saved
