<a href="https://colab.research.google.com/github/drstannwoji2019/ML_Projects/blob/main/Data_Preprocessing_ANLY530.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2

# Sample dataset
data = {
    'Age': [25, 30, 35, np.nan, 40, 50, np.nan, 60, 65, 70],
    'Income': [50000, 60000, np.nan, 80000, 90000, 100000, 110000, np.nan, 130000, 140000],
    'Education': ['Bachelors', 'Masters', 'PhD', 'Masters', 'Bachelors', 'PhD', 'Masters', 'Bachelors', 'PhD', 'Masters'],
    'Purchased': [0, 1, 0, 1, 1, 0, 1, 0, 1, 0]
}

df = pd.DataFrame(data)
print("Original Data:\n", df)


Original Data:
     Age    Income  Education  Purchased
0  25.0   50000.0  Bachelors          0
1  30.0   60000.0    Masters          1
2  35.0       NaN        PhD          0
3   NaN   80000.0    Masters          1
4  40.0   90000.0  Bachelors          1
5  50.0  100000.0        PhD          0
6   NaN  110000.0    Masters          1
7  60.0       NaN  Bachelors          0
8  65.0  130000.0        PhD          1
9  70.0  140000.0    Masters          0


In [8]:
# Step 2: Handling Missing Values
# Impute missing values with the median for numerical columns
imputer = SimpleImputer(strategy='median')
df[['Age', 'Income']] = imputer.fit_transform(df[['Age', 'Income']])

print("Data after Imputation:\n", df)


Data after Imputation:
     Age    Income  Education  Purchased
0  25.0   50000.0  Bachelors          0
1  30.0   60000.0    Masters          1
2  35.0   95000.0        PhD          0
3  45.0   80000.0    Masters          1
4  40.0   90000.0  Bachelors          1
5  50.0  100000.0        PhD          0
6  45.0  110000.0    Masters          1
7  60.0   95000.0  Bachelors          0
8  65.0  130000.0        PhD          1
9  70.0  140000.0    Masters          0


In [9]:
# Encoding Categorical Values
# One-hot encoding for categorical features
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_education = encoder.fit_transform(df[['Education']])
encoded_education_df = pd.DataFrame(encoded_education, columns=encoder.get_feature_names_out(['Education']))

# Merge with original dataframe and drop the original categorical column
df = df.drop(['Education'], axis=1)
df = pd.concat([df, encoded_education_df], axis=1)

print("Data after Encoding:\n", df)



Data after Encoding:
     Age    Income  Purchased  Education_Masters  Education_PhD
0  25.0   50000.0          0                0.0            0.0
1  30.0   60000.0          1                1.0            0.0
2  35.0   95000.0          0                0.0            1.0
3  45.0   80000.0          1                1.0            0.0
4  40.0   90000.0          1                0.0            0.0
5  50.0  100000.0          0                0.0            1.0
6  45.0  110000.0          1                1.0            0.0
7  60.0   95000.0          0                0.0            0.0
8  65.0  130000.0          1                0.0            1.0
9  70.0  140000.0          0                1.0            0.0


In [10]:
# Step 4: Feature Scaling
scaler = StandardScaler()
df[['Age', 'Income']] = scaler.fit_transform(df[['Age', 'Income']])

print("Data after Scaling:\n", df)


Data after Scaling:
         Age    Income  Purchased  Education_Masters  Education_PhD
0 -1.519330 -1.700840          0                0.0            0.0
1 -1.165998 -1.322876          1                1.0            0.0
2 -0.812665  0.000000          0                0.0            1.0
3 -0.106000 -0.566947          1                1.0            0.0
4 -0.459332 -0.188982          1                0.0            0.0
5  0.247333  0.188982          0                0.0            1.0
6 -0.106000  0.566947          1                1.0            0.0
7  0.953998  0.000000          0                0.0            0.0
8  1.307331  1.322876          1                0.0            1.0
9  1.660663  1.700840          0                1.0            0.0


In [11]:
# Step 5: Feature Selection Using Chi-Square Test
X = df.drop('Purchased', axis=1)
y = df['Purchased']

# Apply SelectKBest with chi-square test
# Before applying SelectKBest, ensure all values in X are non-negative
X = X.abs() # take the absolute value of the features to ensure they are non-negative.
selector = SelectKBest(score_func=chi2, k=2)  # Select top 2 features
X_selected = selector.fit_transform(X, y)

print("Selected Features:\n", X.columns[selector.get_support()])

Selected Features:
 Index(['Age', 'Education_Masters'], dtype='object')
