In [1]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import display, Markdown
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Suppress every warning for the rest of the session. NOTE(review): this is a
# blanket filter (no category/module given), so deprecation and numerical
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the FITB CSV files. The default is the original author's
# local checkout; set the FITB_DATA_DIR environment variable to run this
# notebook on another machine without editing the cell.
DATA_DIR = Path(os.environ.get(
    'FITB_DATA_DIR',
    '/Users/daivik/Desktop/Projects/FITBData-feature-engineering-practice/data',
))

# Raw train / test splits, loaded exactly as stored on disk.
train_df = pd.read_csv(DATA_DIR / 'FITB_train.csv')
test_df = pd.read_csv(DATA_DIR / 'FITB_test.csv')

In [3]:
# Basic EDA: dataset dimensions, column names, and the per-column
# dtype / non-null summary of the training set.
print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)
print("\nTraining set columns:", train_df.columns.tolist())
print("\nTraining set info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
train_df.info()


Training set shape: (3941, 7)
Test set shape: (1059, 7)

Training set columns: ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'id', 'date', 'y']

Training set info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3941 entries, 0 to 3940
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   feature_1  3941 non-null   float64
 1   feature_2  3854 non-null   float64
 2   feature_3  3805 non-null   float64
 3   feature_4  3941 non-null   float64
 4   id         3941 non-null   int64  
 5   date       3941 non-null   object 
 6   y          3941 non-null   object 
dtypes: float64(4), int64(1), object(2)
memory usage: 215.7+ KB
None


In [4]:
# Report the per-column count of missing values for each split
for header, frame in (
    ("\nMissing values in training set:", train_df),
    ("\nMissing values in test set:", test_df),
):
    print(header)
    print(frame.isna().sum())


Missing values in training set:
feature_1      0
feature_2     87
feature_3    136
feature_4      0
id             0
date           0
y              0
dtype: int64

Missing values in test set:
feature_1     0
feature_2    26
feature_3    27
feature_4     0
id            0
date          0
y             0
dtype: int64


In [5]:
# Parse the 'date' column of both splits into datetime values
for frame in (train_df, test_df):
    frame['date'] = pd.to_datetime(frame['date'])


### Winsorization


In [8]:
# Winsorization bounds come from the training set only, so the same caps can
# be reused unchanged on the test set.
bounds = train_df['feature_3'].quantile([0.01, 0.99])
p1 = bounds.iloc[0]   # 1st percentile -> lower cap
p99 = bounds.iloc[1]  # 99th percentile -> upper cap

print(f"1st percentile (lower bound): {p1:.2f}")
print(f"99th percentile (upper bound): {p99:.2f}")

1st percentile (lower bound): 54.78
99th percentile (upper bound): 2050.80


In [12]:
# Winsorize the training feature: clamp feature_3 into the [p1, p99] interval
train_df['feature_3_windsorized'] = train_df['feature_3'].clip(lower=p1, upper=p99)

# Count how many original values fell outside each bound (i.e. were capped);
# missing values compare False on both sides and are not counted
lower_capped = train_df['feature_3'].lt(p1).sum()
upper_capped = train_df['feature_3'].gt(p99).sum()

print(f"Values capped at lower bound: {lower_capped}")
print(f"Values capped at upper bound: {upper_capped}")

Values capped at lower bound: 39
Values capped at upper bound: 39


In [14]:
# Winsorize the test feature with the bounds computed on the training set
test_df['feature_3_windsorized'] = test_df['feature_3'].clip(lower=p1, upper=p99)

# Show the training-set range before and after capping
for header, column in (
    ("\nOriginal feature_3 range (train):", 'feature_3'),
    ("\nWinsorized feature_3_windsorized range (train):", 'feature_3_windsorized'),
):
    values = train_df[column]
    print(header)
    print(f"Min: {values.min():.2f}, Max: {values.max():.2f}")



Original feature_3 range (train):
Min: 45.04, Max: 2644.03

Winsorized feature_3_windsorized range (train):
Min: 54.78, Max: 2050.80


### Task 3 - Impute missing values

In [17]:
# Clipping leaves NaNs untouched, so count what still needs imputing per split
for label, frame in (
    ("Missing values in feature_3_winsor (train):", train_df),
    ("Missing values in feature_3_winsor (test):", test_df),
):
    print(label, frame['feature_3_windsorized'].isna().sum())

Missing values in feature_3_winsor (train): 136
Missing values in feature_3_winsor (test): 27


In [19]:
# The imputation value is the median of the winsorized training feature;
# it is computed on train only and reused as-is for the test set
median_feature3 = train_df['feature_3_windsorized'].median()
print(f"Median value for imputation: {median_feature3:.2f}")

# Fill the gaps in both splits with that single training-set median
for frame in (train_df, test_df):
    frame['feature_3_imputed'] = frame['feature_3_windsorized'].fillna(median_feature3)

# Confirm that no missing values remain after imputation
print("\nMissing values in feature_3_imputed (train):", train_df['feature_3_imputed'].isna().sum())
print("Missing values in feature_3_imputed (test):", test_df['feature_3_imputed'].isna().sum())


Median value for imputation: 139.07

Missing values in feature_3_imputed (train): 0
Missing values in feature_3_imputed (test): 0
