In [2]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer

# Loading the feature-engineered dataset
df = pd.read_csv("../data/final_dataset_after_featureeng2.csv")

# Display the first few rows of the dataset
df.head()


Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_category,Gender_encoded,Occupation_Family_History_Interaction
0,31,Male,Former,,,,27.56,Lalitpur,84,,0,adult,overweight,1,
1,60,Male,Never,,,,30.3,Pokhara,131,,0,old,obese,1,
2,33,Male,Former,,,,28.45,Pokhara,123,,0,adult,overweight,1,
3,36,Female,Current,,,,27.49,Kathmandu,253,,1,adult,overweight,0,
4,58,Male,Never,,,,25.49,Pokhara,117,,0,middle_aged,overweight,1,


In [3]:
df.columns

Index(['Age', 'Gender', 'Smoking_Status', 'Biomass_Fuel_Exposure',
       'Occupational_Exposure', 'Family_History_COPD', 'BMI', 'Location',
       'Air_Pollution_Level', 'Respiratory_Infections_Childhood',
       'COPD_Diagnosis', 'Age_Category', 'BMI_category', 'Gender_encoded',
       'Occupation_Family_History_Interaction'],
      dtype='object')

In [4]:
# Display the first few rows of the dataset and column names for verification
print("DataFrame Columns:", df.columns.tolist())
df.head()

DataFrame Columns: ['Age', 'Gender', 'Smoking_Status', 'Biomass_Fuel_Exposure', 'Occupational_Exposure', 'Family_History_COPD', 'BMI', 'Location', 'Air_Pollution_Level', 'Respiratory_Infections_Childhood', 'COPD_Diagnosis', 'Age_Category', 'BMI_category', 'Gender_encoded', 'Occupation_Family_History_Interaction']


Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_category,Gender_encoded,Occupation_Family_History_Interaction
0,31,Male,Former,,,,27.56,Lalitpur,84,,0,adult,overweight,1,
1,60,Male,Never,,,,30.3,Pokhara,131,,0,old,obese,1,
2,33,Male,Former,,,,28.45,Pokhara,123,,0,adult,overweight,1,
3,36,Female,Current,,,,27.49,Kathmandu,253,,1,adult,overweight,0,
4,58,Male,Never,,,,25.49,Pokhara,117,,0,middle_aged,overweight,1,


Data Preprocessing Pipeline Explanation

Feature Selection: Dropping features that are not needed for the modeling or redundant based on the feature engineering decisions made earlier.

In [5]:
# Dropping unnecessary or redundant features
df = df.drop(columns=['Age', 'Gender', 'Air_Pollution_Level', 'BMI'], errors='ignore')

In [6]:
print("Available Columns after dropping:", df.columns.tolist())

Available Columns after dropping: ['Smoking_Status', 'Biomass_Fuel_Exposure', 'Occupational_Exposure', 'Family_History_COPD', 'Location', 'Respiratory_Infections_Childhood', 'COPD_Diagnosis', 'Age_Category', 'BMI_category', 'Gender_encoded', 'Occupation_Family_History_Interaction']


Defining Column Types: Separate columns based on their types (numerical, categorical) for appropriate preprocessing.

In [7]:
# Identifying the feature types
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()


In [8]:
# Display identified features for verification
print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)


Numerical Features: ['Biomass_Fuel_Exposure', 'Occupational_Exposure', 'Family_History_COPD', 'Respiratory_Infections_Childhood', 'COPD_Diagnosis', 'Gender_encoded', 'Occupation_Family_History_Interaction']
Categorical Features: ['Smoking_Status', 'Location', 'Age_Category', 'BMI_category']


In [9]:
numerical_features.remove('COPD_Diagnosis')

In [10]:
numerical_features

['Biomass_Fuel_Exposure',
 'Occupational_Exposure',
 'Family_History_COPD',
 'Respiratory_Infections_Childhood',
 'Gender_encoded',
 'Occupation_Family_History_Interaction']

Creating Preprocessing Pipelines:

1. Numerical Pipeline: Handles missing values and scaling.
2. Categorical Pipeline: Encodes categorical variables using OneHotEncoding or Ordinal Encoding.

In [11]:
# Numerical Pipeline: Scaling numerical features and handling missing values
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Imputes missing numerical values with the mean
    ('scaler', MinMaxScaler())                   # Scales features to a range
])

# Categorical Pipeline: Encoding categorical features
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Imputes missing categorical values with the most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encodes categorical variables
])


Combining the Pipelines:

Uses ColumnTransformer to apply the respective pipelines to numerical and categorical features.

In [12]:
# Combining the numerical and categorical pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])


Creating the Full Pipeline:

The preprocessing steps are integrated into the final modeling pipeline, allowing for streamlined training.

In [13]:
# Full Pipeline: Combining the preprocessor with a feature selector and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=chi2, k=10)),  # Selecting the best 10 features based on chi-squared test
])

In [14]:
df.columns

Index(['Smoking_Status', 'Biomass_Fuel_Exposure', 'Occupational_Exposure',
       'Family_History_COPD', 'Location', 'Respiratory_Infections_Childhood',
       'COPD_Diagnosis', 'Age_Category', 'BMI_category', 'Gender_encoded',
       'Occupation_Family_History_Interaction'],
      dtype='object')

In [15]:
# Splitting the data into training and testing sets
X = df.drop('COPD_Diagnosis', axis=1, errors='ignore')
y = df['COPD_Diagnosis'] if 'COPD_Diagnosis' in df.columns else None               

In [16]:
# Ensuring that target column is correctly identified
if y is None:
    raise ValueError("Target column not found in the dataframe. Please specify the correct target column.")

In [17]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Fitting the pipeline on the training data
pipeline.fit(X_train, y_train)


 'Respiratory_Infections_Childhood'
 'Occupation_Family_History_Interaction']. At least one non-missing value is needed for imputation with strategy='mean'.


In [19]:
# import pickle
# pickle.dump(pipeline,open('../models/pipeline2.pkl','wb'))

In [20]:
# df.to_csv('../models/after_pipelining2.csv', index=False)