In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest,chi2


In [3]:
# Load the dataset (the file name indicates it is the output of the
# feature-engineering step).
df = pd.read_csv(
    filepath_or_buffer="../data/final_dataset_after_featureeng.csv",
)


In [4]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,Age,Gender,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Location,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_category,Air_Pollution_Level_category,Gender_encoded
0,31,Male,Former,1,1,1,27.56,Lalitpur,84,0,0,adult,overweight,Satisfactory,1
1,60,Male,Never,1,0,0,30.3,Pokhara,131,1,0,old,obese,Moderate,1
2,33,Male,Former,0,0,1,28.45,Pokhara,123,1,0,adult,overweight,Moderate,1
3,36,Female,Current,1,0,0,27.49,Kathmandu,253,0,1,adult,overweight,Poor,0
4,58,Male,Never,0,0,0,25.49,Pokhara,117,1,0,middle_aged,overweight,Moderate,1


In [5]:
# Drop the raw columns that already have a derived counterpart in the frame
# (Age_Category, Gender_encoded, Air_Pollution_Level_category, BMI_category).
# errors='ignore' keeps this cell re-runnable: `df` is reassigned to its own
# result, so a second execution would otherwise raise KeyError for columns
# that are already gone.
df = df.drop(
    columns=['Age', 'Gender', 'Air_Pollution_Level', 'BMI'],
    errors='ignore',
)

In [6]:
# Preview the frame again to confirm the four raw columns are no longer present.
df.head(5)

Unnamed: 0,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,Location,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_category,Air_Pollution_Level_category,Gender_encoded
0,Former,1,1,1,Lalitpur,0,0,adult,overweight,Satisfactory,1
1,Never,1,0,0,Pokhara,1,0,old,obese,Moderate,1
2,Former,0,0,1,Pokhara,1,0,adult,overweight,Moderate,1
3,Current,1,0,0,Kathmandu,0,1,adult,overweight,Poor,0
4,Never,0,0,0,Pokhara,1,0,middle_aged,overweight,Moderate,1


In [7]:
# Start from every column name in the frame; the target is removed in the next cell.
features = list(df.columns)

In [8]:
# Exclude the target so that `features` holds predictor columns only.
# The list is rebuilt by filtering rather than mutated with list.remove(), so
# the cell is idempotent: running it again when the target is already absent
# does not raise ValueError.
features = [column for column in features if column != 'COPD_Diagnosis']
print(features)

['Smoking_Status', 'Biomass_Fuel_Exposure', 'Occupational_Exposure', 'Family_History_COPD', 'Location', 'Respiratory_Infections_Childhood', 'Age_Category', 'BMI_category', 'Air_Pollution_Level_category', 'Gender_encoded']


In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df['COPD_Diagnosis'],
    test_size=0.2,
    random_state=42,
)

In [10]:
# Preview the first five rows of the training predictors.
X_train.head(5)

Unnamed: 0,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,Location,Respiratory_Infections_Childhood,Age_Category,BMI_category,Air_Pollution_Level_category,Gender_encoded
29,Never,0,0,0,Hetauda,1,middle_aged,overweight,Moderate,0
535,Current,1,0,1,Bhaktapur,1,adult,normal,Moderate,0
695,Never,1,0,0,Butwal,1,too_old,normal,Satisfactory,0
557,Former,1,0,0,Bhaktapur,0,middle_aged,overweight,Moderate,1
836,Never,0,1,0,Bhaktapur,0,too_old,obese,Moderate,0


In [11]:
# Sanity-check the (rows, columns) shape of the train and test predictors.
print(*(split.shape for split in (X_train, X_test)))

(800, 10) (200, 10)


In [12]:
# Exploratory check: one-hot encode 'Location' on its own to see how many
# indicator columns it yields. Despite the variable name, only the 'Location'
# column is encoded here.
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(X_train[['Location']])
X_train_gender_city = ohe.transform(X_train[['Location']])

In [13]:
# Shape of the one-hot encoded 'Location' array: (rows, indicator columns).
X_train_gender_city.shape

(800, 10)

# Build the preprocessing pipeline with ColumnTransformer

### **Plan: ordinal-encode Smoking_Status, Age_Category, BMI_category, and Air_Pollution_Level_category**

In [14]:
# Category orders for the ordinal encoder, one list per encoded column, in the
# same order as the positional column indices [0, 6, 7, 8] used below.
# Positions refer to the column order of `features` / X_train.
ordinal_category_orders = [
    ['Never', 'Former', 'Current'],                                          # 0: Smoking_Status
    ['young', 'adult', 'middle_aged', 'old', 'too_old'],                     # 6: Age_Category
    ['underweight', 'normal', 'overweight', 'obese', 'too_obese'],           # 7: BMI_category
    ['Good', 'Satisfactory', 'Moderate', 'Poor', 'Very_Poor', 'Severe'],     # 8: Air_Pollution_Level_category
]

# Ordinal-encode the ordered categoricals, one-hot encode Location (column 4,
# unseen locations encode as all zeros), and pass every other column through
# unchanged.
trf1 = ColumnTransformer(
    transformers=[
        ('oe_columns', OrdinalEncoder(categories=ordinal_category_orders), [0, 6, 7, 8]),
        ('ohe_Location', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [4]),
    ],
    remainder='passthrough',
)
    

In [22]:
# Feature selection: keep the 8 columns with the highest chi-squared score
# against the target.
trf2 = SelectKBest(chi2, k=8)

In [23]:
# Assemble the pipeline: column-wise encoding first, then chi-squared
# feature selection on the encoded output.
pipe = Pipeline(steps=[('trf1', trf1), ('trf2', trf2)])

In [24]:
# Fit the encoders and the feature selector on the training split only.
pipe.fit(X=X_train, y=y_train)

In [25]:
import pickle

# Persist the fitted pipeline to disk. Opening the file in a `with` block
# guarantees the handle is flushed and closed as soon as the dump finishes
# (or fails), instead of leaving an open handle for the garbage collector.
with open('../models/pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [26]:
# Write the current `df` to CSV without the index column.
# NOTE(review): `df` here is the frame left after the column drop; it has not
# been passed through `pipe`, so the file holds un-transformed data despite
# the "after_pipelining" name - confirm this is intended.
df.to_csv(
    path_or_buf='../models/final_dataset_after_pipelining.csv',
    index=False,
)