##### **COLUMN TRANSFORMER**

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the toy COVID dataset; the relative path expects covid_toy.csv in the working directory.
df = pd.read_csv('covid_toy.csv')

In [None]:
# Preview the first five rows to see the column names and value types.
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [None]:
# Count missing values per column to find out which columns need imputation.
df.isnull().sum()

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


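**Always do a train/test split first** — fit every imputer and encoder on the training split only, so that nothing about the test data leaks into preprocessing.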

In [None]:
# Show a single row as a reminder of the column layout before splitting.
df.head(1)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),  # features: every column except the target
    df['has_covid'],                 # target
    test_size=0.2,
    random_state=13,
)

In [None]:
# Display the training features (the left-most values are the original row labels from df).
X_train

Unnamed: 0,age,gender,fever,cough,city
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai
90,59,Female,99.0,Strong,Delhi
72,83,Female,101.0,Mild,Kolkata
20,12,Male,98.0,Strong,Bangalore
...,...,...,...,...,...
25,23,Male,,Mild,Mumbai
16,69,Female,103.0,Mild,Kolkata
74,34,Female,104.0,Strong,Delhi
48,66,Male,99.0,Strong,Bangalore


In [None]:
# Display the training target; it carries the same row labels as X_train.
y_train

Unnamed: 0,has_covid
3,No
4,No
90,No
72,No
20,No
...,...
25,No
16,Yes
74,No
48,No


##### **LIFE WITHOUT COLUMN TRANSFORMER**

In [None]:
# One sample row, used to decide which preprocessing each column needs.
df.head(1)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No


Looking at the columns, we can see that:
1. age needs no cleaning (we could still scale it)
2. gender and city are nominal categorical data (i.e. no order)
3. fever has missing values
4. cough has 2 values, Mild and Strong, which makes it ordinal categorical data (i.e. an order is present, where Mild < Strong)
5. has_covid is the target; it is categorical, so it is encoded with a label encoder

In [None]:
# Impute the missing fever readings with the column mean.
# The imputer is fitted on the training split, so the mean comes from training data only.
si = SimpleImputer(strategy='mean')  # 'mean' is the default strategy, spelled out for clarity
X_train_fever = si.fit_transform(X_train[['fever']])  # double brackets keep the input 2-D
X_train_fever

array([[ 98.        ],
       [101.        ],
       [ 99.        ],
       [101.        ],
       [ 98.        ],
       [104.        ],
       [100.        ],
       [101.        ],
       [103.        ],
       [104.        ],
       [103.        ],
       [102.        ],
       [100.        ],
       [102.        ],
       [100.        ],
       [100.        ],
       [ 98.        ],
       [ 98.        ],
       [102.        ],
       [104.        ],
       [ 99.        ],
       [103.        ],
       [ 99.        ],
       [101.        ],
       [101.        ],
       [101.        ],
       [104.        ],
       [ 98.        ],
       [104.        ],
       [100.76712329],
       [ 98.        ],
       [ 99.        ],
       [101.        ],
       [101.        ],
       [103.        ],
       [100.76712329],
       [102.        ],
       [101.        ],
       [ 98.        ],
       [102.        ],
       [100.76712329],
       [ 99.        ],
       [102.        ],
       [100

In [None]:
# Transform the test data with the imputer already fitted on the training split
# (transform only, no re-fit), then check the resulting shape.
X_test_fever = si.transform(X_test[['fever']])
X_test_fever.shape

(20, 1)

In [None]:
# Now apply ordinal encoding on cough.
# The explicit category list fixes the order, so 'Mild' is encoded as 0 and 'Strong' as 1.
# Fit on the training split, then reuse the fitted encoder for the test split.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])
X_test_cough = oe.transform(X_test[['cough']])

In [None]:
# Look at the raw cough values of the first rows to sanity-check the encoding.
X_train.head()

Unnamed: 0,age,gender,fever,cough,city
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai
90,59,Female,99.0,Strong,Delhi
72,83,Female,101.0,Mild,Kolkata
20,12,Male,98.0,Strong,Bangalore


In [None]:
# The third training row has cough == 'Strong', so its encoded value should be 1.
X_train_cough[2]

array([1.])

In [None]:
# One-hot encode the nominal columns gender and city.
# - drop='first' removes the first category of each column to avoid multicollinearity.
# - sparse_output=False returns a dense NumPy array instead of a sparse matrix.
# - dtype=np.int32 makes the indicator columns integers.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)
X_train_gender_city = ohe.fit_transform(X_train[['gender', 'city']])
X_test_gender_city = ohe.transform(X_test[['gender', 'city']])
X_train_gender_city


array([[0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 1, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [1, 1, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 1, 0, 0],
       [1, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [1, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [1, 0, 1, 0],
       [1, 0, 1, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0,

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for the target labels (has_covid); features use the encoders above instead.
le = LabelEncoder()

In [None]:
# Fit the label encoder on the training target, then reuse it for the test target.
# In the displayed array 'No' appears as 0 and 'Yes' as 1 (compare with the raw y_train values).
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)
y_train_transformed

array([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1])

In [None]:
# First six raw labels, for comparison with the first six encoded values.
y_train.head(6)

Unnamed: 0,has_covid
3,No
4,No
90,No
72,No
20,No
59,Yes


In [None]:
# Extract age as a 2-D array of shape (n_rows, 1) so it can be concatenated column-wise.
# The column is selected by name rather than by dropping every other column:
# the drop-based version would silently include any extra column the data gains.
X_train_age = X_train[['age']].values
X_test_age = X_test[['age']].values


In [None]:
# Stitch the separately transformed pieces back together column-wise.
# Resulting column order: age, fever, one-hot gender/city, cough.
X_train_transformed = np.hstack(
    (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
)
X_test_transformed = np.hstack(
    (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)
)

In [None]:
# Check the shape of the manually assembled training matrix (rows, columns).
X_train_transformed.shape

(80, 7)

7 columns -> age (1) + fever (1) + gender (1) + city (3) + cough (1); gender and city each lose one column because of `drop='first'`.

##### **LIFE WITH COLUMN TRANSFORMER**

In [None]:
from sklearn.compose import ColumnTransformer

Apply ColumnTransformer to specific columns; the remaining columns are either kept unchanged (`remainder="passthrough"`) or dropped (`remainder="drop"`).

In [None]:
# Re-read the raw dataset.
# NOTE(review): the cells below transform X_train / X_test from the earlier split,
# so this reload looks redundant — confirm before removing.
df =pd.read_csv('covid_toy.csv')

In [None]:
# Each entry is a (name, transformer object, columns to transform) tuple.
# Columns not named in any tuple (here: age) are kept unchanged via remainder='passthrough'.
transformer_feature = ColumnTransformer(
    transformers=[
        # fill missing fever values (SimpleImputer defaults to the mean)
        ('tnf1', SimpleImputer(), ['fever']),
        # ordinal encoding with the explicit order Mild < Strong
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # one-hot encoding of the nominal columns, first category dropped
        ('tnf3', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
    ],
    remainder='passthrough',
)

In [None]:
# One sample row as a reminder of the columns referenced in the transformer.
df.head(1)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No


In [None]:
# Fit all transformers on the training split and transform it in one call;
# the shape should match the manually assembled matrix, (80, 7).
# Note: the output columns follow the transformer order (fever, cough, gender/city)
# with the passthrough column (age) last, unlike the manual concatenation.
transformer_feature.fit_transform(X_train).shape

(80, 7)

In [None]:
# Transform the test split with the already fitted transformer (transform only, no re-fit).
transformer_feature.transform(X_test).shape

(20, 7)