In [1]:
import numpy as np
import pandas as pd

In [21]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load the toy COVID dataset directly from the course repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/campusx-official/"
    "100-days-of-machine-learning/main/day28-column-transformer/covid_toy.csv"
)

In [4]:
# Preview the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
5,84,Female,,Mild,Bangalore,Yes
6,14,Male,101.0,Strong,Bangalore,No
7,20,Female,,Strong,Mumbai,Yes
8,19,Female,100.0,Strong,Bangalore,No
9,64,Female,101.0,Mild,Delhi,No


In [6]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.84
std,24.88,2.05
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


### Find the total number of null values in each column of the dataset

In [8]:
# Count the missing entries in every column.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [9]:
## So the fever column has 10 missing values

In [11]:
# Look at the first five rows again before deciding how to encode each column.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


### On gender and city we will apply OneHotEncoding, on cough we will apply ordinal encoding and on has_covid we will apply label encoding

## Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; has_covid is the target and every other
# column is a feature. random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns="has_covid"),
    df["has_covid"],
    test_size=0.3,
    random_state=0,
)

In [24]:
# Preview the training features (the target column has been removed).
x_train.head(n=5)

Unnamed: 0,age,gender,fever,cough,city
60,24,Female,102.0,Strong,Bangalore
80,14,Female,99.0,Mild,Mumbai
90,59,Female,99.0,Strong,Delhi
68,54,Female,104.0,Strong,Kolkata
51,11,Female,100.0,Strong,Kolkata


In [25]:
# Count the missing entries per column in the training split.
x_train.isna().sum()

age       0
gender    0
fever     9
cough     0
city      0
dtype: int64

# First we will solve this without using ColumnTransformer, and then we will solve it using ColumnTransformer

## Without using column transformer

#### To fill in the missing values we use SimpleImputer. It replaces NaN values with a value determined by the chosen strategy.
#### missing_values : The placeholder for the missing values that have to be imputed. The default is NaN.
#### strategy : How the replacement value is computed. The strategy argument can take the values 'mean' (default), 'median', 'most_frequent' and 'constant'.
#### fill_value : The constant value used to replace the missing values when the strategy is 'constant'.

In [32]:
# Apply SimpleImputer to the fever column: missing readings are replaced by the
# mean fever value.

si = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the mean from the training split only, then impute the training data.
x_train_fever = si.fit_transform(x_train[['fever']])

# Reuse the mean learnt on the training split. Calling fit_transform here would
# re-estimate the mean from the test data, so train and test would be imputed
# with different values and test information would leak into preprocessing.
x_test_fever = si.transform(x_test[['fever']])

x_train_fever

array([[102.        ],
       [ 99.        ],
       [ 99.        ],
       [104.        ],
       [100.        ],
       [102.        ],
       [ 98.        ],
       [101.06557377],
       [100.        ],
       [104.        ],
       [100.        ],
       [ 98.        ],
       [100.        ],
       [101.06557377],
       [101.        ],
       [103.        ],
       [ 98.        ],
       [102.        ],
       [101.        ],
       [101.06557377],
       [101.06557377],
       [104.        ],
       [103.        ],
       [102.        ],
       [104.        ],
       [101.        ],
       [ 98.        ],
       [102.        ],
       [ 98.        ],
       [100.        ],
       [101.06557377],
       [103.        ],
       [104.        ],
       [ 99.        ],
       [103.        ],
       [103.        ],
       [101.        ],
       [ 98.        ],
       [104.        ],
       [103.        ],
       [101.06557377],
       [101.06557377],
       [104.        ],
       [101

In [34]:
# Ordinal encoding for cough: the explicit category order maps
# "Mild" -> 0 and "Strong" -> 1.
oe = OrdinalEncoder(categories=[["Mild", "Strong"]])

# Fit on the training split and encode it in one step ...
x_train_cough = oe.fit_transform(x_train[["cough"]])

# ... then encode the test split with the already-fitted encoder.
x_test_cough = oe.transform(x_test[["cough"]])

In [35]:
# One-hot encoding for gender and city. drop='first' removes the first category
# of each column to avoid redundant (perfectly collinear) dummy columns.
# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
# `sparse`; on those versions use OneHotEncoder(drop='first', sparse_output=False).
ohe = OneHotEncoder(drop = 'first', sparse = False)

# Learn the categories from the training split only.
x_train_gender_city = ohe.fit_transform(x_train[["gender","city"]])

# Encode the test split with the categories learnt on the training split.
# Refitting here (fit_transform) would rebuild the category list from the test
# data, so the number and meaning of the columns could differ from training.
x_test_gender_city = ohe.transform(x_test[["gender","city"]])

In [40]:
# age needs no transformation; pull it out as a 2-D NumPy array (one column)
# so it can be stacked next to the other transformed blocks.
x_train_age = x_train[["age"]].to_numpy()
x_test_age = x_test[["age"]].to_numpy()

### Create the combined data set

In [41]:
# Stack the per-column results side by side.
# Column order: cough, one-hot gender/city columns, fever, age.
x_train_transformed = np.hstack(
    [x_train_cough, x_train_gender_city, x_train_fever, x_train_age]
)

x_test_transformed = np.hstack(
    [x_test_cough, x_test_gender_city, x_test_fever, x_test_age]
)

In [44]:
# First five rows of the combined training matrix.
x_train_transformed[:5]

array([[  1.,   0.,   0.,   0.,   0., 102.,  24.],
       [  0.,   0.,   0.,   0.,   1.,  99.,  14.],
       [  1.,   0.,   1.,   0.,   0.,  99.,  59.],
       [  1.,   0.,   0.,   1.,   0., 104.,  54.],
       [  1.,   0.,   0.,   1.,   0., 100.,  11.]])

In [48]:
# Shapes of the final transformed train and test matrices, in that order.
tuple(matrix.shape for matrix in (x_train_transformed, x_test_transformed))

((70, 7), (30, 7))

# Using ColumnTransformer

In [51]:
from sklearn.compose import ColumnTransformer

# ColumnTransformer is configured here with two arguments:
#   1. transformers - a list of tuples, one per transformation step
#   2. remainder    - what to do with the columns no transformer is applied to
# Each tuple has three parts: (transformer_name, transformer_object, columns).
# remainder='passthrough' keeps the untouched columns (here: age) as they are.
transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # Fill missing fever values (SimpleImputer defaults to the mean).
        ('tnf1', SimpleImputer(), ['fever']),
        # Ordered encoding of cough: "Mild" -> 0, "Strong" -> 1.
        ('tnf2', OrdinalEncoder(categories=[["Mild", "Strong"]]), ["cough"]),
        # One-hot encode gender and city, dropping the first category of each.
        ('tnf3', OneHotEncoder(sparse=False, drop="first"), ["gender", "city"]),
    ],
)

In [52]:
# Fit every step of the ColumnTransformer on the training split and transform it.
x_train_new_modified = transformer.fit_transform(x_train)

# Only transform the test split, reusing what was learnt on the training split.
# Calling fit_transform here would refit the imputer and the encoders on the
# test data, leaking test information and risking a different column layout.
x_test_new_modified = transformer.transform(x_test)