In [127]:
import pandas as pd
import numpy as np

In [2]:
# Toy COVID dataset used for the column-transformer walkthrough.
DATA_URL = 'https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day28-column-transformer/covid_toy.csv'
data = pd.read_csv(DATA_URL)

In [3]:
# Preview the first rows of the downloaded data.
data.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Cache a local copy. The index is written as the first CSV column (to_csv's
# default), which is why the frame read back from this file carries an extra
# 'Unnamed: 0' column.
data.to_csv('covid_toy.csv')

In [128]:
# Reload the local copy; df is the frame the rest of the notebook works from.
df = pd.read_csv('covid_toy.csv')

In [129]:
# First five rows (head's default) of the reloaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,age,gender,fever,cough,city,has_covid
0,0,60,Male,103.0,Mild,Kolkata,No
1,1,27,Male,100.0,Mild,Delhi,Yes
2,2,42,Male,101.0,Mild,Delhi,No
3,3,31,Female,98.0,Mild,Kolkata,No
4,4,65,Female,101.0,Mild,Mumbai,No


In [95]:
# Missing values per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0     0
age            0
gender         0
fever         10
cough          0
city           0
has_covid      0
dtype: int64

In [130]:
# Feature frame: everything except the target and the leftover index column.
x = df.drop(['has_covid', 'Unnamed: 0'], axis='columns')

In [131]:
# Target column.
y = df['has_covid']

In [7]:
# Manual imputation (no SimpleImputer): replace missing fever readings with
# the mean of the observed ones.
fever_mean = x['fever'].mean()
x['fever'] = x['fever'].fillna(fever_mean)

In [8]:
# Re-check missing values after the fill.
x.isna().sum()

Unnamed: 0    0
age           0
gender        0
fever         0
cough         0
city          0
dtype: int64

In [9]:
# Shape of the fever column after the fill.
x['fever'].shape

(100,)

In [10]:
# Manual encoding of the ordinal 'cough' column (no OrdinalEncoder).
# First list its distinct levels.

x['cough'].unique()

array(['Mild', 'Strong'], dtype=object)

In [11]:
# Ordered integer codes for the two cough levels.
# Any value that is not a key of the mapping becomes NaN.
cough_levels = {'Mild': 0, 'Strong': 1}
x['cough'] = x['cough'].map(cough_levels)

In [12]:
# Show the encoded column; the double brackets keep it a one-column DataFrame.
x[['cough']]

Unnamed: 0,cough
0,0
1,0
2,0
3,0
4,0
...,...
95,0
96,1
97,0
98,1


In [13]:
# Shape of the encoded cough column.
x['cough'].shape

(100,)

In [14]:
# Manual one-hot encoding of the nominal columns (no OneHotEncoder class).
# drop_first removes one dummy per column to avoid a redundant indicator.
nominal_cols = ['gender', 'city']
x_gender_city = pd.get_dummies(x[nominal_cols], columns=nominal_cols, drop_first=True)

In [15]:
# Append the dummy columns to the right of the existing features.
x = pd.concat((x, x_gender_city), axis='columns')

In [16]:
# Inspect x after the concat: the raw gender/city columns are still present
# next to their dummy columns at this point.
x

Unnamed: 0.1,Unnamed: 0,age,gender,fever,cough,city,gender_Male,city_Delhi,city_Kolkata,city_Mumbai
0,0,60,Male,103.0,0,Kolkata,1,0,1,0
1,1,27,Male,100.0,0,Delhi,1,1,0,0
2,2,42,Male,101.0,0,Delhi,1,1,0,0
3,3,31,Female,98.0,0,Kolkata,0,0,1,0
4,4,65,Female,101.0,0,Mumbai,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
95,95,12,Female,104.0,0,Bangalore,0,0,0,0
96,96,51,Female,101.0,1,Kolkata,0,0,1,0
97,97,20,Female,101.0,0,Bangalore,0,0,0,0
98,98,5,Female,98.0,1,Mumbai,0,0,0,1


In [28]:
# Remove the raw categorical columns now that their dummy columns exist.
# 'Unnamed: 0' is already dropped when x is first built from df, so naming it
# in a strict drop raises KeyError on a top-to-bottom run; it is removed
# tolerantly here in case x still carries it.
x = (
    x.drop(columns=['gender', 'city'])
     .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [29]:
# Result of the manual preprocessing: numeric and dummy columns only.
x

Unnamed: 0,age,fever,cough,gender_Male,city_Delhi,city_Kolkata,city_Mumbai
0,60,103.0,0,1,0,1,0
1,27,100.0,0,1,1,0,0
2,42,101.0,0,1,1,0,0
3,31,98.0,0,0,0,1,0
4,65,101.0,0,0,0,0,1
...,...,...,...,...,...,...,...
95,12,104.0,0,0,0,0,0
96,51,101.0,1,0,0,1,0
97,20,101.0,0,0,0,0,0
98,5,98.0,1,0,0,0,1


In [22]:
# The manual edits above were applied to x; look at df again for comparison.
df.head()

Unnamed: 0.1,Unnamed: 0,age,gender,fever,cough,city,has_covid
0,0,60,Male,103.0,Mild,Kolkata,No
1,1,27,Male,100.0,Mild,Delhi,Yes
2,2,42,Male,101.0,Mild,Delhi,No
3,3,31,Female,98.0,Mild,Kolkata,No
4,4,65,Female,101.0,Mild,Mumbai,No


In [112]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [113]:
# The manual walkthrough above overwrote x in place (fever filled, cough mapped
# to integers, gender/city dropped). The scikit-learn walkthrough needs the raw
# columns again, so rebuild x from df instead of relying on kernel state.
x = df.drop(columns=['has_covid','Unnamed: 0'])

# Missing fever readings in the fresh feature frame.
x['fever'].isnull().sum()

10

In [114]:
# Mean-impute the missing fever readings with SimpleImputer.
# The double brackets pass a 2-D frame, and a 2-D array comes back.
si = SimpleImputer(strategy='mean')
x_fever = si.fit_transform(x[['fever']])

x_fever.shape

(100, 1)

In [115]:
# Ordinal-encode cough with an explicit category order: Mild -> 0, Strong -> 1.
cough_order = [['Mild', 'Strong']]
oe = OrdinalEncoder(categories=cough_order)
x_cough = oe.fit_transform(x[['cough']])

In [116]:
# One-hot encoding --> gender, city
# The keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`, and `sparse` was later removed), so the
# encoder keeps its default sparse output and the result is densified
# explicitly. This runs unchanged on old and new versions.
# NOTE: `ohe` is refit for each column, so after this cell it only holds the
# categories learned from `city`.
ohe = OneHotEncoder(drop='first')
x_gender = ohe.fit_transform(x[['gender']]).toarray()
x_city = ohe.fit_transform(x[['city']]).toarray()

In [117]:
# age needs no preprocessing. Select it by name (kept 2-D) rather than by
# dropping the other columns, so any extra column in x is not swept into the
# feature matrix by accident.
x_age = x[['age']].values

In [118]:
# Stitch the separately transformed pieces together column-wise.
feature_blocks = [x_age, x_fever, x_gender, x_city, x_cough]
x_transformed = np.hstack(feature_blocks)

In [121]:
# (rows, encoded feature columns)
x_transformed.shape

(100, 7)

Unnamed: 0,age,gender,fever,cough,city
0,60,Male,103.0,Mild,Kolkata
1,27,Male,100.0,Mild,Delhi
2,42,Male,101.0,Mild,Delhi
3,31,Female,98.0,Mild,Kolkata
4,65,Female,101.0,Mild,Mumbai
...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore
96,51,Female,101.0,Strong,Kolkata
97,20,Female,101.0,Mild,Bangalore
98,5,Female,98.0,Strong,Mumbai


In [140]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of the feature matrix.
# NOTE(review): the scaler is fitted on all rows here, before any train/test
# split, so statistics of rows that end up in the test set influence the
# scaling. For model evaluation, fit on the training rows only.
scaler = StandardScaler().fit(x_transformed)
x_scaled = scaler.transform(x_transformed)

In [141]:
from sklearn.model_selection import train_test_split

# Split the unscaled feature matrix first, then fit the scaler on the training
# rows only, so test-set statistics never leak into the scaling.
# random_state pins the shuffle so the split is reproducible across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_transformed, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [142]:
# Shape of the training split of the feature matrix.
x_train.shape

(80, 7)

In [132]:
from sklearn.compose import ColumnTransformer

In [133]:
# Column-wise preprocessing in a single object. Columns not listed in any
# transformer are passed through unchanged (remainder='passthrough').
transformer = ColumnTransformer(transformers=[
    ('tnf1', SimpleImputer(), ['fever']),
    # `categories` must be passed by keyword: positional use is rejected by
    # current scikit-learn releases.
    ('tnf2', OrdinalEncoder(categories=[['Mild','Strong']]), ['cough']),
    # The encoder keeps its default sparse output because the dense-output
    # keyword differs across scikit-learn versions; sparse_threshold=0 below
    # makes the ColumnTransformer return a dense array regardless.
    ('tnf3', OneHotEncoder(drop='first'), ['gender','city']),
], remainder='passthrough', sparse_threshold=0)



In [139]:
# Fit every sub-transformer and build the feature matrix in one call.
# x must still contain the raw 'fever', 'cough', 'gender' and 'city' columns
# that the transformer was configured with.
# Note: this rebinds x_transformed, which an earlier cell assigned from
# np.concatenate.
x_transformed = transformer.fit_transform(x)

In [138]:
# Shape of the target vector.
y.shape

(100,)