In [98]:
import pandas as pd
import numpy as np

df = pd.read_csv('covid_5000.csv')
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        5000 non-null   int64  
 1   gender     5000 non-null   str    
 2   fever      5000 non-null   float64
 3   cough      3370 non-null   str    
 4   city       5000 non-null   str    
 5   has_covid  5000 non-null   str    
dtypes: float64(1), int64(1), str(4)
memory usage: 234.5 KB


In [99]:
df.isnull().sum()

age             0
gender          0
fever           0
cough        1630
city            0
has_covid       0
dtype: int64

In [100]:
df.sample(10)


Unnamed: 0,age,gender,fever,cough,city,has_covid
408,16,Female,98.6,Severe,Chennai,Yes
1974,52,Male,100.8,Severe,Bangalore,Yes
3048,14,Male,100.2,Severe,Bangalore,No
3575,57,Male,98.8,Mild,Mumbai,Yes
4593,78,Female,99.4,Mild,Delhi,Yes
4574,6,Male,97.0,Mild,Bangalore,No
4570,48,Male,104.1,Mild,Chennai,No
3890,26,Female,99.5,Mild,Pune,No
3452,75,Female,99.7,Mild,Mumbai,Yes
1505,35,Female,97.1,Severe,Chennai,No


In [101]:
df['cough'].value_counts()

cough
Severe    1694
Mild      1676
Name: count, dtype: int64

In [102]:
df['city'].value_counts()

city
Mumbai       874
Delhi        853
Chennai      852
Pune         844
Bangalore    789
Kolkata      788
Name: count, dtype: int64

<h3>Without ColumnTransformation</h3>

In [103]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['has_covid']),(df['has_covid']),test_size = 0.2)


In [35]:
x_train

Unnamed: 0,age,gender,fever,cough,city
4227,49,Male,98.5,Severe,Pune
4676,21,Male,100.9,Mild,Pune
800,19,Female,104.4,Severe,Bangalore
3671,25,Male,104.6,Mild,Delhi
4193,33,Male,97.9,,Delhi
...,...,...,...,...,...
4426,21,Female,102.9,,Mumbai
466,80,Male,103.0,,Bangalore
3092,41,Male,102.6,,Delhi
3772,57,Female,102.5,Mild,Pune


In [42]:
x_train.shape

(4000, 5)

Simple Imputer

In [36]:
from sklearn.impute import SimpleImputer

In [38]:
si = SimpleImputer()

x_train_fever = si.fit_transform(x_train[['fever']])
x_test_fever = si.fit_transform(x_test[['fever']])

In [39]:
x_train_fever

array([[ 98.5],
       [100.9],
       [104.4],
       ...,
       [102.6],
       [102.5],
       [100.6]], shape=(4000, 1))

Ordinal Encoding

In [43]:
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()

x_train_cough = oe.fit_transform(x_train[['cough']])
x_test_cough = oe.fit_transform(x_test[['cough']])

x_train_cough.shape

(4000, 1)

In [47]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

x_train_gender_city = ohe.fit_transform(x_train[['gender','city']]).toarray()
x_test_gender_city = ohe.fit_transform(x_test[['gender','city']]).toarray()
x_test_gender_city

array([[0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.]], shape=(1000, 8))

In [54]:
x_train_age = x_train.drop(columns=['gender','fever','cough','city']).values
x_test_age = x_test.drop(columns=['gender','fever','cough','city']).values
x_test_age

array([[49],
       [29],
       [85],
       [ 2],
       [69],
       [11],
       [89],
       [21],
       [14],
       [27],
       [29],
       [80],
       [75],
       [67],
       [57],
       [83],
       [13],
       [30],
       [45],
       [31],
       [84],
       [26],
       [35],
       [54],
       [74],
       [58],
       [32],
       [85],
       [79],
       [44],
       [29],
       [42],
       [13],
       [62],
       [76],
       [64],
       [10],
       [73],
       [15],
       [78],
       [63],
       [35],
       [42],
       [ 4],
       [87],
       [81],
       [12],
       [21],
       [82],
       [ 4],
       [58],
       [ 6],
       [38],
       [74],
       [85],
       [33],
       [54],
       [62],
       [56],
       [21],
       [83],
       [88],
       [67],
       [ 3],
       [76],
       [12],
       [69],
       [13],
       [28],
       [12],
       [47],
       [40],
       [70],
       [17],
       [51],
       [13],
       [37],

In [56]:
x_train_age.shape

(4000, 1)

In [57]:
x_test_age.shape

(1000, 1)

concatenate the columns

In [65]:
x_train_transfromed = np.concatenate((x_train_age,x_train_fever,x_train_gender_city,x_train_cough),axis=1)

x_test_transformed = np.concatenate((x_test_age,x_test_fever,x_test_gender_city,x_test_cough),axis=1)

In [66]:
x_train_transfromed

array([[ 49. ,  98.5,   0. , ...,   0. ,   1. ,   1. ],
       [ 21. , 100.9,   0. , ...,   0. ,   1. ,   0. ],
       [ 19. , 104.4,   1. , ...,   0. ,   0. ,   1. ],
       ...,
       [ 41. , 102.6,   0. , ...,   0. ,   0. ,   nan],
       [ 57. , 102.5,   1. , ...,   0. ,   1. ,   0. ],
       [ 44. , 100.6,   0. , ...,   0. ,   0. ,   nan]], shape=(4000, 11))