# <span style='color:red'>Column Transformers</span>

## <span style='color:blue'><a href=''>Read In detail about Column Transformers</a></span>

In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the dataset from covid.csv (the path is relative to the working directory).
df = pd.read_csv('covid.csv')

In [3]:
# Preview the first rows to check the column names and the kind of values each holds.
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Count missing values per column to see which features need imputation.
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Features are every column except the
# 'has_covid' target; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='has_covid'),
    df['has_covid'],
    test_size=0.2,
    random_state=2,
)

In [6]:
# Confirm the (rows, columns) shape of the training and test feature frames.
X_train.shape, X_test.shape

((80, 5), (20, 5))

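## Imputing and Encoding Values Like a Rookie

Each column is transformed separately, and the resulting pieces are stitched back together by hand.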

In [7]:
from sklearn.impute import SimpleImputer

si = SimpleImputer()

# Fill the missing 'fever' values with SimpleImputer (default strategy: mean).
# The imputer is fitted on the training split only; the test split is filled
# using that same learned statistic. Calling fit_transform on X_test would
# re-fit the imputer on test data, so the test set would be imputed with its
# own mean instead of the training mean.
X_train_fever = si.fit_transform(X_train[['fever']])
X_test_fever = si.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# The category order is given explicitly so that 'Mild' < 'Strong'.
oe = OrdinalEncoder(categories = [['Mild', 'Strong']])

# Ordinal-encode the 'cough' column.
# Fit on the training split only and reuse the fitted encoder on the test
# split, so both splits are guaranteed to share one encoding and the test data
# never takes part in fitting.
X_train_cough = oe.fit_transform(X_train[['cough']])
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [9]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' removes one dummy column per feature; sparse_output=False
# returns a dense NumPy array instead of a sparse matrix.
ohe = OneHotEncoder(drop = 'first', sparse_output = False)

# One-hot encode the 'gender' and 'city' columns.
# The encoder learns its categories from the training split only. Re-fitting
# on X_test would derive the categories from the test rows instead, so a
# category absent from the test split would change the number (or meaning) of
# the output columns and misalign them with the training matrix.
X_train_gender_city = ohe.fit_transform(X_train[['gender', 'city']])
X_test_gender_city = ohe.transform(X_test[['gender', 'city']])

X_train_gender_city.shape

(80, 4)

In [10]:
# Age needs no preprocessing: drop the four columns handled above and keep
# what remains as a 2-D array so it can be concatenated with the other pieces.
other_features = ['gender', 'fever', 'cough', 'city']

X_train_age = X_train.drop(other_features, axis=1).values
X_test_age = X_test.drop(other_features, axis=1).values

X_train_age.shape

(80, 1)

In [11]:
# Stitch the separately processed pieces back together column-wise.
# Column order: age, fever, cough, then the one-hot gender/city columns.
train_parts = (X_train_age, X_train_fever, X_train_cough, X_train_gender_city)
test_parts = (X_test_age, X_test_fever, X_test_cough, X_test_gender_city)

X_train_transformed = np.concatenate(train_parts, axis=1)
X_test_transformed = np.concatenate(test_parts, axis=1)

In [12]:
# Display the combined training matrix (columns: age, fever, cough, one-hot gender/city).
X_train_transformed

array([[ 82.        , 102.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 65.        ,  98.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   1.        ],
       [ 69.        ,  98.        ,   1.        ,   0.        ,
          0.        ,   0.        ,   1.        ],
       [ 20.        , 102.        ,   1.        ,   1.        ,
          1.        ,   0.        ,   0.        ],
       [ 34.        ,  98.        ,   1.        ,   1.        ,
          0.        ,   1.        ,   0.        ],
       [ 84.        , 100.79166667,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ],
       [ 23.        , 100.79166667,   0.        ,   1.        ,
          0.        ,   0.        ,   1.        ],
       [  8.        , 101.        ,   0.        ,   0.        ,
          0.        ,   1.        ,   0.        ],
       [ 34.        , 104.        ,   1.        ,   0.        ,
          1.    

In [13]:
# Expect 7 columns: age (1) + fever (1) + cough (1) + one-hot gender/city (4).
X_train_transformed.shape

(80, 7)

In [14]:
# The test matrix must have the same number of columns as the training matrix.
X_test_transformed.shape

(20, 7)

## Performing the Same Task Like a Pro

A single `ColumnTransformer` applies all of the per-column steps in one object.

In [15]:
from sklearn.compose import ColumnTransformer

In [16]:
# Declare every per-column preprocessing step in one place.
# Each entry is (name, transformer, columns-to-apply-it-to).
column_steps = [
    ('trf1', SimpleImputer(), ['fever']),
    ('trf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
    ('trf3', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
]

# remainder='passthrough' keeps the columns not listed above (here: age)
# instead of dropping them.
transformer = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [17]:
# Fit all transformers on the training split and transform it in one call; show the resulting shape.
transformer.fit_transform(X_train).shape

(80, 7)

In [18]:
# Apply the already-fitted transformers to the test split (transform only, no re-fitting).
transformer.transform(X_test).shape

(20, 7)