In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Load the toy COVID dataset into a DataFrame.
# NOTE(review): this is an absolute Windows path, so the notebook only runs on a
# machine that has the file on drive F: -- consider a relative, configurable path.
df=pd.read_csv("F:\\covid_toy.csv")

In [3]:
# Remove every row that contains at least one missing value.
df=df.dropna()

In [4]:
# Keep only the categorical columns used by the encoding demos below.
# Selecting the columns by name (rather than dropping 'age' and 'fever') gives
# the same frame but keeps this cell re-runnable: a second drop() would raise
# KeyError because the two columns are already gone.
df=df[['gender','cough','city','has_covid']]

In [5]:
df.head()

Unnamed: 0,gender,cough,city,has_covid
0,Male,Mild,Kolkata,No
1,Male,Mild,Delhi,Yes
2,Male,Mild,Delhi,No
3,Female,Mild,Kolkata,No
4,Female,Mild,Mumbai,No


In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
df.shape

(90, 4)

# get_dummies method

In [8]:
# One-hot encode all four categorical columns, keeping every dummy column.
p = pd.get_dummies(
    data=df,
    columns=['gender', 'cough', 'city', 'has_covid'],
)

In [9]:
p.shape

(90, 10)

In [10]:
# Same encoding, but drop the first dummy of each column to avoid redundant
# (perfectly collinear) indicator columns.
p = pd.get_dummies(
    data=df,
    columns=['gender', 'cough', 'city', 'has_covid'],
    drop_first=True,
)

In [11]:
p.shape

(90, 6)

In [12]:
# scikit-learn equivalent of get_dummies(..., drop_first=True):
# dense output, integer dummies, first category of each feature dropped.
othe = OneHotEncoder(
    drop='first',
    sparse=False,
    dtype=np.int32,
)

In [13]:
# Learn the categories and encode the four categorical columns in one step.
# The encoder was built with sparse=False, so the result is a dense array.
df_new = othe.fit_transform(
    df[['gender', 'cough', 'city', 'has_covid']]
)



In [14]:
df_new.shape

(90, 6)

# column transformer

In [15]:
import numpy as np 
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [16]:
# Reload the raw CSV so that 'age' and 'fever' (removed from df in the
# get_dummies section) and the rows with missing fever are available again.
# NOTE(review): absolute Windows path -- see the first load cell.
df=pd.read_csv("F:\\covid_toy.csv")

In [17]:
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [18]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows for testing; 'has_covid' is the target.
# A fixed random_state makes the split identical on every kernel restart,
# so the results of the following cells are reproducible.
x_train,x_test,y_train,y_test=train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [21]:
x_train

Unnamed: 0,age,gender,fever,cough,city
54,60,Female,99.0,Mild,Mumbai
72,83,Female,101.0,Mild,Kolkata
90,59,Female,99.0,Strong,Delhi
22,71,Female,98.0,Strong,Kolkata
50,19,Male,101.0,Mild,Delhi
...,...,...,...,...,...
7,20,Female,,Strong,Mumbai
75,5,Male,102.0,Mild,Kolkata
82,24,Male,98.0,Mild,Kolkata
79,48,Female,103.0,Mild,Kolkata


# Manual approach: transforming each column by hand

In [22]:
# Impute the missing values of the 'fever' column.
# SimpleImputer() with its default settings fills NaN with the column mean.
si=SimpleImputer()
x_train_fever=si.fit_transform(x_train[['fever']])

# Test data: only transform, re-using the statistic learned from the training
# split. Re-fitting on the test split would fill it with its own mean, which
# leaks test information and makes train/test preprocessing inconsistent.
x_test_fever=si.transform(x_test[['fever']])

x_train_fever.shape

(80, 1)

In [23]:
# Ordinal encoding of 'cough': the explicit category order maps
# 'Mild' -> 0 and 'Strong' -> 1.

oe=OrdinalEncoder(categories=[['Mild','Strong']])
x_train_cough=oe.fit_transform(x_train[['cough']])

# Test data: re-use the encoder fitted on the training split (transform only),
# following the fit-on-train / transform-on-test convention.
x_test_cough=oe.transform(x_test[['cough']])
x_train_cough.shape

(80, 1)

In [24]:
# One-hot encode the nominal columns 'gender' and 'city'.
# drop='first' removes one dummy per feature; sparse=False returns a dense array.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; switch the
# keyword if this notebook is run on a newer release.

ohe=OneHotEncoder(drop='first',sparse=False)
x_train_gender_city=ohe.fit_transform(x_train[['gender','city']])


# Test data: use `ohe` (fitted just above), not the unrelated `othe` encoder
# from the get_dummies section, and only transform so that the test columns
# line up exactly with the training columns.
x_test_gender_city=ohe.transform(x_test[['gender','city']])

x_train_gender_city.shape



(80, 4)

In [27]:
 # #  Extracting Age
# 'age' needs no preprocessing: pull it out as a 2-D (n_rows, 1) array so it
# can be concatenated with the other transformed pieces.
x_train_age = x_train[['age']].values
# Same extraction for the test split.
x_test_age = x_test[['age']].values

In [28]:
x_train_age.shape

(80, 1)

In [29]:
# Stitch the individually transformed pieces back together column-wise:
# age | imputed fever | one-hot gender/city | ordinal cough
x_train_transformed = np.hstack([
    x_train_age,
    x_train_fever,
    x_train_gender_city,
    x_train_cough,
])

In [30]:
x_train_transformed.shape

(80, 7)

# The same preprocessing with the help of ColumnTransformer

In [31]:
from sklearn.compose import ColumnTransformer #this how to import ColumnTransformer

# One ColumnTransformer replaces the manual steps: each entry is
# (step name, transformer, columns it applies to).
transformer = ColumnTransformer(
    # columns not listed in any step ('age') are passed through unchanged
    remainder='passthrough',
    transformers=[
        # fill missing 'fever' values
        ('tnf1', SimpleImputer(), ['fever']),
        # 'cough' is ordered: Mild < Strong
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # nominal columns become dummy columns (first category dropped)
        ('tnf3', OneHotEncoder(drop='first', sparse=False), ['gender', 'city']),
    ],
)

In [32]:
transformer.fit_transform(x_train).shape



(80, 7)

In [33]:
# The transformer was already fitted on x_train in the previous cell; the test
# split must only be transformed so it is imputed and encoded with the
# parameters learned from the training data (re-fitting would leak test data).
transformer.transform(x_test).shape



(20, 7)

# Function transformer

In [34]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

# Small 2x2 sample matrix
X = np.array([
    [1, 2],
    [3, 4],
])

# Wrap NumPy's log1p, i.e. log(1 + x), so it behaves like a scikit-learn transformer
log_transform = FunctionTransformer(func=np.log1p)

# Apply the wrapped function element-wise to the sample matrix
X_transformed = log_transform.transform(X)

# Show the result
print(X_transformed)

[[0.69314718 1.09861229]
 [1.38629436 1.60943791]]


# Example 2

In [35]:
#1. Custom Feature Engineering
from sklearn.preprocessing import FunctionTransformer 
import numpy as np
# Small 2x2 integer sample matrix
X = np.array([
    [1, 2],
    [3, 4],
])
# Custom feature-engineering step: append squared features
def my_feature_engineering(X):
    """Return X with the element-wise squares of its values stacked to the right.

    For a 2-D input of shape (n, m) the result has shape (n, 2 * m):
    the original columns followed by their squares.
    """
    squared = X ** 2
    return np.hstack((X, squared))
# Wrap the helper so it can be used like any other scikit-learn transformer
custom_transformer = FunctionTransformer(func=my_feature_engineering)
# Run the wrapped helper on the sample matrix
X_transformed = custom_transformer.transform(X)
# Show the original columns followed by their squares
print(X_transformed)

[[ 1  2  1  4]
 [ 3  4  9 16]]


# Example 3

In [36]:
from sklearn.preprocessing import FunctionTransformer 
import numpy as np

# Small 2x2 integer sample matrix
X = np.array([
    [1, 2],
    [3, 4],
])

# Custom scaling step: divide by the global maximum
def my_scaling(X):
    """Divide every element of X by the largest value found anywhere in X."""
    largest = np.max(X)
    return X / largest

# Wrap the scaling helper so it can be used like any other scikit-learn transformer
custom_transformer = FunctionTransformer(func=my_scaling)

# Run the wrapped helper on the sample matrix
X_transformed = custom_transformer.transform(X)

# Show the scaled values
print(X_transformed)

[[0.25 0.5 ]
 [0.75 1.  ]]


# Example 4

In [37]:
# Data Cleaning
from sklearn.preprocessing import FunctionTransformer 
import numpy as np

# 2x2 sample matrix with one missing entry (bottom-right)
X = np.array([
    [1, 2],
    [3, np.nan],
])

# Custom cleaning step: replace missing values with 0
def my_cleaning(X):
    """Return a copy of X in which every NaN has been replaced by 0.

    The input array is left untouched; writing into X directly would silently
    overwrite the caller's data every time the transformer is applied.
    """
    cleaned = X.copy()
    cleaned[np.isnan(cleaned)] = 0
    return cleaned

# Wrap the cleaning helper so it can be used like any other scikit-learn transformer
custom_transformer = FunctionTransformer(func=my_cleaning)

# Run the wrapped helper on the sample matrix
X_transformed = custom_transformer.transform(X)

# Show the cleaned values (NaN replaced by 0)
print(X_transformed)

[[1. 2.]
 [3. 0.]]
