In [2]:
import pandas as pd

In [3]:
data = {
    "Name": ["Anna", "Bob", "Charlie", "Diana", "Eric"], 
    "Age": [20, 34, 23, None, 33],
    "Gender": ["f", "m", "m", "f", "m"],
    "Job": ["Programmer", "Writter", "Cook", "Programmer", "Teacher"]

}

In [4]:
df = pd.DataFrame(data)

In [5]:
df

Unnamed: 0,Name,Age,Gender,Job
0,Anna,20.0,f,Programmer
1,Bob,34.0,m,Writter
2,Charlie,23.0,m,Cook
3,Diana,,f,Programmer
4,Eric,33.0,m,Teacher


Preprocessing PipeLine:

* Drop Name Feature

* Imput Ages

* Turn Genders Into Binary / Numeric

* One Hot Encode Jobs
#Hot Encode means we take categories and put numbers instead and we will make a column for each Job

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Drop Name Feature

df = df.drop(["Name"], axis=1)

In [7]:
df

Unnamed: 0,Age,Gender,Job
0,20.0,f,Programmer
1,34.0,m,Writter
2,23.0,m,Cook
3,,f,Programmer
4,33.0,m,Teacher


In [8]:
# Impute Age

imputer = SimpleImputer(strategy="mean")
df["Age"] = imputer.fit_transform(df[["Age"]])
df

Unnamed: 0,Age,Gender,Job
0,20.0,f,Programmer
1,34.0,m,Writter
2,23.0,m,Cook
3,27.5,f,Programmer
4,33.0,m,Teacher


In [9]:
# Numeric Gender

gender_dct = {"m": 0, "f": 1}
df["Gender"] = [gender_dct[g] for g in df["Gender"]]

In [10]:
df

Unnamed: 0,Age,Gender,Job
0,20.0,1,Programmer
1,34.0,0,Writter
2,23.0,0,Cook
3,27.5,1,Programmer
4,33.0,0,Teacher


In [11]:
# OneHotEncode Jobs

encoder = OneHotEncoder()
matrix = encoder.fit_transform(df[["Job"]]).toarray()

column_names = ["Programmer", "Writter", "Cook", "Teacher"]

for i in range(len(matrix.T)):
    df[column_names[i]] = matrix.T[i]

df = df.drop("Job", axis=1)


df

Unnamed: 0,Age,Gender,Programmer,Writter,Cook,Teacher
0,20.0,1,0.0,1.0,0.0,0.0
1,34.0,0,0.0,0.0,0.0,1.0
2,23.0,0,1.0,0.0,0.0,0.0
3,27.5,1,0.0,1.0,0.0,0.0
4,33.0,0,0.0,0.0,1.0,0.0


In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class NameDropper(BaseEstimator, TransformerMixin):

    def fit(self, X, Y=None):
        return self

    def transform(self, X):
        return X.drop(["Name"], axis=1)


class AgeImputer(BaseEstimator, TransformerMixin):

    def fit(self, X, Y=None):
        return self

    def transform(self, X):
        imputer = SimpleImputer(strategy="mean")
        X["Age"]  = imputer.fit_transform(X[["Age"]])
        return X

class FeatureEncoder (BaseEstimator, TransformerMixin):

    def fit(self, X, Y=None):
        return self

    def transform(self, X):
        # Numeric Gender
        gender_dct = {"m": 0, "f": 1}
        X["Gender"] = [gender_dct[g] for g in X["Gender"]]
        # OneHotEncode Jobs
        encoder = OneHotEncoder()
        matrix = encoder.fit_transform(X[["Job"]]).toarray()

        column_names = ["Programmer", "Writter", "Cook", "Teacher"]

        for i in range(len(matrix.T)):
            X[column_names[i]] = matrix.T[i]

        return X.drop("Job", axis=1)


In [17]:
data = {
    "Name": ["Fiona", "Gerald", "Hans", "Isabella", "Jacob"], 
    "Age": [20, 34, None, None, 33],
    "Gender": ["f", "m", "m", "f", "m"],
    "Job": ["Writter", "Programmer", "Programmer", "Programmer", "Teacher"]
}

df2 = pd.DataFrame(data)

df2

Unnamed: 0,Name,Age,Gender,Job
0,Fiona,20.0,f,Writter
1,Gerald,34.0,m,Programmer
2,Hans,,m,Programmer
3,Isabella,,f,Programmer
4,Jacob,33.0,m,Teacher


In [18]:
dropper = NameDropper()
dropper.fit_transform(df2)

Unnamed: 0,Age,Gender,Job
0,20.0,f,Writter
1,34.0,m,Programmer
2,,m,Programmer
3,,f,Programmer
4,33.0,m,Teacher


In [19]:
df2

Unnamed: 0,Name,Age,Gender,Job
0,Fiona,20.0,f,Writter
1,Gerald,34.0,m,Programmer
2,Hans,,m,Programmer
3,Isabella,,f,Programmer
4,Jacob,33.0,m,Teacher


In [23]:
dropper =  NameDropper()
imp = AgeImputer()
enc = FeatureEncoder()

enc.fit_transform(imp.fit_transform((dropper.fit_transform(df2))))

Unnamed: 0,Age,Gender,Programmer,Writter,Cook
0,20.0,1,0.0,0.0,1.0
1,34.0,0,1.0,0.0,0.0
2,29.0,0,1.0,0.0,0.0
3,29.0,1,1.0,0.0,0.0
4,33.0,0,0.0,1.0,0.0


In [24]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("dropper",  NameDropper()),
    ("imp", AgeImputer()),
    ("enc", FeatureEncoder())
])

# df2

pipe.fit_transform(df2)

Unnamed: 0,Age,Gender,Programmer,Writter,Cook
0,20.0,1,0.0,0.0,1.0
1,34.0,0,1.0,0.0,0.0
2,29.0,0,1.0,0.0,0.0
3,29.0,1,1.0,0.0,0.0
4,33.0,0,0.0,1.0,0.0
