## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Getting Dataset

In [2]:
dataset = pd.read_csv("Data.csv")

In [12]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      10 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [6]:
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


## Finding missing data

In [11]:
dataset[dataset.Salary.isna()==True]

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes


In [13]:
dataset[dataset.Age.isna()==True]

Unnamed: 0,Country,Age,Salary,Purchased
6,Spain,,52000.0,No


Splitting dependent and independent variables:

In [16]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [27]:
qualitative = [column for column in X.columns if X[column].dtype=="object"]
quantitative = [column for column in X.columns if column not in qualitative]

In [30]:
print(qualitative)
print(quantitative)

['Country']
['Age', 'Salary']


### 1. Imputing missing data

In [14]:
from sklearn.preprocessing import Imputer

In [15]:
missing_imputer = Imputer(strategy="mean")

In [31]:
X[quantitative] = missing_imputer.fit_transform(X[quantitative])

In [32]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [33]:
dataset.Salary.mean()

63777.77777777778

In [34]:
dataset.Age.mean()

38.77777777777778

### 2. Enconding categorical data

In [35]:
from sklearn.preprocessing import OneHotEncoder

In [87]:
ohe = OneHotEncoder()
categorical = ohe.fit_transform(X[qualitative])
categorical_df = pd.DataFrame(data=categorical.toarray(), columns=ohe.categories_)

In [90]:
X_final = pd.merge(categorical_df, X[quantitative], how="inner", left_index=True, right_index=True)

In [91]:
X_final

Unnamed: 0,"(France,)","(Germany,)","(Spain,)",Age,Salary
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [92]:
from sklearn.preprocessing import LabelBinarizer

In [100]:
lb = LabelBinarizer()
y_final = lb.fit_transform(y)

In [101]:
y_final

array([[0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1]])

### 3. Pipeline

In [170]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Imputer

In [171]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [172]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, data, labels=None):
        return self

    def transform(self, data):
        assert isinstance(data, pd.DataFrame)

        try:
            return data[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(data.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)

In [173]:
numerical = Pipeline(steps=[
    ("numerical_columns", ColumnSelector(columns=quantitative)),
    ("mean_imputer", Imputer(strategy="mean"))
            ])

categorical = Pipeline(steps=[
    ("categorical_columns", ColumnSelector(columns=qualitative)),
    ("onehot", OneHotEncoder())
            ])

preprocessor = FeatureUnion(transformer_list=[
            ('cat', categorical),
            ('num', numerical)
                ])

In [174]:
X_final = preprocessor.fit_transform(X)

In [175]:
X_final = pd.DataFrame(X_final.toarray(), columns=list(categorical[1].categories_[0])+quantitative)

In [176]:
X_final

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [154]:
numerical = Pipeline(steps=[
    ("numerical_columns", ColumnSelector(columns=quantitative)),
    ("mean_imputer", Imputer(strategy="mean")),
    ("std_scaler", StandardScaler())
            ])

categorical = Pipeline(steps=[
    ("categorical_columns", ColumnSelector(columns=qualitative)),
    ("onehot", OneHotEncoder())
            ])

preprocessor = FeatureUnion(transformer_list=[
            ('cat', categorical),
            ('num', numerical)
                ])

In [155]:
X_final = preprocessor.fit_transform(X)

In [156]:
X_final = pd.DataFrame(X_final.toarray(), columns=list(categorical[1].categories_[0])+quantitative)

In [157]:
X_final

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,0.758874,0.7494733
1,0.0,0.0,1.0,-1.711504,-1.438178
2,0.0,1.0,0.0,-1.275555,-0.8912655
3,0.0,0.0,1.0,-0.113024,-0.2532004
4,0.0,1.0,0.0,0.177609,6.632192e-16
5,1.0,0.0,0.0,-0.548973,-0.5266569
6,0.0,0.0,1.0,0.0,-1.07357
7,1.0,0.0,0.0,1.34014,1.387538
8,0.0,1.0,0.0,1.630773,1.752147
9,1.0,0.0,0.0,-0.25834,0.2937125


## Splitting dataset into training and testing

In [190]:
from sklearn.model_selection import train_test_split

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y_final, test_size=0.2, random_state=0)

Applying pipeline:

In [192]:
X_train_final = preprocessor.fit_transform(X_train)
X_test_final = preprocessor.fit(X_test)