# Data Preprocessing Tools

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [6]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = 'Data.csv'
df = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first five rows of the loaded dataset.
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [8]:
# Dimensions of the frame as (rows, columns).
df.shape

(10, 4)

In [10]:
# Feature matrix = every column except the last; target vector = the last column.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [11]:
# Target vector taken from the last column (Purchased).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data

In [12]:
# Number of missing entries in each column.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [13]:
# Show the whole frame to locate the missing Age and Salary entries.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [29]:
# Impute the missing Salary with the column mean, rounded to 2 decimal places.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df['Salary'] selection: that chained in-place
# form is deprecated and leaves df unchanged under pandas Copy-on-Write.
df['Salary'] = df['Salary'].fillna(round(df['Salary'].mean(), 2))

In [30]:
# Re-inspect the frame after imputing Salary.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [33]:
# Keep Age at two decimal places.
df['Age'] = df['Age'].round(2)

In [34]:
# Impute the missing Age with the column mean.
# - Assigned back to the column instead of fillna(inplace=True) on the
#   df['Age'] selection, which is deprecated and leaves df unchanged under
#   pandas Copy-on-Write.
# - The mean is rounded to 2 decimal places so the imputed value has the same
#   precision as the rest of the column (the Salary imputation does the same);
#   otherwise the fill value would be the unrounded mean when the notebook is
#   run top to bottom.
df['Age'] = df['Age'].fillna(round(df['Age'].mean(), 2))

In [36]:
# Round Salary to whole units and store it as integers.
# Vectorised round() + astype replaces the per-element Python lambda; both
# round halves to the nearest even integer, so the values are the same.
df['Salary'] = df['Salary'].round().astype('int64')

In [37]:
# Sanity check: df is still a DataFrame after the column updates above.
type(df)

pandas.core.frame.DataFrame

In [38]:
# Rebuild the feature matrix (all but the last column) and the target vector
# (last column) from the imputed frame.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [39]:
# Feature matrix rebuilt from the imputed frame.
X

array([['France', 44.0, 72000],
       ['Spain', 27.0, 48000],
       ['Germany', 30.0, 54000],
       ['Spain', 38.0, 61000],
       ['Germany', 40.0, 63778],
       ['France', 35.0, 58000],
       ['Spain', 38.78, 52000],
       ['France', 48.0, 79000],
       ['Germany', 50.0, 83000],
       ['France', 37.0, 67000]], dtype=object)

In [40]:
# Target vector rebuilt from the imputed frame (still the raw 'Yes'/'No' labels).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Encoding categorical data

### Encoding the Independent Variable

In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country) and pass the remaining columns through
# unchanged; the encoded columns are placed first in the result.
# sparse_threshold=0 forces a dense ndarray: np.array() applied to a SciPy
# sparse matrix would produce a 0-d object array instead of a 2-D matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

X = np.array(ct.fit_transform(X))
X

array([[1.0, 0.0, 0.0, 44.0, 72000],
       [0.0, 0.0, 1.0, 27.0, 48000],
       [0.0, 1.0, 0.0, 30.0, 54000],
       [0.0, 0.0, 1.0, 38.0, 61000],
       [0.0, 1.0, 0.0, 40.0, 63778],
       [1.0, 0.0, 0.0, 35.0, 58000],
       [0.0, 0.0, 1.0, 38.78, 52000],
       [1.0, 0.0, 0.0, 48.0, 79000],
       [0.0, 1.0, 0.0, 50.0, 83000],
       [1.0, 0.0, 0.0, 37.0, 67000]], dtype=object)

### Encoding the Dependent Variable

In [43]:
from sklearn.preprocessing import LabelEncoder

# Learn the label classes from y, then replace y with its integer codes.
le = LabelEncoder().fit(y)

y = le.transform(y)

In [44]:
# Encoded target: 'No' -> 0, 'Yes' -> 1.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
# random_state pins the shuffle so the split, and every result derived from
# it, is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [46]:
# Training rows of the feature matrix.
X_train

array([[0.0, 0.0, 1.0, 27.0, 48000],
       [0.0, 0.0, 1.0, 38.78, 52000],
       [1.0, 0.0, 0.0, 48.0, 79000],
       [0.0, 1.0, 0.0, 50.0, 83000],
       [0.0, 1.0, 0.0, 40.0, 63778],
       [0.0, 1.0, 0.0, 30.0, 54000],
       [0.0, 0.0, 1.0, 38.0, 61000],
       [1.0, 0.0, 0.0, 35.0, 58000]], dtype=object)

In [47]:
# Columns from index 3 onward: Age and Salary (columns 0-2 are the one-hot Country columns).
X_train[:,3:]

array([[27.0, 48000],
       [38.78, 52000],
       [48.0, 79000],
       [50.0, 83000],
       [40.0, 63778],
       [30.0, 54000],
       [38.0, 61000],
       [35.0, 58000]], dtype=object)

In [48]:
# Held-out test rows of the feature matrix.
X_test

array([[1.0, 0.0, 0.0, 44.0, 72000],
       [1.0, 0.0, 0.0, 37.0, 67000]], dtype=object)

In [49]:
# Confirm the split produced a NumPy array (the scaling below assigns into array slices).
type(X_train)

numpy.ndarray

## Feature Scaling
- Standardize the Age and Salary feature columns; the one-hot encoded Country columns are left unscaled.

In [50]:
from sklearn.preprocessing import StandardScaler

# Scaler instance; it is fitted on the training split in the next cell.
ss = StandardScaler()


In [51]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits. Only columns 3 onward (Age, Salary) are
# scaled; the one-hot columns 0-2 are left as they are.
ss.fit(X_train[:, 3:])
X_train[:, 3:] = ss.transform(X_train[:, 3:])
X_test[:, 3:] = ss.transform(X_test[:, 3:])

In [52]:
# Training features after scaling: columns 0-2 unchanged, columns 3+ standardized.
X_train

array([[0.0, 0.0, 1.0, -1.5272489276025045, -1.2184547069320466],
       [0.0, 0.0, 1.0, 0.05820975203243801, -0.8787506641553343],
       [1.0, 0.0, 0.0, 1.2991205352441668, 1.4142516245874743],
       [0.0, 1.0, 0.0, 1.5682985793248023, 1.7539556673641867],
       [0.0, 1.0, 0.0, 0.22240835892162542, 0.12150788980069531],
       [0.0, 1.0, 0.0, -1.1234818614815514, -0.7088986427669781],
       [0.0, 0.0, 1.0, -0.046769685159009936, -0.11441656790773144],
       [1.0, 0.0, 0.0, -0.450536751279963, -0.3691945999902657]],
      dtype=object)

In [53]:
# Test features scaled with the scaler that was fitted on the training split.
X_test

array([[1.0, 0.0, 0.0, 0.7607644470828961, 0.8197695497282276],
       [1.0, 0.0, 0.0, -0.18135870719932762, 0.39513949625733713]],
      dtype=object)