# Import Pandas Library

In [10]:
import pandas as pd

## Load dataset

In [27]:
df = pd.read_csv('Data.csv')

### Check dataset

In [12]:
print(df)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


### Split feature X

In [65]:
X = df.iloc[:, :-1]

In [66]:
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


### Split label y

In [29]:
y = df.iloc[:, -1]

In [30]:
print(y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


## Handle missing value

### Import SimpleImputer and numpy

In [31]:
from sklearn.impute import SimpleImputer
import numpy as np

### Replace nan value with mean

In [67]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X.iloc[:, 1:3] = imputer.fit_transform(X.iloc[:, 1:3])

In [68]:
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


## Handle Categories value

### import libraries

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

### Feature categories

In [74]:
print(X.columns)

Index(['Country', 'Age', 'Salary'], dtype='object')


In [75]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = pd.DataFrame(ct.fit_transform(X))

In [76]:
print(X)

     0    1    2          3             4
0  1.0  0.0  0.0  44.000000  72000.000000
1  0.0  0.0  1.0  27.000000  48000.000000
2  0.0  1.0  0.0  30.000000  54000.000000
3  0.0  0.0  1.0  38.000000  61000.000000
4  0.0  1.0  0.0  40.000000  63777.777778
5  1.0  0.0  0.0  35.000000  58000.000000
6  0.0  0.0  1.0  38.777778  52000.000000
7  1.0  0.0  0.0  48.000000  79000.000000
8  0.0  1.0  0.0  50.000000  83000.000000
9  1.0  0.0  0.0  37.000000  67000.000000


### Label categories

In [47]:
from sklearn.preprocessing import LabelEncoder

In [48]:
le = LabelEncoder()
y = le.fit_transform(y)

In [49]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Spliting data into train and test set

### Import train_test_split library

In [77]:
from sklearn.model_selection import train_test_split

### Split the data

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
print(X_train)

     0    1    2          3             4
5  1.0  0.0  0.0  35.000000  58000.000000
0  1.0  0.0  0.0  44.000000  72000.000000
7  1.0  0.0  0.0  48.000000  79000.000000
2  0.0  1.0  0.0  30.000000  54000.000000
9  1.0  0.0  0.0  37.000000  67000.000000
4  0.0  1.0  0.0  40.000000  63777.777778
3  0.0  0.0  1.0  38.000000  61000.000000
6  0.0  0.0  1.0  38.777778  52000.000000


In [80]:
print(X_test)

     0    1    2     3        4
8  0.0  1.0  0.0  50.0  83000.0
1  0.0  0.0  1.0  27.0  48000.0


## Scale the data

### Import libraries

In [81]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train[: 1:] = ss.fit_transform(X_train[: 1:])
X_test[: 1:] = ss.transform(X_train[: 1:])

In [82]:
print(X_train)

     0    1    2          3             4
5  0.0  0.0  0.0   0.000000      0.000000
0  1.0  0.0  0.0  44.000000  72000.000000
7  1.0  0.0  0.0  48.000000  79000.000000
2  0.0  1.0  0.0  30.000000  54000.000000
9  1.0  0.0  0.0  37.000000  67000.000000
4  0.0  1.0  0.0  40.000000  63777.777778
3  0.0  0.0  1.0  38.000000  61000.000000
6  0.0  0.0  1.0  38.777778  52000.000000


In [83]:
print(X_test)

     0    1    2     3        4
8 -1.0  0.0  0.0 -35.0 -58000.0
1  0.0  0.0  1.0  27.0  48000.0
