## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the Dataset

In [2]:
dataset = pd.read_csv(r"datasets\Data.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Classify features and labels

In [3]:
# classify X and y
# converting data-frame to array

X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [4]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [5]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Handling Missing Data

In [6]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

In [7]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding Categorical Data

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])], remainder="passthrough")
X = ct.fit_transform(X)

In [9]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [11]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting Dataset into Training and Testing

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
print(X_train)

[[1.0 0.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 30.0 54000.0]]


In [14]:
print(X_test)

[[1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 44.0 72000.0]]


In [15]:
print(y_train)

[1 0 0 1 0 1 1 0]


In [16]:
print(y_test)

[1 0]


## Feature Scaling

#### Standardization [ better for any case ] ✅

\begin{equation*}
x = 
\frac{x-mean(x)}{std(x)}
\end{equation*}

#### Normalization [ best for data following normal distribution ]

\begin{equation*}
x = 
\frac{x-min(x)}{max(x)-min(x)}
\end{equation*}

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:,3:] = sc.fit_transform(X_train[:,3:])
X_test[:,3:] = sc.transform(X_test[:,3:])

In [18]:
print(X_train)

[[1.0 0.0 0.0 -0.2174169445587274 0.30060118852044354]
 [0.0 0.0 1.0 -0.081295031443698 -0.2106575258135387]
 [0.0 1.0 0.0 1.5521679259366548 1.6639577600777296]
 [0.0 1.0 0.0 0.1909487947863608 0.026036323415157055]
 [0.0 0.0 1.0 0.02457756764576941 -0.9775455973145121]
 [0.0 0.0 1.0 -1.5786360757090214 -1.3183847402038336]
 [1.0 0.0 0.0 1.279924099706596 1.323118617188408]
 [0.0 1.0 0.0 -1.1702703363639333 -0.8071260258698513]]


In [19]:
print(X_test)

[[1.0 0.0 0.0 -0.4896607707887862 -0.46628688298052984]
 [1.0 0.0 0.0 0.7354364472464784 0.7266501171320954]]
