# Import the libraries

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Import the Dataset

In [5]:
dataset =  pd.read_csv('Data.csv')
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [6]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [7]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Taking care of the missing data.
  # 1. delete 1% of missing data
  # 2. Numerical data (replace by mean , median ), categorical data => mode(Most frequent value).

In [9]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
x[:,1:3] = imputer.fit_transform(x[:,1:3])

In [10]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoading the valriables 
#    1. One Hot Encoading. 
#    2. Label Encoading.

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')
x = ct.fit_transform(x)

In [13]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [15]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


# splitting the dataset into training and testing set 

In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train,y_test = train_test_split(x,y,random_state = 42)

In [21]:
print(x_train)

[[1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]]


In [22]:
print(x_test)

[[0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [23]:
print(y_train)

[0 1 0 1 1 0 0]


In [24]:
print(y_test)

[0 1 1]


# Scaling the feaatures.

In [26]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,1:] = sc.fit_transform(x_train[:,1:])
x_test[:,1:] = sc.transform(x_test[:,1:])

In [27]:
print(x_train)

[[1.0 -0.6324555320336758 -0.6324555320336759 0.8790542992514155
  0.8892086044783815]
 [1.0 -0.6324555320336758 -0.6324555320336759 1.6429221730836805
  1.6782246901422972]
 [0.0 1.5811388300841895 -0.6324555320336759 -1.7944832591615116
  -1.1396899015145447]
 [1.0 -0.6324555320336758 -0.6324555320336759 -0.45771447995504805
  0.32562568614701304]
 [0.0 1.5811388300841895 -0.6324555320336759 0.11518642541915065
  -0.03757219455542401]
 [0.0 -0.6324555320336758 1.5811388300841898 -0.26674751149698184
  -0.350673815850629]
 [0.0 -0.6324555320336758 1.5811388300841898 -0.11821764714070793
  -1.3651230688470921]]


In [28]:
print(x_test)

[[0.0 1.5811388300841895 -0.6324555320336759 2.024856109999813
  2.129091024807392]
 [0.0 -0.6324555320336758 1.5811388300841898 -2.3673841645357103
  -1.8159894035121869]
 [1.0 -0.6324555320336758 -0.6324555320336759 -0.8396484168711805
  -0.68882356684945]]
