#### Data Preprocessing

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### importing the dataset

In [2]:
dataset = pd.read_csv('Data.csv')
#get inputs / independent variables separately
#.values ensures that we get x as an array instead of being a pandas subset
x = dataset.iloc[:, :-1].values # :-1 excludes the last colum

#dependent variable / what we are trying to predict
y = dataset.iloc[:, -1].values # takes the last column

In [3]:
print(x)


[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### taking care of missing data

-This can be achieved my using the mean of the values in the colum where the data is missing
- We will compute the average age and replace missing data with average age
- Same thing with salary - take averages
- Use the simpleImputer method from sklearn library

For categorical data the strategy is to replace missing data with the most frequent value

In [5]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='mean') # takes missing values attribute
imputer.fit(x[:, 1:3]) # computes the mean for columns for age and Salary
x[:, 1:3] = imputer.transform(x[:,1:3]) #transform method performs the replacement


In [6]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


### Encoding Categorical Data
-Country independent variable and purchased dependent variables must be converted to numeric data before passing them to a model
-There is no relational order in the country variable using One Hot encoding or using Dummy variable encoding

In [12]:
#encoding the independent variables
from sklearn.compose import ColumnTransformer #replaces country column with 0 and 1
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers =[('encoder', OneHotEncoder(),[0])], remainder='passthrough') #[0] is the index of the column and remaining columns are passed throu after transformation
x=np.array(ct.fit_transform(x)) # making sure its a numpy array


In [11]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable
- As the values are simple - use LabelEncoder to replace yes with 1 and no with 0

In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [17]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


### Spliting the dataset into the test set and training set
- Training set is needed to train the model
-Test set is required to evaluate model performance
- inputs for the train_test_split: - independent and dependent variables (x,y)
- test_size = 20% in this example
- random_state = 1 - seed variable to ensure we get the same result regardless of random factors
- the variables given destructure the output from the train_test_split() function

In [19]:
from sklearn.model_selection import  train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state =1)

In [20]:
print(x_train)

[[1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 35.0 58000.0]]


In [21]:
print(x_test)

[[1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]


In [22]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [23]:
print(y_test)

[0 1]
