# Data Preprocessing

### Import the dataset

In [1]:
dataset = read.csv('Data.csv')

In [2]:
dataset # Unlike python indexing starts with 1 in R

Country,Age,Salary,Purchased
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


### Taking care of missing data

In [7]:
dataset$Age = ifelse(is.na(dataset$Age), 
                     ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)), 
                         dataset$Age)
dataset$Salary = ifelse(is.na(dataset$Salary), 
                        ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)), 
                            dataset$Salary)

In [8]:
dataset

Country,Age,Salary,Purchased
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,63777.78,Yes
France,35.0,58000.0,Yes
Spain,38.77778,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


### Encoding categorical data

In [10]:
dataset$Country = factor(dataset$Country, 
                         levels = c('France', 'Spain', 'Germany'),
                         labels = c(1, 2, 3))

In [11]:
dataset

Country,Age,Salary,Purchased
1,44.0,72000.0,No
2,27.0,48000.0,Yes
3,30.0,54000.0,No
2,38.0,61000.0,No
3,40.0,63777.78,Yes
1,35.0,58000.0,Yes
2,38.77778,52000.0,No
1,48.0,79000.0,Yes
3,50.0,83000.0,No
1,37.0,67000.0,Yes


In [12]:
dataset$Purchased = factor(dataset$Purchased, 
                         levels = c('No', 'Yes'),
                         labels = c(0, 1))

In [13]:
dataset

Country,Age,Salary,Purchased
1,44.0,72000.0,0
2,27.0,48000.0,1
3,30.0,54000.0,0
2,38.0,61000.0,0
3,40.0,63777.78,1
1,35.0,58000.0,1
2,38.77778,52000.0,0
1,48.0,79000.0,1
3,50.0,83000.0,0
1,37.0,67000.0,1


### Splitting the dataset into the Training set and Test set

In [15]:
# install.packages('caTools')

In [16]:
library(caTools)

In [18]:
set.seed(42)
split = sample.split(dataset$Purchased, SplitRatio = 0.8)

In [19]:
split # TRUE = Training set, FALSE = Test set

In [20]:
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

In [21]:
training_set

Unnamed: 0,Country,Age,Salary,Purchased
1,1,44.0,72000,0
2,2,27.0,48000,1
4,2,38.0,61000,0
6,1,35.0,58000,1
7,2,38.77778,52000,0
8,1,48.0,79000,1
9,3,50.0,83000,0
10,1,37.0,67000,1


In [31]:
dim(training_set)[1]

In [22]:
test_set

Unnamed: 0,Country,Age,Salary,Purchased
3,3,30,54000.0,0
5,3,40,63777.78,1


In [32]:
dim(test_set)[1]

### Feature Scaling

In [26]:
training_set = scale(training_set)
test_set = scale(test_set)

ERROR: Error in colMeans(x, na.rm = TRUE): 'x' must be numeric


In [27]:
training_set[, 2:3] = scale(training_set[, 2:3])
test_set[, 2:3] = scale(test_set[, 2:3])

In [28]:
training_set

Unnamed: 0,Country,Age,Salary,Purchased
1,1,0.5746726,0.5594249,0
2,2,-1.7090912,-1.3586033,1
4,2,-0.2313617,-0.3196714,0
6,1,-0.6343788,-0.5594249,1
7,2,-0.1268758,-1.0389319,0
8,1,1.1120288,1.1188498,1
9,3,1.3807069,1.4385211,0
10,1,-0.3657007,0.1598357,1


In [29]:
test_set

Unnamed: 0,Country,Age,Salary,Purchased
3,3,-0.7071068,-0.7071068,0
5,3,0.7071068,0.7071068,1
