## import libraries

In [34]:
import numpy as np  # for working with arraya
import matplotlib.pyplot as plt  # for drawing beautiful charts
import pandas as pd  # for preprocess and importing dataset

## import dataset

In [35]:
dataset = pd.read_csv('Data.csv')
print(dataset)
print('-----------------------------------------------')
print(type(dataset))
print('-----------------------------------------------')
print(dataset.describe())
print('-----------------------------------------------')
print(dataset.head(3))
print('-----------------------------------------------')
print(dataset.tail())
print('-----------------------------------------------')
print(dataset.values)
print('-----------------------------------------------')
print(type(dataset.values))


   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes
-----------------------------------------------
<class 'pandas.core.frame.DataFrame'>
-----------------------------------------------
             Age        Salary
count   9.000000      9.000000
mean   38.777778  63777.777778
std     7.693793  12265.579662
min    27.000000  48000.000000
25%    35.000000  54000.000000
50%    38.000000  61000.000000
75%    44.000000  72000.000000
max    50.000000  83000.000000
-----------------------------------------------
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No

In [36]:
dataset = dataset.values
x = dataset[:, :-1]
y = dataset[:, -1]

print(x)
print(y)
print(type(x))
print(type(y))
print(x.shape)
print(y.shape)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(10, 3)
(10,)


## taking care of missing data
 - if no. of missing entries are very less compared to data size we have ==> we can delete them/drop them
 - Otherwise, we can either replace missing values with average values.
 - In case of categorical values, we can replace missing values with maximum occurred categorical value
 - We can also replace missing categorical values in the same proportion as other categorical values present 

In [37]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:,1:3])
print("x after imputer fit:")
print(x)
print('-----------------------------------------------')
print(x[:,1:3])
print('-----------------------------------------------')
print(imputer.transform(x[:,1:3]))
print('-----------------------------------------------')
print(x[:,1:3])
print('-----------------------------------------------')
x[:,1:3]=imputer.transform(x[:,1:3]) # it returns the array with averaging out missing values, and doesn't modify in the array itself
print("x after imputer transform:")
print(x)

x after imputer fit:
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
-----------------------------------------------
[[44.0 72000.0]
 [27.0 48000.0]
 [30.0 54000.0]
 [38.0 61000.0]
 [40.0 nan]
 [35.0 58000.0]
 [nan 52000.0]
 [48.0 79000.0]
 [50.0 83000.0]
 [37.0 67000.0]]
-----------------------------------------------
[[4.40000000e+01 7.20000000e+04]
 [2.70000000e+01 4.80000000e+04]
 [3.00000000e+01 5.40000000e+04]
 [3.80000000e+01 6.10000000e+04]
 [4.00000000e+01 6.37777778e+04]
 [3.50000000e+01 5.80000000e+04]
 [3.87777778e+01 5.20000000e+04]
 [4.80000000e+01 7.90000000e+04]
 [5.00000000e+01 8.30000000e+04]
 [3.70000000e+01 6.70000000e+04]]
-----------------------------------------------
[[44.0 72000.0]
 [27.0 48000.0]
 [30.0 54000.0]
 [38.0 61000.0]
 [40.0 nan]
 [35.0 58000.0]
 [nan 5200

# Encoding Categorical data

### Encoding the independent variable
  - string values which we can't compare; where one value is neither less nor greater than another.
  - Like: name of person, city he lives in, etc.

In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = ct.fit_transform(x)
print(x)
print(type(x))
print(x.shape)

[[0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]
<class 'numpy.ndarray'>


### Encoding the dependent variable
  - string values which can be compared.
  - Like: Did person bought: yes or no; grade in exams: a,b,c,d,e,f; etc.

In [40]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(y)
print(type(y))
print(y.shape)

[0 1 0 0 1 1 0 1 0 1]
<class 'numpy.ndarray'>
(10,)


## Splitting the dataset into the Training set and Test set
 - We must split our dataset first, and then go for feature scaling.
 - This is because, the data that will be given to us for prediction, will not be known to us beforehand. So, even for testing data, we want to maintain the ideal state. If we do feature scaling on whole dataset, it will be like leakage of future data. As we have already taken in account the data, that we were supposed to be unaware of.

In [42]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)
print(x_train)
print(x_test)
print(y_train)
print(y_test)

[[1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 35.0 58000.0]]
[[1.0 0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]
[0 1 0 0 1 1 0 1]
[0 1]


## feature scaling
 - is done, so that higher value columns don't ignore lower value columns
 - values after feature scaling generally ranges between (-3 to 3).
 - we should not apply feature scaling on the categorical data that we encode. Because, that way, they will lose their meaning. France column value '1', means person was from France. After feature scaling, if value becomes 0.3, what will it mean.
 - For training data, we use 'fit_transform'.
 - For testing data, we must use 'transform', as we wish to transform them on the scale of testing data. As, test data will not be known before hand.

In [45]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])
print(x_train)
print(x_test)

[[1.0 0.0 0.0 1.2909944487358056 -0.19159184384578545 -1.0781259408412425]
 [1.0 0.0 1.0 -0.7745966692414834 -0.014117293757057777
  -0.07013167641635372]
 [0.0 1.0 0.0 -0.7745966692414834 0.566708506533324 0.633562432710455]
 [1.0 0.0 0.0 1.2909944487358056 -0.30453019390224867
  -0.30786617274297867]
 [1.0 0.0 0.0 1.2909944487358056 -1.9018011447007988 -1.420463615551582]
 [0.0 1.0 0.0 -0.7745966692414834 1.1475343068237058 1.232653363453549]
 [1.0 0.0 1.0 -0.7745966692414834 1.4379472069688968 1.5749910381638885]
 [0.0 1.0 0.0 -0.7745966692414834 -0.7401495441200351 -0.5646194287757332]]
[[1.0 0.0 1.0 -0.7745966692414834 -1.4661817944830124 -0.9069571034860727]
 [0.0 1.0 0.0 -0.7745966692414834 -0.44973664397484414 0.2056403393225306]]
