# Basics of Data Preprocessing

## 1. Importing Data

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Read the raw CSV into a DataFrame and show it.
csv_path = "Data.csv"
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Creating the feature matrix 'X'

In [6]:
# Feature matrix: the three predictor columns.
# .copy() detaches X from `dataset`, so later modifications of X are not
# flagged by pandas as writes to a slice of another DataFrame
# (SettingWithCopyWarning).
X = dataset[['Country','Age','Salary']].copy()
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


### Creating the dependent variable 'y'

In [7]:
# Target: selecting with a list keeps 'Purchased' as a one-column DataFrame
# rather than a Series.
target_cols = ['Purchased']
y = dataset[target_cols]
y

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


## 2. Handling Missing Values

In [8]:
# Boolean mask with the same shape as X: True marks a missing entry.
X.isnull()

Unnamed: 0,Country,Age,Salary
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,True
5,False,False,False
6,False,True,False
7,False,False,False
8,False,False,False
9,False,False,False


In [9]:
# Replace the missing Age with the mean of the observed ages.
# Calling fillna(inplace=True) on the selection X['Age'] mutates an
# intermediate Series: pandas raises SettingWithCopyWarning and, with
# copy-on-write enabled, X is left unchanged. Build a new frame and rebind X
# instead.
X = X.assign(Age=X['Age'].fillna(X['Age'].mean()))
X


Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [10]:
# Replace the missing Salary with the mean of the observed salaries.
# Calling fillna(inplace=True) on the selection X['Salary'] mutates an
# intermediate Series: pandas raises SettingWithCopyWarning and, with
# copy-on-write enabled, X is left unchanged. Build a new frame and rebind X
# instead.
X = X.assign(Salary=X['Salary'].fillna(X['Salary'].mean()))
X


Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


### Handling missing values using scikit-learn's imputer

In [11]:
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22.
# `SimpleImputer` is its replacement; it always imputes column-wise, so the
# old `axis=0` argument no longer exists.
from sklearn.impute import SimpleImputer

# Start again from the raw data so the missing values are present.
# .copy() gives X its own storage, so the column assignment below is not a
# write to a slice of `dataset` (no SettingWithCopyWarning).
X = dataset[['Country','Age','Salary']].copy()

numeric_cols = ['Age', 'Salary']

# Missing entries are real NaN values (np.nan), not the string 'NaN'.
# Each one is replaced by the mean of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])
X


Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


## 3. Changing Categorical Data

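### Approach 1: Find and Replace

Map each category to a number by hand, e.g. `X['Country'].replace({'France': 0, 'Germany': 1, 'Spain': 2})`.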

### Approach 2: Label Encoding

In [15]:
# Label encoding: convert Country to a categorical dtype and expose each
# category's integer code as a separate column.
# assign() returns a new frame, so nothing is written through a slice of
# `dataset` (the cause of SettingWithCopyWarning with plain column assignment).
country_cat = X['Country'].astype('category')
X = X.assign(Country=country_cat, Country_codes=country_cat.cat.codes)
X


Unnamed: 0,Country,Age,Salary,Country_codes
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,2
2,Germany,30.0,54000.0,1
3,Spain,38.0,61000.0,2
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,0
6,Spain,38.777778,52000.0,2
7,France,48.0,79000.0,0
8,Germany,50.0,83000.0,1
9,France,37.0,67000.0,0


### Approach 3: One Hot Encoding

In [16]:
# One-hot encoding: expand the integer country codes into one indicator
# column per code. get_dummies returns a new frame and leaves X untouched, so
# bind the result to a name instead of discarding it after display.
X_onehot = pd.get_dummies(X, columns = ['Country_codes'])
X_onehot

Unnamed: 0,Country,Age,Salary,Country_codes_0,Country_codes_1,Country_codes_2
0,France,44.0,72000.0,1,0,0
1,Spain,27.0,48000.0,0,0,1
2,Germany,30.0,54000.0,0,1,0
3,Spain,38.0,61000.0,0,0,1
4,Germany,40.0,63777.777778,0,1,0
5,France,35.0,58000.0,1,0,0
6,Spain,38.777778,52000.0,0,0,1
7,France,48.0,79000.0,1,0,0
8,Germany,50.0,83000.0,0,1,0
9,France,37.0,67000.0,1,0,0


## 4. Splitting Data into Training and Test Sets

In [17]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the rows for testing; random_state pins the random split so
# it is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [22]:
# Training features: the 8 rows that were not held out.
X_train

Unnamed: 0,Country,Age,Salary,Country_codes
4,Germany,40.0,63777.777778,1
9,France,37.0,67000.0,0
1,Spain,27.0,48000.0,2
6,Spain,38.777778,52000.0,2
7,France,48.0,79000.0,0
3,Spain,38.0,61000.0,2
0,France,44.0,72000.0,0
5,France,35.0,58000.0,0


In [23]:
# Test features: the 2 held-out rows (test_size = 0.2 of 10 rows).
X_test

Unnamed: 0,Country,Age,Salary,Country_codes
2,Germany,30.0,54000.0,1
8,Germany,50.0,83000.0,1


In [24]:
# Training labels, in the same row order as X_train.
y_train

Unnamed: 0,Purchased
4,Yes
9,Yes
1,Yes
6,No
7,Yes
3,No
0,No
5,Yes


In [25]:
# Test labels, in the same row order as X_test.
y_test

Unnamed: 0,Purchased
2,No
8,No
