# Data Preprocessing

This exercise walks through: 
* Importing libraries
* Importing a dataset
* Exporing a dataset
* Handling missing data
* Handling categorical features
* Dividing data into training set and test set
* Scaling feature set

In [1]:
# Importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
# Importing the dataset
dataset = pd.read_csv('Sales_Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

In [3]:
# Exploring the dataset
print(dataset.shape)
print(dataset.describe())
dataset.head(10)

(150, 4)
              Age        Salary
count  131.000000    135.000000
mean    37.519084  59385.185185
std      8.146119  12867.861844
min     21.000000  32000.000000
25%     32.000000  50000.000000
50%     37.000000  60000.000000
75%     43.000000  68500.000000
max     55.000000  88000.000000


Unnamed: 0,State,Age,Salary,Purchased
0,Texas,44.0,72000.0,No
1,New York,27.0,48000.0,Yes
2,California,30.0,54000.0,No
3,New York,38.0,61000.0,No
4,California,40.0,,Yes
5,Texas,35.0,58000.0,Yes
6,New York,,52000.0,No
7,Texas,48.0,79000.0,Yes
8,California,50.0,83000.0,No
9,Texas,37.0,67000.0,Yes


In [4]:
# Handling missing data
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

pd.DataFrame(X).head(10)

Unnamed: 0,0,1,2
0,Texas,44.0,72000.0
1,New York,27.0,48000.0
2,California,30.0,54000.0
3,New York,38.0,61000.0
4,California,40.0,59385.2
5,Texas,35.0,58000.0
6,New York,37.5191,52000.0
7,Texas,48.0,79000.0
8,California,50.0,83000.0
9,Texas,37.0,67000.0


In [5]:
# Handling categorical features - encoding the independent variables
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()

pd.DataFrame(X).head(10)

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,1.0,44.0,72000.0
1,0.0,1.0,0.0,27.0,48000.0
2,1.0,0.0,0.0,30.0,54000.0
3,0.0,1.0,0.0,38.0,61000.0
4,1.0,0.0,0.0,40.0,59385.185185
5,0.0,0.0,1.0,35.0,58000.0
6,0.0,1.0,0.0,37.519084,52000.0
7,0.0,0.0,1.0,48.0,79000.0
8,1.0,0.0,0.0,50.0,83000.0
9,0.0,0.0,1.0,37.0,67000.0


In [6]:
# Handling categorical features - encoding the dependent variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

pd.DataFrame(y).head()

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,1


In [7]:
# Dividing data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print(pd.DataFrame(X_train).shape)
print(pd.DataFrame(X_test).shape)
print(pd.DataFrame(y_train).shape)
print(pd.DataFrame(y_test).shape)

(120, 5)
(30, 5)
(120, 1)
(30, 1)


In [8]:
# Scaling feature set (if necessary)
scale_X = StandardScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)
scale_y = StandardScaler()
y_train = scale_y.fit_transform(y_train.reshape(-1,1))

print(pd.DataFrame(X_train).head())
print(pd.DataFrame(X_test).head())
print(pd.DataFrame(y_train).head())
print(pd.DataFrame(y_test).head())

         0         1         2         3         4
0 -0.66767  1.658312 -0.859727  1.257440  1.107393
1 -0.66767  1.658312 -0.859727 -0.349717 -0.348330
2 -0.66767 -0.603023  1.163160 -0.012337  0.783899
3 -0.66767  1.658312 -0.859727 -0.617576 -0.671824
4 -0.66767  1.658312 -0.859727 -1.555084 -1.399685
          0         1         2         3         4
0 -0.667670  1.658312 -0.859727  0.587791  0.460405
1  1.497746 -0.603023 -0.859727 -1.019365 -0.833571
2 -0.667670  1.658312 -0.859727 -0.885435 -1.076191
3 -0.667670  1.658312 -0.859727 -1.956873  0.006315
4 -0.667670 -0.603023  1.163160  1.391369  1.592633
          0
0 -1.051315
1 -1.051315
2  0.951190
3 -1.051315
4  0.951190
   0
0  0
1  0
2  0
3  0
4  1
