In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#Load the data
data = pd.read_csv('sampleExample.csv')

In [3]:
#Missing data check
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 4 columns):
Country      18 non-null object
Age          16 non-null float64
Salary       16 non-null float64
Purchased    18 non-null object
dtypes: float64(2), object(2)
memory usage: 704.0+ bytes


In [7]:
#Seperate data with features and label
features = data.iloc[:,:-1].values #numpy array
label = data.iloc[:,3].values #numpy array, remember if ':' then<[Inclusive]:[Exclusive]> otherwise whatever is mentioned will be considered.

In [8]:
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Germany', nan, 59000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [9]:
label

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes',
       'No', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'Yes'], dtype=object)

In [10]:
features.shape #vector

(18, 3)

In [11]:
label.shape #1D array , scalar

(18,)

In [12]:
#Handle Missing Data
# 1. Collect the columns that has missing data (info on dataframe)
# 2. Check the type of column who holds missing data
# age-Numerical sal-Numerical
# if col is non numeric, delete that record !!!
# if col is numeric , perform Imputation !!!
# Imputation is a process of handling missing data on a numeric column

In [13]:
#Sklearn(Sci-kit Learn) package to perform Imputation
from sklearn.preprocessing import Imputer

imputer = Imputer(missing_values='NaN',
                 strategy='mean',
                 axis =0) #column wise or along the row one and the same thing

#imputer.fit(features[:,1:3])
#fit means create an equation ---- Calc the mean
#transform means apply the equation ---- apply mean to all NaN


features[:,1:3]=imputer.fit_transform(features[:,1:3])
features



array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 64687.5],
       ['France', 35.0, 58000.0],
       ['Spain', 39.1875, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 64687.5],
       ['France', 35.0, 58000.0],
       ['Germany', 39.1875, 59000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [14]:
#Handling String Data
# Encoding Technique
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
encode = LabelEncoder() #initialization
features[:,0] = encode.fit_transform(features[:,0]) #Label Encoding
features

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 64687.5],
       [0, 35.0, 58000.0],
       [2, 39.1875, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 64687.5],
       [0, 35.0, 58000.0],
       [1, 39.1875, 59000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [15]:
encode.classes_

array(['France', 'Germany', 'Spain'], dtype=object)

In [16]:
#OHE to remove weights from the columns
hotencode = OneHotEncoder(categorical_features=[0]) #categorical_features[weightedcolumn index]
features = hotencode.fit_transform(features).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [17]:
features

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, 4.40000e+01, 7.20000e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 2.70000e+01, 4.80000e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 3.00000e+01, 5.40000e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 3.80000e+01, 6.10000e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 4.00000e+01, 6.46875e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 3.50000e+01, 5.80000e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 3.91875e+01, 5.20000e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 4.80000e+01, 7.90000e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 5.00000e+01, 8.30000e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 3.70000e+01, 6.70000e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 3.00000e+01, 5.40000e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 3.80000e+01, 6.10000e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 4.00000e+01, 6.46875e+04],
       [1.00000e+00, 0.00000e+00, 0.00

In [18]:
data = pd.read_csv('sampleExample.csv')

In [20]:
#data1 = pd.get_dummies(data.Country)
data2 = pd.concat([data,pd.get_dummies(data.Country)],axis=1)
data2

Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain
0,France,44.0,72000.0,No,1,0,0
1,Spain,27.0,48000.0,Yes,0,0,1
2,Germany,30.0,54000.0,No,0,1,0
3,Spain,38.0,61000.0,No,0,0,1
4,Germany,40.0,,Yes,0,1,0
5,France,35.0,58000.0,Yes,1,0,0
6,Spain,,52000.0,No,0,0,1
7,France,48.0,79000.0,Yes,1,0,0
8,Germany,50.0,83000.0,No,0,1,0
9,France,37.0,67000.0,Yes,1,0,0


In [21]:
data2.drop(['Country'],axis=1)

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,No,1,0,0
1,27.0,48000.0,Yes,0,0,1
2,30.0,54000.0,No,0,1,0
3,38.0,61000.0,No,0,0,1
4,40.0,,Yes,0,1,0
5,35.0,58000.0,Yes,1,0,0
6,,52000.0,No,0,0,1
7,48.0,79000.0,Yes,1,0,0
8,50.0,83000.0,No,0,1,0
9,37.0,67000.0,Yes,1,0,0


In [44]:
data2['Age'].fillna((data2['Age'].mean()), inplace=True)
data2['Salary'].fillna((data2['Salary'].mean()), inplace=True)
data2


Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain
0,France,44.0,72000.0,No,1,0,0
1,Spain,27.0,48000.0,Yes,0,0,1
2,Germany,30.0,54000.0,No,0,1,0
3,Spain,38.0,61000.0,No,0,0,1
4,Germany,40.0,64687.5,Yes,0,1,0
5,France,35.0,58000.0,Yes,1,0,0
6,Spain,39.1875,52000.0,No,0,0,1
7,France,48.0,79000.0,Yes,1,0,0
8,Germany,50.0,83000.0,No,0,1,0
9,France,37.0,67000.0,Yes,1,0,0


In [45]:
featuresDF = data2.iloc[:,[0,1,2,4,5,6]]
labelDF = data2.Purchased


In [47]:
featuresDF.head()

Unnamed: 0,Country,Age,Salary,France,Germany,Spain
0,France,44.0,72000.0,1,0,0
1,Spain,27.0,48000.0,0,0,1
2,Germany,30.0,54000.0,0,1,0
3,Spain,38.0,61000.0,0,0,1
4,Germany,40.0,64687.5,0,1,0


In [48]:
labelDF.head()

0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object