# importing libraries

In [8]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# importing dataset

In [9]:
# Location of the raw CSV; change this single constant if the file moves.
DATA_PATH = '/content/drive/MyDrive/Elina_MetaSciforTraining/Data Preprocessing/DataPreprocessing.csv'

# read_csv already returns a DataFrame, so wrapping it in pd.DataFrame(...) is
# unnecessary. `dataset` is kept as the untouched raw load, while `df` is an
# independent working copy used by the rest of the notebook.
dataset = pd.read_csv(DATA_PATH)
df = dataset.copy()
df

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes
5,India,40.0,69600.0,Yes
6,Brazil,,62400.0,No
7,India,53.0,94800.0,Yes
8,USA,55.0,99600.0,No
9,India,42.0,80400.0,Yes


In [10]:
# Split the frame by position: every column except the last forms the feature
# matrix X, and the last column forms the target vector y.
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
X = feature_frame.values
y = target_series.values

In [11]:
# Print the feature matrix taken from all but the last column of df.
print(X)

[['India' 49.0 86400.0]
 ['Brazil' 32.0 57600.0]
 ['USA' 35.0 64800.0]
 ['Brazil' 43.0 73200.0]
 ['USA' 45.0 nan]
 ['India' 40.0 69600.0]
 ['Brazil' nan 62400.0]
 ['India' 53.0 94800.0]
 ['USA' 55.0 99600.0]
 ['India' 42.0 80400.0]]


In [12]:
# Print the target vector taken from the last column of df.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Handling missing values

In [13]:
# Count the missing entries in each column of df.
df.isnull().sum()

Unnamed: 0,0
Region,0
Age,1
Income,1
Online Shopper,0


In [14]:
# Separate copy of the frame for the dropna demonstration, so df itself keeps
# its rows with missing values.
df1 = df.copy()

In [15]:
# Using dropna
# The result is derived from the untouched `df` rather than by mutating `df1`
# in place, so re-running this cell always reports the same "Before" shape.

# summarize the shape of the raw data
print("Before:", df.shape)

# drop rows with missing values (returns a new frame; `df` is not modified)
df1 = df.dropna()

# summarize the shape of the data with missing rows removed
print("After:", df1.shape)

Before: (10, 4)
After: (8, 4)


# Imputing missing values with scikit-learn

In [16]:
# Display X as it stands before the imputation step in the next cell.
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, nan],
       ['India', 40.0, 69600.0],
       ['Brazil', nan, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

In [17]:
from sklearn.impute import SimpleImputer

# Replace every NaN in columns 1 and 2 of X with the mean of its column.
# NOTE(review): the imputer is fitted on all rows of X, before the train/test
# split performed later in the notebook, so the means include the test rows.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
numeric_block = X[:, 1:3]
X[:, 1:3] = imputer.fit_transform(numeric_block)

In [18]:
# Display X again to check the result of the mean imputation on columns 1 and 2.
X

array([['India', 49.0, 86400.0],
       ['Brazil', 32.0, 57600.0],
       ['USA', 35.0, 64800.0],
       ['Brazil', 43.0, 73200.0],
       ['USA', 45.0, 76533.33333333333],
       ['India', 40.0, 69600.0],
       ['Brazil', 43.77777777777778, 62400.0],
       ['India', 53.0, 94800.0],
       ['USA', 55.0, 99600.0],
       ['India', 42.0, 80400.0]], dtype=object)

# Encoding the categorical data

In [19]:
# ColumnTransformer: one-hot encode column 0 of X and pass the remaining
# columns through unchanged.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# sparse_threshold=0 forces a dense result. With the default threshold the
# transformer may return a SciPy sparse matrix when the output is mostly
# zeros, and np.array() over a sparse matrix does not yield a 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE(review): X is overwritten here, so run this cell once per kernel
# session; a second run would one-hot encode column 0 of the already encoded
# matrix.
X = np.array(ct.fit_transform(X))

# NOTE(review): this displays the raw `df`, which this cell does not modify;
# the encoded matrix lives in `X`.
df

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes
5,India,40.0,69600.0,Yes
6,Brazil,,62400.0,No
7,India,53.0,94800.0,Yes
8,USA,55.0,99600.0,No
9,India,42.0,80400.0,Yes


In [20]:
# Print X after encoding: the one-hot columns produced from column 0 come
# first, followed by the passed-through columns.
print(X)

[[0.0 1.0 0.0 49.0 86400.0]
 [1.0 0.0 0.0 32.0 57600.0]
 [0.0 0.0 1.0 35.0 64800.0]
 [1.0 0.0 0.0 43.0 73200.0]
 [0.0 0.0 1.0 45.0 76533.33333333333]
 [0.0 1.0 0.0 40.0 69600.0]
 [1.0 0.0 0.0 43.77777777777778 62400.0]
 [0.0 1.0 0.0 53.0 94800.0]
 [0.0 0.0 1.0 55.0 99600.0]
 [0.0 1.0 0.0 42.0 80400.0]]


# Splitting the dataset

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle repeatable from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [22]:
# Print the training portion of the feature matrix.
print(X_train)

[[1.0 0.0 0.0 43.77777777777778 62400.0]
 [0.0 0.0 1.0 45.0 76533.33333333333]
 [0.0 1.0 0.0 49.0 86400.0]
 [1.0 0.0 0.0 43.0 73200.0]
 [1.0 0.0 0.0 32.0 57600.0]
 [0.0 1.0 0.0 53.0 94800.0]
 [0.0 0.0 1.0 55.0 99600.0]
 [0.0 1.0 0.0 40.0 69600.0]]


In [24]:
# Print the held-out test portion of the feature matrix.
print(X_test)

[[0.0 0.0 1.0 35.0 64800.0]
 [0.0 1.0 0.0 42.0 80400.0]]


In [25]:
# Print the target values that belong to the training rows.
print(y_train)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes']


In [26]:
# Print the target values that belong to the test rows.
print(y_test)

['No' 'Yes']


# Feature scaling

In [27]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale columns 3 onward (the columns after the one-hot block).
# The scaler learns its min/max from the training rows only and then applies
# that same mapping to the test rows.
mm = MinMaxScaler()
mm.fit(X_train[:, 3:])
X_train[:, 3:] = mm.transform(X_train[:, 3:])
X_test[:, 3:] = mm.transform(X_test[:, 3:])

In [28]:
# Print only the scaled columns (index 3 onward) of the training split.
print(X_train[:, 3:])

[[0.5120772946859904 0.11428571428571432]
 [0.5652173913043479 0.45079365079365075]
 [0.7391304347826089 0.6857142857142859]
 [0.4782608695652173 0.37142857142857144]
 [0.0 0.0]
 [0.9130434782608696 0.8857142857142857]
 [1.0 1.0]
 [0.34782608695652173 0.2857142857142858]]


In [29]:
# Print only the scaled columns (index 3 onward) of the test split.
print(X_test[:, 3:])

[[0.13043478260869557 0.17142857142857149]
 [0.4347826086956521 0.5428571428571429]]
