<a href="https://colab.research.google.com/github/cheshtadhingra/Machine_Learning_Using_Python/blob/main/data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np # deals with the array part
import matplotlib.pyplot as plt # helps making great charts (seaborn also is a great library for charts)
import pandas as pd # allow us to import dataset and modify studd

## Importing the dataset

In [None]:
dataset = pd.read_csv("/content/Data.csv")  # load the CSV file into a DataFrame using pandas' read_csv function
# features (independent variables) are the columns used as inputs to make a prediction
# the dependent variable is the column whose value we predict from the features
# NOTE(review): "/content/..." looks like a Colab session path - confirm Data.csv has been uploaded there before running



In [None]:
dataset.head() # returns the first 5 rows of the dataset (head() defaults to n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [None]:
X=dataset.iloc[:, :-1].values # iloc is integer-position based indexing: iloc[rows, columns]; here all rows and every column except the last
# Python slices include the lower bound and exclude the upper bound, so :-1 stops just before the last column
# .values extracts the selected data as a NumPy array
y = dataset.iloc[:, -1].values # all rows of the last column only (the dependent variable)

In [None]:
print(X)  # feature matrix: every column of the dataset except the last

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
print(y)  # dependent variable vector: the last column of the dataset

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer  # scikit-learn's transformer for filling in missing values

# SimpleImputer is a class; `imputer` is an instance (object) of it, not a function.
#   missing_values : the placeholder that counts as "missing" (here NaN)
#   strategy='mean': every missing entry is replaced by the mean of its OWN column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only the numeric columns (indices 1 and 2) are imputed; column 0 holds text.
# fit_transform learns the per-column means and fills the gaps in one call.
# NOTE(review): the means are computed on the full dataset, before the train/test split,
# so rows that later land in the test set contribute to them - consider fitting on the training split only.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])


In [None]:
print(X)  # the nan entries in columns 1 and 2 should now hold the respective column means

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [None]:
# Columns that hold categories must be encoded as numbers so that a model can work with them.
# One-hot encoding creates one binary (0/1) column per category.
# Unlike numbering the categories 0, 1, 2, ..., it does not suggest a false order or distance between them.


### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Each entry of `transformers` is a (name, transformer, columns) tuple:
#   'encoder'       - an arbitrary label for this step
#   OneHotEncoder() - the transformation to apply
#   [0]             - indices of the columns to encode (here the first column)
# remainder='passthrough' keeps the remaining columns unchanged instead of dropping them;
# they are appended after the encoded columns.
# sparse_threshold=0 makes fit_transform always return a dense array. With the default
# threshold the result can be a SciPy sparse matrix when the encoded data is mostly zeros,
# and np.array() does not turn such a matrix into a regular 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))  # np.array(...) guarantees a plain ndarray for the later slicing steps

In [None]:
print(X)  # column 0 is replaced by one-hot columns at the front; the passthrough columns follow

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct class labels present in y, then replace each label by its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
print(y)  # the labels are now integer codes instead of strings

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels into four sets: X_train, X_test, y_train, y_test.
#   test_size=0.2  : 20% of the rows go to the test set, the remaining 80% are used for training
#   random_state=1 : fixes the shuffling so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
print(X_train)  # feature rows selected for training (80% of the data)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
print(X_test)  # feature rows held out for testing (20% of the data)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
print(y_train)  # labels matching the rows of X_train

[0 1 0 0 1 1 0 1]


In [None]:
print(y_test)  # labels matching the rows of X_test

[0 1]


## Feature Scaling

In [None]:
# Standardization puts the numeric features on a common scale so that no feature
# dominates the others purely because of its larger magnitude.
# The scaler must be fitted AFTER the split and on the training rows only; fitting on
# the test rows would leak information the model is not supposed to have during training.
from sklearn.preprocessing import StandardScaler

# Columns 0-2 are the one-hot dummy variables and are left as they are;
# only columns 3 onward are standardized.
sc = StandardScaler().fit(X_train[:, 3:])  # learn mean and standard deviation from the training set
X_train[:, 3:] = sc.transform(X_train[:, 3:])
# The test set IS scaled as well, but only transformed with the training-set
# statistics - the scaler is never re-fitted on it.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [None]:
print(X_train)  # columns 3 onward are standardized; the one-hot columns are unchanged

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [None]:
print(X_test)  # columns 3 onward should be standardized with the training-set statistics
# NOTE(review): the saved output still shows raw values (30.0, 54000.0, ...) - re-run the scaling cell and this one so the output matches

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
