# Data Preprocessing

## Importing the necessary libraries and packages

In [45]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the CSV file as a DataFrame

In [46]:
# Read the raw records; the path is resolved relative to the current working directory.
DATA_FILE = 'Data.csv'
dataset = pd.read_csv(DATA_FILE)

## Exploratory Data Analysis

Structural summary of the dataset (index range, columns, non-null counts, dtypes)

In [47]:
# Print a structural summary: index range, per-column non-null counts and dtypes, memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


Description of the dataset

In [48]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


Checking whether any null values exist

In [49]:
# Count the missing entries in each column.
dataset.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

Columns of the dataset

In [50]:
# Show the column labels of the frame.
dataset.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

Correlations between the variables

In [51]:
# Pairwise correlation between the numeric columns only.
# 'Country' and 'Purchased' are still object-dtype (string) columns at this
# point, and pandas >= 2.0 raises on them instead of silently dropping
# non-numeric columns. Selecting the numeric columns explicitly works on both
# old and new pandas versions.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,Age,Salary
Age,1.0,0.982495
Salary,0.982495,1.0


## Taking Care of Missing Data

In [63]:
from sklearn.impute import SimpleImputer

# Replace every missing Age / Salary entry with the mean of its column.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# further down, so the rows that later end up in the test set contribute to
# the imputed means.
numeric_cols = ['Age', 'Salary']
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(dataset[numeric_cols])
dataset[numeric_cols] = imputer.transform(dataset[numeric_cols])
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63777.777778,1
5,0,35.0,58000.0,1
6,2,38.777778,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


## Encoding Categorical Data

In [54]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  # used further down for the train/test split

# One encoder per categorical column, kept around so the integer codes can be
# decoded back to the original labels later on.
# NOTE(review): LabelEncoder assigns codes in sorted label order, so 'Country'
# ends up with an arbitrary numeric ordering as a model feature.
le_country, le_purchased = LabelEncoder(), LabelEncoder()
for column, encoder in (('Country', le_country), ('Purchased', le_purchased)):
    dataset[column] = encoder.fit_transform(dataset[column])

In [55]:
# Position i in each array is the original label that was encoded as integer i.
for fitted_encoder in (le_country, le_purchased):
    print(fitted_encoder.classes_)

['France' 'Germany' 'Spain']
['No' 'Yes']


In [56]:
# Round-trip check: decode the integer codes back to their original labels.
for encoder, column in ((le_country, 'Country'), (le_purchased, 'Purchased')):
    print(encoder.inverse_transform(dataset[column]))

['France' 'Spain' 'Germany' 'Spain' 'Germany' 'France' 'Spain' 'France'
 'Germany' 'France']
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [57]:
# Display the frame after encoding: 'Country' and 'Purchased' now hold integer codes.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63777.777778,1
5,0,35.0,58000.0,1
6,2,38.777778,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


## Now taking 2 parts from the dataset
### X: all columns except the <b>target</b> column (<code>Purchased</code>)<br>y: only the <b>target</b> column

In [58]:
# Features are every column but the last; the target is the last column ('Purchased').
X, y = dataset.iloc[:, :-1].to_numpy(), dataset.iloc[:, -1].to_numpy()

## Split the <b>X</b> and <b>y</b> arrays into training and test sets

In [59]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Implementation of the Model

In [60]:
# Fit a decision tree on the training split and predict the held-out rows.
# random_state is pinned (same seed as the train/test split) because the tree
# randomly permutes the features at each split, so an unseeded classifier can
# give different predictions from one run to the next.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [61]:
# Encoded ground truth on the first line, encoded predictions on the second.
print(y_test, y_pred, sep='\n')

[0 0 1]
[1 1 0]


In [62]:
# Same comparison, decoded back to the original 'No' / 'Yes' labels.
for encoded_labels in (y_test, y_pred):
    print(le_purchased.inverse_transform(encoded_labels))

['No' 'No' 'Yes']
['Yes' 'Yes' 'No']
