# Data Preprocessing

## Importing the libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

## Importing the dataset

In [None]:
# Load the raw dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# Features are every column except the first and the last; the target is the
# last column. Both are taken as plain NumPy arrays.
# NOTE(review): column 0 is skipped — presumably an identifier; confirm against dataset.csv.
X, y = df.iloc[:, 1:-1].values, df.iloc[:, -1].values

In [None]:
# Inspect the raw feature matrix; its first column still holds string categories.
print(X)

[['verb' 7 4 ... 0 0 2]
 ['noun' 2 1 ... 0 580 11]
 ['adjective' 4 2 ... 0 0 11]
 ...
 ['noun' 6 4 ... 0 0 4]
 ['number' 9 0 ... 0 0 1]
 ['noun' 6 2 ... 0 0 3]]


In [None]:
# Inspect the raw target labels (still strings at this stage).
print(y)

['b2' 'a2' 'a2' ... 'b1' 'a2' 'b2']


## Encoding


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 0 and pass every other column
# through unchanged.
# sparse_threshold=0 forces a dense result: by default ColumnTransformer may
# return a scipy sparse matrix when the stacked output is sparse enough, and
# np.array() applied to a sparse matrix yields a 0-d object array rather than
# the 2-D table the following cells index into.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels first, then replace each label in y with
# its integer code. `le` is kept so codes can be mapped back to labels later.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Inspect the feature matrix after one-hot encoding the categorical column.
print(X)

[[0.0 0.0 0.0 ... 0 0 2]
 [0.0 0.0 0.0 ... 0 580 11]
 [1.0 0.0 0.0 ... 0 0 11]
 ...
 [0.0 0.0 0.0 ... 0 0 4]
 [0.0 0.0 0.0 ... 0 0 1]
 [0.0 0.0 0.0 ... 0 0 3]]


In [None]:
# Inspect the target after label encoding (integer class codes).
print(y)

[3 1 1 ... 2 1 3]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

In [None]:
# Inspect the training features produced by the split.
print(X_train)

[[0.0 0.0 0.0 ... 0 464 4]
 [0.0 0.0 0.0 ... 0 0 3]
 [0.0 0.0 0.0 ... 0 0 4]
 ...
 [0.0 0.0 0.0 ... 0 323 4]
 [1.0 0.0 0.0 ... 0 0 20]
 [0.0 0.0 0.0 ... 0 0 2]]


In [None]:
# Inspect the held-out test features produced by the split.
print(X_test)

[[0.0 0.0 0.0 ... 0 464 71]
 [0.0 0.0 0.0 ... 0 356 4]
 [0.0 0.0 0.0 ... 0 0 5]
 ...
 [0.0 0.0 0.0 ... 0 0 4]
 [0.0 0.0 0.0 ... 0 0 3]
 [0.0 0.0 0.0 ... 0 0 5]]


In [None]:
# Inspect the training labels produced by the split.
print(y_train)

[3 2 2 ... 1 0 0]


In [None]:
# Inspect the held-out test labels produced by the split.
print(y_test)

[3 4 2 1 4 4 3 1 4 0 4 4 2 3 4 4 3 4 2 0 4 0 1 3 0 3 1 4 1 0 2 3 3 0 4 4 4
 1 3 4 0 3 0 1 0 3 1 4 3 3 1 1 0 3 3 4 3 0 2 4 0 3 1 4 2 4 0 3 1 1 3 4 2 0
 3 2 4 3 4 4 0 2 3 4 2 4 3 1 1 0 2 3 3 4 4 0 0 0 0 4 4 4 4 3 0 0 4 1 1 0 4
 3 0 4 0 3 2 1 1 0 4 4 1 1 2 4 0 4 3 2 2 3 4 1 3 2 0 2 4 1 4 4 3 3 3 1 0 3
 4 0 0 1 3 1 4 4 1 2 1 4 4 0 0 3 3 3 0 4 4 1 3 0 3 1 3 4 2 1 3 1 3 3 3 3 3
 2 3 1 2 0 3 1 3 0 2 3 4 0 3 2 2 1 3 4 3 0 0 4 3 2 0 0 3 2 4 1 0 4 0 4 4 4
 3 3 3 3 4 1 1 3 2 3 2 0 3 0 3 0 1 4 3 1 0 0 3 3 4 1 0 4 3 1 4 3 2 1 0 4 1
 0 1 0 2 0 3 3 0 2 3 1 0 3 0 0 3 0 3 3 0 4 1 3 3 0 3 3 1 3 4 4 0 4 3 2 4 4
 4 3 3 2 3 0 2 4 2 3 3 0 3 1 4 4 2 3 4 4 1 3 0 4 4 2 2 4 1 3 1 1 0 4 3 1 3
 0 2 3 3 3 2 1 4 2 2 0 4 0 1 2 1 4 2 2 1 4 3 4 0 4 3 0 3 3 4 3 4 4 4 3 1 1
 4 2 3 1 3 1 3 3 2 4 3 4 0 4 2 4 2 2 0 1 1 4 4 0 4 4 1 3 2 4 0 0 3 3 3 2 0
 1 4 1 0 1 3 3 3 0 3 1 3 4 4 3 4 1 3 2 3 0 1 4 3 3 4 2 3 2 2 0 0 4 4 3 4 1
 4 1 4 4 2 3 2 1 0 1 0 3 4 1 0 1 0 4 4 4 2 4 4 2 4 4 3 4 1 2 1 4 0 1 0 4 1
 4 4 3 2 4 4 2 0 3 0 0 3 

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the columns from index 9 onward. The scaler's statistics are
# learned from the training rows only and then reused for the test rows, so no
# information from the test set influences the scaling.
# NOTE(review): 9 is presumably the number of one-hot columns produced by the
# encoder (leaving them unscaled) — confirm against the encoder's categories.
sc = StandardScaler().fit(X_train[:, 9:])
X_train[:, 9:] = sc.transform(X_train[:, 9:])
X_test[:, 9:] = sc.transform(X_test[:, 9:])

In [None]:
# Inspect the training features after scaling.
print(X_train)

[[0.0 0.0 0.0 ... -0.25824238955886414 0.15301457120237313
  -0.4315313831333375]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.47932395691964746]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.4315313831333375]
 ...
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.03169179521574734
  -0.4315313831333375]
 [1.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  0.333149797447622]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.5271165307059574]]


In [None]:
# Inspect the test features after scaling with the training-set statistics.
print(X_test)

[[0.0 0.0 0.0 ... -0.25824238955886414 0.15301457120237313
  2.7705710605494307]
 [0.0 0.0 0.0 ... -0.25824238955886414 0.011537354371472344
  -0.4315313831333375]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.3837388093470275]
 ...
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.4315313831333375]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.47932395691964746]
 [0.0 0.0 0.0 ... -0.25824238955886414 -0.45481347147853396
  -0.3837388093470275]]


# Decision Tree Classification

## Training the Decision Tree Classification model on the Training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree: entropy split criterion, all other settings left at their
# defaults, with a fixed seed so repeated runs build the same tree.
dt = DecisionTreeClassifier(
    random_state=0,
    criterion='entropy',
)
dt.fit(X_train, y_train)

## Applying Grid Search to find the best parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree.
# min_samples_split starts at 2: scikit-learn rejects an integer value of 1
# for this parameter, so every candidate using it fails to fit and is scored
# as NaN (wasting one ninth of the search and flooding the output with warnings).
parameters = {"criterion": ['gini', 'entropy'],
              "max_depth": range(1, 10),
              "min_samples_split": range(2, 10),
              "min_samples_leaf": range(1, 5)}

# Exhaustive search over the grid, scored by accuracy under 10-fold
# cross-validation on the training set, using all available CPU cores.
grid_search = GridSearchCV(estimator = dt,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)

# Keep the best mean cross-validated score and the parameters that achieved it.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 48.45 %
Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 9}


720 fits failed out of a total of 6480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
720 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklear

In [None]:
# Rebuild the tree from the tuned hyper-parameters and fit it on the full
# training set. random_state is pinned to the same seed the searched estimator
# used: without it the tree can differ between runs, so the test metrics and
# cross-validation scores below would not be reproducible.
dt = DecisionTreeClassifier(random_state=0, **best_parameters)
dt.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
# Predict the held-out rows, then print predictions (left column) next to the
# true labels (right column) for a quick visual comparison.
y_pred = dt.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[3 3]
 [4 4]
 [3 2]
 ...
 [1 2]
 [4 4]
 [0 0]]


## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Rows of the confusion matrix are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Per-class precision / recall / F1 plus the overall averages.
print(classification_report(y_true=y_test, y_pred=y_pred))

[[ 96  53  11   4   2]
 [ 30  69  22  34   5]
 [  9  30  38  58  10]
 [  5  19  53 139  51]
 [  0   5  12  83 154]]
              precision    recall  f1-score   support

           0       0.69      0.58      0.63       166
           1       0.39      0.43      0.41       160
           2       0.28      0.26      0.27       145
           3       0.44      0.52      0.48       267
           4       0.69      0.61      0.65       254

    accuracy                           0.50       992
   macro avg       0.50      0.48      0.49       992
weighted avg       0.51      0.50      0.50       992



In [None]:
# Fraction of test rows whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.5

In [None]:
# Same counts as the confusion matrix, shown as a table: true classes index
# the rows, predicted classes index the columns.
pd.crosstab(index=y_test, columns=y_pred)

col_0,0,1,2,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,96,53,11,4,2
1,30,69,22,34,5
2,9,30,38,58,10
3,5,19,53,139,51
4,0,5,12,83,154


## Applying k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Re-estimate accuracy with 10-fold cross-validation on the training set and
# report the mean and spread across folds as percentages.
accuracies = cross_val_score(dt, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * np.mean(accuracies)))
print("Standard Deviation: {:.2f} %".format(100 * np.std(accuracies)))

Accuracy: 48.37 %
Standard Deviation: 1.90 %
