In [1]:
# Importing the Libraries
import pandas as pd
import numpy as np

In [2]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation and convergence warnings raised by the libraries below.
warnings.filterwarnings("ignore")

## Pre-Processing Steps

In [3]:
# Load the training and test files. Neither CSV carries a header row, so
# header=None makes pandas label the columns 0..6 instead of consuming the
# first record as column names.
cars_train, cars_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('cars_train.csv', 'cars_test.csv')
)

In [4]:
# (rows, columns) of the training frame, followed by a preview of its first five records.
print(cars_train.shape)
cars_train.head()

(1382, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,high,3,more,small,low,unacc
1,low,vhigh,3,4,small,med,unacc
2,low,high,5more,more,big,low,unacc
3,high,med,4,2,small,med,unacc
4,low,low,3,more,big,med,good


In [5]:
# Give both frames the same descriptive headers: six features followed by the
# target column `classes`.
for frame in (cars_train, cars_test):
    frame.columns = ['buying', 'maint', 'doors', 'persons',
                     'lug_boot', 'safety', 'classes']

In [6]:
# Confirm the new column headers on the training frame.
cars_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,vhigh,high,3,more,small,low,unacc
1,low,vhigh,3,4,small,med,unacc
2,low,high,5more,more,big,low,unacc
3,high,med,4,2,small,med,unacc
4,low,low,3,more,big,med,good


In [7]:
# Confirm the new column headers on the test frame.
cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,med,vhigh,5more,4,small,low,unacc
1,vhigh,high,2,2,big,med,unacc
2,low,high,2,more,small,low,unacc
3,vhigh,vhigh,3,2,big,high,unacc
4,low,med,4,4,med,med,good


In [8]:
# No need for Feature Selection

In [9]:
# Count missing (NaN) values per column in the training data.
cars_train.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
classes     0
dtype: int64

In [10]:
# Count missing (NaN) values per column in the test data.
cars_test.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
classes     0
dtype: int64

There are NO missing values, so we can perform further steps

In [11]:
# Count fully duplicated rows in the test file. This is informational only:
# no duplicate removal is performed on either file in this notebook.
cars_test.duplicated().sum()

0

In [12]:
# Since, all variable are Categorical so not performing Outliers Detection (Box-plot)

In [13]:
# Drop the target column from the test file so it looks like real, unlabeled data.
# A non-mutating drop with errors="ignore" keeps this cell safe to re-run; the
# previous in-place drop raised KeyError on a second execution because the
# column was already gone.
cars_test = cars_test.drop(columns="classes", errors="ignore")
cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,med,vhigh,5more,4,small,low
1,vhigh,high,2,2,big,med
2,low,high,2,more,small,low
3,vhigh,vhigh,3,2,big,high
4,low,med,4,4,med,med


In [14]:
# Columns to label-encode in the training frame: the six features plus the target.
colname = cars_train.columns
colname

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'classes'], dtype='object')

In [15]:
# Label-encode every training column (features and target) in place.
from sklearn.preprocessing import LabelEncoder

# One encoder object is re-fitted per column, so after the loop `le` only
# remembers the categories of the last column processed.
le = LabelEncoder()

for x in colname:
    encoded_column = le.fit_transform(cars_train[x])
    cars_train[x] = encoded_column

In [16]:
cars_train.head()

# LabelEncoder numbers the sorted category names, so for the `classes` target:
# acc -> 0
# good -> 1
# unacc -> 2
# vgood -> 3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,3,0,1,2,2,1,2
1,1,3,1,1,2,2,2
2,1,0,3,2,0,1,2
3,0,2,2,0,2,2,2
4,1,1,1,2,0,2,1


In [17]:
# Rebind colname to the test frame's columns (features only - `classes` was dropped above).
colname = cars_test.columns
colname

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], dtype='object')

In [18]:
# Convert Categorical data into Numerical for the Testing data
from sklearn.preprocessing import LabelEncoder

# Encode the test file with encoders fitted on the TRAINING categories, so a
# given category gets the same integer code in both frames. Fitting a fresh
# encoder on the test file alone only lines up when every category occurs in
# both files; otherwise the codes shift silently and predictions are wrong.
# cars_train is already encoded at this point, so the raw training values are
# re-read from disk purely as the reference vocabulary.
train_raw = pd.read_csv('cars_train.csv', header=None)
train_raw.columns = cars_train.columns

for x in colname:
    le = LabelEncoder()
    le.fit(train_raw[x])
    # transform() raises for a value that was never seen in training, which is
    # preferable to giving it a misleading code. Run this cell once per load:
    # already-encoded integers are not valid inputs for these encoders.
    cars_test[x] = le.transform(cars_test[x])

In [19]:
cars_test.head()

# Only the six feature columns appear here; `classes` was dropped from cars_test.
# For reference, the target codes used by the model's predictions are:
# acc -> 0
# good -> 1
# unacc -> 2
# vgood -> 3

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,2,3,3,1,2,1
1,3,0,0,0,0,2
2,1,0,0,2,2,1
3,3,3,1,0,0,0
4,1,2,2,1,1,2


In [20]:
# Split the encoded training frame into a feature matrix and a target vector.
X = cars_train.iloc[:, :-1].values             # every column except the last -> independent variables
Y = cars_train.iloc[:, -1].values.astype(int)  # last column (`classes`) -> dependent variable

In [21]:
# Sanity check: X should be (n_rows, 6 features) and Y a 1-D vector of n_rows labels.
print(X.shape)
print(Y.shape)

(1382, 6)
(1382,)


In [22]:
# In this case, scaling is ideally not required: every feature is a label-encoded category with only a few levels (codes 0-3), so the columns are already on a similar scale.

In [23]:
# Standardise each feature column to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on all labelled rows *before* the
# train/validation split below, so validation rows contribute to the mean and
# standard deviation. The same fitted scaler is reused later for cars_test.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [24]:
# Inspect the standardised feature matrix (numpy abbreviates the middle rows).
print(X)

[[ 1.33507272 -1.3488262  -0.45682233  1.21505861  1.22565305  0.00176987]
 [-0.44760409  1.32688358 -0.45682233 -0.01064285  1.22565305  1.22474807]
 [-0.44760409 -1.3488262   1.33418038  1.21505861 -1.21505663  0.00176987]
 ...
 [-1.33894249  1.32688358  1.33418038 -0.01064285  0.00529821 -1.22120833]
 [ 0.44373431  0.43498032  0.43867903 -0.01064285 -1.21505663  0.00176987]
 [ 0.44373431 -0.45692294  1.33418038  1.21505861  1.22565305 -1.22120833]]


X --> After standardisation each column has mean 0 and unit variance; the values printed above lie roughly between -1.4 and +1.4.

In [25]:
from sklearn.model_selection import train_test_split
 
# Hold out 20% of the labelled rows for validation (the library default is 25%).
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=10)

**Running Decision Tree Model**

In [26]:
# Baseline Decision Tree: Gini impurity, no depth or leaf limits, fixed seed.
from sklearn.tree import DecisionTreeClassifier

model_DT = DecisionTreeClassifier(criterion="gini", random_state=10)

# Train on the training split, then predict the held-out validation split.
model_DT.fit(X_train, Y_train)
Y_pred = model_DT.predict(X_test)

# Show (actual, predicted) pairs for a quick visual comparison.
print([(actual, predicted) for actual, predicted in zip(Y_test, Y_pred)])

[(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (1, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (0, 0), (2, 2), (0, 0), (3, 3), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 0), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (3, 3), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (3, 3), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (0, 0),

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the baseline Decision Tree on the validation split. All three
# summaries are computed first and then printed in the usual order.
cfm = confusion_matrix(Y_test, Y_pred)            # rows: actual class, columns: predicted class
report_text = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 69   1   1   0]
 [  4   8   0   0]
 [  0   0 185   0]
 [  0   0   0   9]]
Classification report: 
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        71
           1       0.89      0.67      0.76        12
           2       0.99      1.00      1.00       185
           3       1.00      1.00      1.00         9

    accuracy                           0.98       277
   macro avg       0.96      0.91      0.93       277
weighted avg       0.98      0.98      0.98       277

Accuracy of the model:  0.9783393501805054


In [28]:
# Accuracy on the same rows the tree was fitted on (training accuracy).
model_DT.score(X_train,Y_train)

1.0

Even though the training score is **100% accuracy**, this is not a case of severe overfitting,
given that the accuracy on the held-out validation data (97.8%) is close to it.

In [29]:
# Importance of each feature in the fitted tree, in the same order as the columns of X.
model_DT.feature_importances_

array([0.21976793, 0.1822093 , 0.06200593, 0.19425872, 0.09772725,
       0.24403087])

In [30]:
# Pair each feature name (every column except the target) with its importance score.
print([
    (feature, importance)
    for feature, importance in zip(cars_train.columns[:-1],
                                   model_DT.feature_importances_)
])

[('buying', 0.21976792783843332), ('maint', 0.18220929667385857), ('doors', 0.062005934167191426), ('persons', 0.19425872158174767), ('lug_boot', 0.09772725134941933), ('safety', 0.24403086838934968)]


In [31]:
# The Buying variable shows 22% importance, & so on. 



# If a variable shows an importance of exactly 0, it was never used for splitting while building the tree,
# so such zero-importance variables can be eliminated.
# We can also eliminate variables with a very low importance (e.g. 0.0005), assuming they were used for very few splits.

# In our case Safety variable was used most number of times for splitting


In [32]:
# Embedded feature-selection view: the importances come from the fitted model itself.
# Build the table in one step, then show it ranked from most to least important.
sample = pd.DataFrame({
    "Column": cars_train.columns[0:-1],
    "Imp value": model_DT.feature_importances_,
})

sample.sort_values("Imp value", ascending=False)

Unnamed: 0,Column,Imp value
5,safety,0.244031
0,buying,0.219768
3,persons,0.194259
1,maint,0.182209
4,lug_boot,0.097727
2,doors,0.062006


## Pruned Decision Tree

In [33]:
from sklearn import tree

# Write the fitted tree to model_DT.txt in Graphviz DOT format.
# export_graphviz returns None when out_file is given, so its result is no
# longer assigned back to the file handle (the original rebound `f` to None
# inside the `with` block). feature_names is passed as a plain list of
# strings, which every scikit-learn version accepts.
with open(r"model_DT.txt", "w") as dot_file:
    tree.export_graphviz(model_DT,
                         feature_names=list(cars_train.columns[0:-1]),
                         out_file=dot_file)

# Paste the contents of the generated file into webgraphviz.com to plot the decision tree.

## Tuned Decision Tree Model

In [34]:
# Tuned Decision Tree: same seed and criterion as the baseline, with limits on
# depth and on the number of samples needed to split a node or form a leaf.
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameters commonly tuned for a tree:
#   min_samples_leaf, min_samples_split, max_depth, max_features, max_leaf_nodes
# Note: this rebinds model_DT, replacing the baseline tree fitted earlier.
model_DT = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=10,
)

# Train on the training split, then predict the held-out validation split.
model_DT.fit(X_train, Y_train)
Y_pred = model_DT.predict(X_test)

# Show (actual, predicted) pairs for a quick visual comparison.
print([(actual, predicted) for actual, predicted in zip(Y_test, Y_pred)])

[(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 2), (1, 3), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (0, 0), (2, 2), (0, 0), (3, 3), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (3, 3), (0, 0), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (3, 3), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (0, 0),

In [35]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the tuned Decision Tree on the validation split. All three
# summaries are computed first and then printed in the usual order.
cfm = confusion_matrix(Y_test, Y_pred)            # rows: actual class, columns: predicted class
report_text = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 66   0   5   0]
 [  1   8   1   2]
 [  2   0 183   0]
 [  0   0   0   9]]
Classification report: 
              precision    recall  f1-score   support

           0       0.96      0.93      0.94        71
           1       1.00      0.67      0.80        12
           2       0.97      0.99      0.98       185
           3       0.82      1.00      0.90         9

    accuracy                           0.96       277
   macro avg       0.94      0.90      0.91       277
weighted avg       0.96      0.96      0.96       277

Accuracy of the model:  0.9602888086642599


# Model using Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression

# Multinomial logistic regression on the four-class target.
# `multi_class` is no longer passed: the argument is deprecated in recent
# scikit-learn releases. With the default lbfgs solver a multi-class target is
# already fitted with the multinomial loss (the default since scikit-learn
# 0.22), so the fitted model is unchanged.
classifier = LogisticRegression()

# Train on the training split.
classifier.fit(X_train,Y_train)

# Predict the held-out validation split and show the raw class codes.
Y_pred = classifier.predict(X_test)
print(Y_pred)

[2 2 2 2 2 0 2 2 2 0 0 2 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 2 3 0 2 2 2 0 2 2 2
 2 2 2 2 2 0 2 0 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 3
 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 0 2 2 2 2 2
 0 2 0 0 2 2 2 2 2 2 2 2 2 0 2 2 0 2 2 0 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 0 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 0 2 0 2 2 2 0 2 2 0 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 0 0 2 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 3 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2]


In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the logistic-regression model on the validation split. All three
# summaries are computed first and then printed in the usual order.
cfm = confusion_matrix(Y_test, Y_pred)            # rows: actual class, columns: predicted class
report_text = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 11   0  58   2]
 [  2   0  10   0]
 [ 18   0 167   0]
 [  6   0   2   1]]
Classification report: 
              precision    recall  f1-score   support

           0       0.30      0.15      0.20        71
           1       0.00      0.00      0.00        12
           2       0.70      0.90      0.79       185
           3       0.33      0.11      0.17         9

    accuracy                           0.65       277
   macro avg       0.33      0.29      0.29       277
weighted avg       0.56      0.65      0.59       277

Accuracy of the model:  0.6462093862815884


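Compared with the Decision Tree model, Logistic Regression achieves a much lower accuracy (0.6462093862815884). Logistic regression is a linear model and is most often used for binary classification; here the target has four classes and the features are label-encoded categories, which a linear decision boundary separates poorly.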

Therefore, This Model is not suitable.

In [38]:
# suppress=True makes numpy print small floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)
# Predicted probabilities from the logistic-regression model:
# one row per validation sample, one column per class code.
y_pred_prob = classifier.predict_proba(X_test)
print(y_pred_prob)

[[0.17723996 0.04469266 0.77722473 0.00084265]
 [0.07716139 0.02202663 0.90080527 0.0000067 ]
 [0.14540796 0.02338039 0.83015498 0.00105667]
 ...
 [0.16631889 0.04582859 0.78778272 0.0000698 ]
 [0.1018213  0.03768644 0.86046754 0.00002472]
 [0.05074201 0.01342706 0.93581987 0.00001105]]


### Conclusion

We have Implemented Base Decision Tree having accuracy of **97.83%**, <br>
then we have Tuned the Decision Tree having accuracy of **96.03%**, <br>
and lastly we have implemented Logistic regression having accuracy of **64.62%**. <br>

We conclude that the Base Decision Tree has the highest accuracy, **97.83%**.

In [39]:
# In Industry, When we have to run Multiple Model Together. USE THIS CODE

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# First, initialize the classifiers that will be compared side by side.
# Note: the names `tree` and `svm` are the ones used when the model list is
# built; `tree` shadows the sklearn.tree module imported for the Graphviz export.
tree= DecisionTreeClassifier(random_state=10) # fixed random state for reproducibility
knn= KNeighborsClassifier(n_neighbors=5,metric='euclidean')
svm= SVC(kernel="rbf", gamma=0.1, C=1,random_state=10)    # This is the base SVM
# `multi_class` is no longer passed: the argument is deprecated in recent
# scikit-learn releases, and the default lbfgs solver already fits a
# multi-class target with the multinomial loss (default since 0.22).
logreg=LogisticRegression(random_state=10)

In [41]:
# Collect the unfitted estimators in the order they will be trained and reported.
models= [tree, knn, svm, logreg]

In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

# Train every candidate on the same training split and report its performance
# on the same validation split, so the numbers are directly comparable.
for model in models:
    # fit() returns the estimator itself, so training and prediction can be chained.
    Y_pred = model.fit(X_train, Y_train).predict(X_test)

    clf_report = classification_report(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    model_name = model.__class__.__name__

    print(confusion_matrix(Y_test, Y_pred))
    print("The accuracy of the ", model_name, " model is ", accuracy*100 )
    print("Classification report:\n", clf_report)
    print("\n")

[[ 69   1   1   0]
 [  4   8   0   0]
 [  0   0 185   0]
 [  0   0   0   9]]
The accuracy of the  DecisionTreeClassifier  model is  97.83393501805054
Classification report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96        71
           1       0.89      0.67      0.76        12
           2       0.99      1.00      1.00       185
           3       1.00      1.00      1.00         9

    accuracy                           0.98       277
   macro avg       0.96      0.91      0.93       277
weighted avg       0.98      0.98      0.98       277



[[ 65   1   5   0]
 [  8   4   0   0]
 [  1   0 184   0]
 [  2   0   1   6]]
The accuracy of the  KNeighborsClassifier  model is  93.50180505415162
Classification report:
               precision    recall  f1-score   support

           0       0.86      0.92      0.88        71
           1       0.80      0.33      0.47        12
           2       0.97      0.99      0.98       185


# Tuned SVM

In [43]:
# This import rebinds the name `svm` to the sklearn.svm module; it previously
# referred to the base SVC instance created for the multi-model comparison.
from sklearn import svm
# Same RBF kernel and gamma as the base SVM; only C is raised (from 1 to 90).
model_SVC=svm.SVC(kernel="rbf", gamma=0.1, C=90)
# Fit the tuned SVM on the training split.
model_SVC.fit(X_train,Y_train)
 
# Predict the held-out validation split.
Y_pred=model_SVC.predict(X_test)

In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the tuned SVM on the validation split. All three summaries are
# computed first and then printed in the usual order.
cfm = confusion_matrix(Y_test, Y_pred)            # rows: actual class, columns: predicted class
report_text = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 69   0   2   0]
 [  2  10   0   0]
 [  0   0 185   0]
 [  0   0   0   9]]
Classification report: 
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        71
           1       1.00      0.83      0.91        12
           2       0.99      1.00      0.99       185
           3       1.00      1.00      1.00         9

    accuracy                           0.99       277
   macro avg       0.99      0.95      0.97       277
weighted avg       0.99      0.99      0.99       277

Accuracy of the model:  0.9855595667870036


Out of the many models implemented **Tuned SVM & Base DT** work the best for us. Out of the two we will finalize upon the base decision Tree given that the **accuracy** is almost **similar** & the Complexity and the time consumption required by the Tuned SVM is **More**

# Predicting on Test File (Real Data)   cars_test

In [45]:
# The encoded, unlabeled test features that will be scored next.
cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,2,3,3,1,2,1
1,3,0,0,0,0,2
2,1,0,0,2,2,1
3,3,3,1,0,0,0
4,1,2,2,1,1,2


In [46]:
# Apply the scaler that was fitted on the labelled data to the test features
# (transform only - the scaler is not re-fitted on the test file).
test = scaler.transform(cars_test.values)

In [47]:
# Re-create the base (untuned) Decision Tree. The name model_DT was rebound to
# the tuned tree earlier, and the base tree is the model selected for delivery.
from sklearn.tree import DecisionTreeClassifier

model_DT = DecisionTreeClassifier(random_state=10)

# Fit on the training split (X_train / Y_train), as in the earlier evaluation.
model_DT.fit(X_train,Y_train)

In [48]:
# Predict a class code (0-3) for every row of the scaled test file.
test_pred=model_DT.predict(test)
test_pred

array([2, 2, 2, 2, 1, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 3,
       2, 0, 2, 2, 2, 2, 2, 0, 1, 3, 1, 2, 0, 2, 0, 2, 2, 2, 2, 3, 2, 2,
       0, 0, 2, 2, 3, 2, 2, 2, 1, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 2, 2, 3, 2, 2, 0, 2, 0, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2,
       2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 3, 2, 2,
       2, 2, 0, 0, 2, 2, 2, 2, 3, 2, 0, 2, 1, 0, 2, 2, 2, 2, 2, 3, 0, 0,
       2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 3, 0, 2, 2, 2, 3, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 3, 2, 2, 0,
       2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2,
       2, 2, 0, 2, 2, 0, 2, 2, 2, 1, 1, 2, 2, 2, 0, 2, 2, 0, 3, 3, 0, 2,
       0, 2, 2, 2, 3, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2,

In [49]:
# Reload the raw test file so the client sees the original category labels
# rather than the encoded integers.
cars_test = pd.read_csv(r"cars_test.csv", header=None)
cars_test.columns = ['buying', 'maint', 'doors', 'persons',
                     'lug_boot', 'safety', 'classes']   # `classes` holds the actual label

# Decode the predicted class codes back to their label names. The numbering is
# the one LabelEncoder produced for the training target: 0, 1, 2, 3 -> acc, good, unacc, vgood.
code_to_label = {0: "acc", 1: "good", 2: "unacc", 3: "vgood"}
predicted_codes = pd.Series(test_pred, index=cars_test.index)
cars_test["Pred"] = predicted_codes.replace(code_to_label)   # predicted label

cars_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes,Pred
0,med,vhigh,5more,4,small,low,unacc,unacc
1,vhigh,high,2,2,big,med,unacc,unacc
2,low,high,2,more,small,low,unacc,unacc
3,vhigh,vhigh,3,2,big,high,unacc,unacc
4,low,med,4,4,med,med,good,good


In [50]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the delivered predictions against the labels that ship with the
# test file. Both columns hold class names, so the report is keyed by name.
actual_labels = cars_test.classes
predicted_labels = cars_test.Pred

cfm = confusion_matrix(actual_labels, predicted_labels)   # rows: actual class, columns: predicted class
report_text = classification_report(actual_labels, predicted_labels)
acc = accuracy_score(actual_labels, predicted_labels)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 60   1   3   0]
 [  3  10   0   0]
 [  0   0 251   0]
 [  0   0   0  18]]
Classification report: 
              precision    recall  f1-score   support

         acc       0.95      0.94      0.94        64
        good       0.91      0.77      0.83        13
       unacc       0.99      1.00      0.99       251
       vgood       1.00      1.00      1.00        18

    accuracy                           0.98       346
   macro avg       0.96      0.93      0.94       346
weighted avg       0.98      0.98      0.98       346

Accuracy of the model:  0.9797687861271677


In [51]:
# After predicting on the testing file, we can conclude that the Base DT accuracy on the validation split (97.83%) and on the testing file (97.98%) is nearly the same.

In [52]:
# Prediction is done: send the results to the client in the original
# (un-encoded) format as an Excel workbook. header=True writes the column names.
cars_test.to_excel("Decision Test Output.xlsx", header=True)

**DONE**

# Model Using Random Forest

In [53]:
# Random Forest classifier.
from sklearn.ensemble import RandomForestClassifier

# n_estimators=100 -> number of trees in the forest (also the library default)
# bootstrap=True   -> each tree is trained on a sample drawn with replacement
#                     (the default for random forests, but it can be turned off)
# n_jobs=-1        -> use all available CPU cores to build the trees in parallel
model_RandomForest = RandomForestClassifier(
    n_estimators=100,
    bootstrap=True,
    random_state=10,
    n_jobs=-1,
)

# fit() returns the estimator itself, so training and prediction can be chained.
Y_pred = model_RandomForest.fit(X_train, Y_train).predict(X_test)

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the Random Forest on the validation split. All three summaries are
# computed first and then printed in the usual order.
cfm = confusion_matrix(Y_test, Y_pred)            # rows: actual class, columns: predicted class
report_text = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 65   1   5   0]
 [  4   8   0   0]
 [  1   0 184   0]
 [  0   0   0   9]]
Classification report: 
              precision    recall  f1-score   support

           0       0.93      0.92      0.92        71
           1       0.89      0.67      0.76        12
           2       0.97      0.99      0.98       185
           3       1.00      1.00      1.00         9

    accuracy                           0.96       277
   macro avg       0.95      0.89      0.92       277
weighted avg       0.96      0.96      0.96       277

Accuracy of the model:  0.9602888086642599


# Model using Extra_Trees_Classifier

In [55]:
# Extra-Trees classifier.
from sklearn.ensemble import ExtraTreesClassifier

# n_estimators=100 -> number of trees built behind the scenes (the library default)
# bootstrap=True   -> set explicitly: Extra-Trees does not resample by default,
#                     so this makes each tree train on a sample drawn with replacement
# n_jobs=-1        -> use all available CPU cores to build the trees in parallel
model_EXT = ExtraTreesClassifier(
    n_estimators=100,
    bootstrap=True,
    random_state=10,
    n_jobs=-1,
)

# fit() returns the estimator itself, so training and prediction can be chained.
Y_pred = model_EXT.fit(X_train, Y_train).predict(X_test)

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the Extra-Trees model on the validation split. All three summaries
# are computed first and then printed in the usual order.
cfm = confusion_matrix(Y_test, Y_pred)            # rows: actual class, columns: predicted class
report_text = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 64   0   7   0]
 [  8   4   0   0]
 [  2   0 183   0]
 [  2   0   0   7]]
Classification report: 
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        71
           1       1.00      0.33      0.50        12
           2       0.96      0.99      0.98       185
           3       1.00      0.78      0.88         9

    accuracy                           0.93       277
   macro avg       0.95      0.75      0.81       277
weighted avg       0.93      0.93      0.93       277

Accuracy of the model:  0.9314079422382672


### Pruning

In [57]:
# Two techniques for cross-validated hyper-parameter search:
# 1. GridSearchCV
# 2. RandomizedSearchCV

In [58]:
# Generic model-optimisation technique: exhaustive grid search with cross-validation.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

model_EXT = ExtraTreesClassifier(bootstrap=True, random_state=10)

# Candidate values for each hyper-parameter. The grid has 4 x 4 x 5 = 80
# combinations, each evaluated with 5-fold cross-validation.
parameter_space = dict(
    n_estimators=[100, 300, 500, 1000],      # alternative: np.arange(100, 1001, 50)
    max_depth=[10, 15, 8, 12],
    min_samples_leaf=[3, 4, 5, 6, 7],
)

# GridSearchCV tries every combination, which is affordable for a small data set.
# RandomizedSearchCV samples only some combinations and suits larger searches.
clf = GridSearchCV(estimator=model_EXT, param_grid=parameter_space, cv=5, n_jobs=-1)

In [59]:
# Run the full grid search on the training split (this is the slow cell of the notebook).
clf.fit(X_train,Y_train)

In [60]:
# Show the hyper-parameter combination with the best mean cross-validation score.
# "\n" is the newline escape; the previous "/n" was printed literally.
print('Best parameter found:\n', clf.best_params_)

Best parameter found:/n {'max_depth': 15, 'min_samples_leaf': 3, 'n_estimators': 100}


In [61]:
# According to Best Parameter it will give the best Accuracy

In [62]:
# Mean 5-fold cross-validation accuracy of the best parameter combination.
clf.best_score_

0.902262443438914

In [63]:
# Predict the validation split with the grid search object (it delegates to the best estimator found).
Y_pred = clf.predict(X_test)

In [64]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Evaluate the grid-searched Extra-Trees model on the validation split. All
# three summaries are computed first and then printed in the usual order.
cfm = confusion_matrix(Y_test, Y_pred)            # rows: actual class, columns: predicted class
report_text = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

print(cfm)
print("Classification report: ")
print(report_text)
print("Accuracy of the model: ", acc)

[[ 57   0  14   0]
 [ 11   1   0   0]
 [  1   0 184   0]
 [  5   0   0   4]]
Classification report: 
              precision    recall  f1-score   support

           0       0.77      0.80      0.79        71
           1       1.00      0.08      0.15        12
           2       0.93      0.99      0.96       185
           3       1.00      0.44      0.62         9

    accuracy                           0.89       277
   macro avg       0.92      0.58      0.63       277
weighted avg       0.89      0.89      0.87       277

Accuracy of the model:  0.8880866425992779
