# Problem Statement - Build a Random Forest model to predict the class of a car in the Car Evaluation dataset.

In [1]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Importing the dataset

In [2]:
df = pd.read_csv("car_evaluation.csv")

In [3]:
col_name = ["buying","maint","doors","persons","lug_boot","safety","class"]
df.columns = col_name

# Exploratory Data Analysis

In [4]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [5]:
df.tail()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good
1726,low,low,5more,more,big,high,vgood


In [6]:
df["class"].value_counts()

unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4   lug_boot  1727 non-null   object
 5   safety    1727 non-null   object
 6   class     1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [8]:
df.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

# Splitting data into train and test sets

In [9]:
x = df.iloc[0:, 0 : 6] #input variable
y = df.iloc[:, -1] #target variable

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.30, random_state=(12))

# Feature Engineering

In [11]:
encoder = ce.OrdinalEncoder(cols=["buying","maint","doors","persons","lug_boot","safety"])

In [12]:
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [13]:
rfc = RandomForestClassifier(n_estimators = 100, random_state=(40))

In [14]:
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)

In [15]:
print(accuracy_score(y_test, y_pred)*100)

94.41233140655106


# Conclusion

Thus, the problem was solved by performing Ordinal Encoding and by using RandomForestClassifier with 100 decision trees and we used the accuracy_score metric to predict the accuracy of the Random Forest model with 100 decision trees and the model was 94.41% accurate.
