# Comparative study of regressor and classifier with decision tree using modern tools

### Importing libraries

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix


### Reading data

In [38]:
# Load the raw records; the path is resolved relative to the current working directory.
dataset = pd.read_csv(filepath_or_buffer="data.csv")

In [39]:
# Preview the first ten rows to check the available columns and their raw values.
dataset.head(n=10)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
5,15728773,Male,27,58000,0
6,15598044,Female,27,84000,0
7,15694829,Female,32,150000,1
8,15600575,Male,25,33000,0
9,15727311,Female,35,65000,0


Feature Selection/importance


In [53]:
# Correlation matrix over all columns, including Gender.
# When the notebook is run top to bottom, Gender is still a string column at
# this point, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
# Encode Gender on a temporary copy instead of relying on a later cell having
# already mutated `dataset`. Category codes follow the sorted category order
# (Female -> 0, Male -> 1); if Gender is already numeric the codes are unchanged.
corr_input = dataset.assign(Gender=dataset['Gender'].astype('category').cat.codes)
corr_input.corr()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
User ID,1.0,-0.025249,-0.000721,0.071097,0.00712
Gender,-0.025249,1.0,-0.073741,-0.060435,-0.042469
Age,-0.000721,-0.073741,1.0,0.155238,0.622454
EstimatedSalary,0.071097,-0.060435,0.155238,1.0,0.362083
Purchased,0.00712,-0.042469,0.622454,0.362083,1.0


Selecting Age and EstimatedSalary as features

In [40]:
# Feature matrix: the two predictors chosen above.
feature_columns = ['Age', 'EstimatedSalary']
x = dataset[feature_columns]

# Label vector: the purchase outcome to be predicted.
target_column = 'Purchased'
y = dataset[target_column]

Splitting the data into 75% for training and 25% for testing

In [41]:
# Hold out a quarter of the rows for evaluation; the seed is fixed so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

Normalizing the features with a min-max scaler

In [42]:
# Learn each feature's min/max from the training split only, then apply that
# same mapping to both splits so no test-set information leaks into the scaler.
scale_x = MinMaxScaler().fit(x_train)
x_train_s = scale_x.transform(x_train)
x_test_s = scale_x.transform(x_test)

Fitting the Decision Tree Classifier on the training data

In [43]:
# Train a decision tree classifier on the scaled training features and predict
# the held-out rows. random_state is pinned (same seed as the train/test split)
# because the tree permutes features when searching for splits; without a seed
# the metrics below can change from one run to the next.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(x_train_s, y_train)
y_predict = model_dt.predict(x_test_s)


## Performance

In [44]:
# Fraction of held-out rows whose predicted class matches the true label.
clf_accuracy = accuracy_score(y_test, y_predict)
print('The accuracy of the Decision tree classifier is', clf_accuracy)

The accuracy of the Decision tree classifier is 0.82


In [45]:
# Error metrics on the held-out predictions. The squared error is computed once
# and reused for the RMSE.
mse = mean_squared_error(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("MAE:", mae)

MSE: 0.18
RMSE: 0.4242640687119285
MAE: 0.18


In [46]:
# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[56,  7],
       [11, 26]], dtype=int64)

### Adding Gender to the features

In [47]:
from sklearn.preprocessing import LabelEncoder

# Replace the Gender column with integer codes. LabelEncoder numbers the sorted
# unique values from 0 upward. Fitting and transforming are done as two
# explicit steps on the same column.
le = LabelEncoder()
le.fit(dataset['Gender'])
dataset['Gender'] = le.transform(dataset['Gender'])

In [48]:
# Attach the encoded Gender column to the training features. The Series is
# aligned on the row index, so each sampled row receives its own value.
# assign() builds a new frame rather than writing into the frame returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
x_train = x_train.assign(Gender=dataset['Gender'])

In [49]:
# Attach the encoded Gender column to the test features. The Series is aligned
# on the row index, so each held-out row receives its own value.
# assign() builds a new frame rather than writing into the frame returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
x_test = x_test.assign(Gender=dataset['Gender'])

In [50]:
# Build a fresh scaler for the current feature set: fit on the training split
# only, then map both splits with the fitted ranges.
scale_x = MinMaxScaler().fit(x_train)
x_train_s = scale_x.transform(x_train)
x_test_s = scale_x.transform(x_test)

In [51]:
# Retrain the classifier on the current scaled feature set and predict the
# held-out rows. random_state is pinned (same seed as the train/test split) so
# that a change in the metrics reflects the feature set rather than the tree's
# random tie-breaking between runs.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(x_train_s, y_train)
y_predict = model_dt.predict(x_test_s)

In [52]:
# Accuracy of the classifier retrained with Gender as an extra feature.
clf_accuracy_gender = accuracy_score(y_test, y_predict)
print('The accuracy of the Decision tree classifier is after adding Gender ', clf_accuracy_gender)

The accuracy of the Decision tree classifier is after adding Gender  0.81


In [17]:
# Error metrics for the current predictions. The squared error is computed once
# and reused for the RMSE.
mse = mean_squared_error(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("MAE:", mae)

MSE: 0.18
RMSE: 0.22779922779922768
MAE: 0.18


In [18]:
# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[55,  8],
       [10, 27]], dtype=int64)

## Decision Tree as a Regressor

In [19]:
# Remove Gender again so the regressor starts from the same two features
# (Age, EstimatedSalary) as the first classifier.
x_train = x_train.drop(columns='Gender')

In [20]:
# Drop Gender from the test features as well, so both splits keep the same columns.
x_test = x_test.drop(columns='Gender')

In [25]:
# Refit the scaler for the current columns: ranges come from the training
# split only and are then applied to both splits.
scale_x = MinMaxScaler().fit(x_train)
x_train_s = scale_x.transform(x_train)
x_test_s = scale_x.transform(x_test)

In [26]:
# Fit a decision tree regressor on the same 0/1 Purchased labels and predict
# the held-out rows. random_state is pinned (same seed as the train/test split)
# so the reported scores are reproducible from run to run.
model_r = DecisionTreeRegressor(random_state=42)
model_r.fit(x_train_s, y_train)
y_predict = model_r.predict(x_test_s)

In [28]:
# A regressor's score() is the coefficient of determination (R^2) on the given
# data, not a classification accuracy, so it must not be compared directly with
# the classifier's accuracy above. It is reported here as a percentage.
# The variable keeps its original name `accuracy` so that any cell reading it
# continues to work.
accuracy = model_r.score(x_test_s, y_test)*100

print('The R-squared score of the regressor: {:.2f}%'.format(accuracy))


The accuracy of the regressor: 27.07%


In [29]:
# Error metrics for the regressor's predictions. The squared error is computed
# once and reused for the RMSE.
mse = mean_squared_error(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("MAE:", mae)

MSE: 0.17
RMSE: 0.41231056256176607
MAE: 0.17


## Adding Gender to the features

In [30]:
from sklearn.preprocessing import LabelEncoder

# Encode Gender as integer codes (sorted unique values numbered from 0).
# If an earlier cell has already encoded the column, re-fitting on the 0/1
# codes maps them to themselves, so the values are left unchanged.
le = LabelEncoder()
le.fit(dataset['Gender'])
dataset['Gender'] = le.transform(dataset['Gender'])

In [31]:
# Attach the encoded Gender column to the training features. The Series is
# aligned on the row index, so each sampled row receives its own value.
# assign() builds a new frame rather than writing into the existing one,
# which avoids pandas' SettingWithCopyWarning.
x_train = x_train.assign(Gender=dataset['Gender'])

In [32]:
# Attach the encoded Gender column to the test features. The Series is aligned
# on the row index, so each held-out row receives its own value.
# assign() builds a new frame rather than writing into the existing one,
# which avoids pandas' SettingWithCopyWarning.
x_test = x_test.assign(Gender=dataset['Gender'])

In [33]:
# Rebuild the scaler for the current feature set: fit on the training split
# only, then transform both splits with the fitted ranges.
scale_x = MinMaxScaler().fit(x_train)
x_train_s = scale_x.transform(x_train)
x_test_s = scale_x.transform(x_test)

In [34]:
# Retrain the regressor on the current scaled feature set and predict the
# held-out rows. random_state is pinned (same seed as the train/test split) so
# that a change in the scores reflects the feature set rather than the tree's
# random tie-breaking between runs.
model_r = DecisionTreeRegressor(random_state=42)
model_r.fit(x_train_s, y_train)
y_predict = model_r.predict(x_test_s)

In [35]:
# A regressor's score() is the coefficient of determination (R^2) on the given
# data, not a classification accuracy, so it must not be compared directly with
# the classifier's accuracy above.
r_squared = model_r.score(x_test_s, y_test)

# Express R^2 as a percentage for display. The variable keeps its original name
# `accuracy` so that any cell reading it continues to work.
accuracy = r_squared * 100.0
print('The R-squared score of the regressor: {:.2f}%'.format(accuracy))


The accuracy of the regressor: 31.36%


In [36]:
# Error metrics for the regressor's predictions. The squared error is computed
# once and reused for the RMSE.
mse = mean_squared_error(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))
print("MAE:", mae)

MSE: 0.16
RMSE: 0.4
MAE: 0.16
