In [None]:
# importing necessary libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the semicolon-delimited bank marketing CSV and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
csv_path = r"C:\Users\LENOVO\Desktop\Data Science Internship\bank+marketing\bank\bank.csv"
data = pd.read_csv(csv_path, sep=';')
data.head()

In [None]:
# Data overview (not a plot): print the DataFrame's structural summary.
data.info()

In [None]:
# Dimensions of the DataFrame as (rows, columns).
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Descriptive statistics for the DataFrame's columns.
data.describe()

In [None]:
# Boolean Series flagging rows that duplicate an earlier row.
data.duplicated()

In [None]:
# Total number of duplicated rows.
data.duplicated().sum()

In [None]:
# Exploratory data analysis: histogram of `education`, coloured by the target `y`.
# NOTE(review): if `education` is categorical, the KDE curve from kde=True is not meaningful — confirm.
education_ax = sns.histplot(data=data, x="education", hue="y", kde=True)
education_ax.set_title("Education Status")
plt.show()

In [None]:
# Histogram of `age` with a KDE overlay, coloured by the target `y`.
age_ax = sns.histplot(data=data, x="age", hue="y", kde=True)
age_ax.set_title("Age Distribution")
plt.show()

In [None]:
# Count of records per education level, split by the target `y`.
plt.figure(figsize=(8,4))
sns.countplot(x="education", data=data, hue="y", color="yellow")
plt.title("Education Status")
# Render the figure; print(plt) only printed the pyplot module's repr.
plt.show()

In [None]:
# Count of records per marital status, split by the target `y`.
plt.figure(figsize=(8,8))
sns.countplot(x="marital", data=data, hue="y", color="green")
plt.title("Marital Status ")
# Render the figure; print(plt) only printed the pyplot module's repr.
plt.show()

In [None]:
# Count of records per job category, split by the target `y`.
job_fig, job_ax = plt.subplots(figsize=(14, 5))
sns.countplot(data=data, x="job", hue="y", ax=job_ax)
job_ax.set_title("Occupation Distribution")
plt.show()

In [None]:
# Frequency of each distinct value in the `default` column.
data.default.value_counts()

In [None]:
# Count of records per `housing` value, split by the target `y`.
housing_fig, housing_ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=data, x="housing", hue="y", ax=housing_ax)
housing_ax.set_title("Housing Loan Status")
plt.show()

In [None]:
# Pie chart of how often each value of the target `y` occurs.
# The counts are kept in their own variable: rebinding `data` to the counts array
# would destroy the DataFrame that every later cell relies on.
y_counts = data.y.value_counts()
keys = y_counts.index
# One offset per slice; this list assumes `y` has exactly two distinct values.
explode = [0,0.1]
plt.figure(figsize=(6,3.5))
plt.pie(y_counts.values, labels=keys, explode=explode, autopct='%.0f%%')
plt.show()

In [None]:
# Names of all object-dtype columns; shown as the cell output.
cols = data.select_dtypes(include="object").columns
cols

In [None]:
# Replace every column listed in `cols` with integer label codes.
# One encoder object is re-fitted for each column in turn, so after this cell
# `le` only holds the mapping of the last column it processed.
le = LabelEncoder()
data[cols] = data[cols].apply(lambda column: le.fit_transform(column))

In [None]:
# Preview the first three rows of the DataFrame.
data.head(3)

In [None]:
# Separate the target column `y` from the remaining feature columns.
y = data["y"]
X = data.drop(columns=["y"])

In [None]:
# Standardise all feature columns and wrap the result back into a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fitted on the full feature matrix, before the train/test split.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, columns=X.columns)

In [None]:
# Train-test split: 30% of the rows are held out for testing.
# random_state makes the split repeatable across runs; stratify=y keeps the
# class proportions of the target the same in the train and test parts.
train_X, test_X, train_y, test_y = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Baseline decision tree with default hyper-parameters, fitted on the training split.
# random_state fixes the estimator's internal randomness so repeated runs build the same tree.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(train_X, train_y)

In [None]:
# Score the baseline tree on both splits; a large gap between them indicates overfitting.
train_score = decision_tree.score(train_X, train_y)
test_score = decision_tree.score(test_X, test_y)
print('Train Score: {}'.format(train_score))
print('Test Score: {}'.format(test_score))

In [None]:
# Mean score of the baseline tree over 5-fold cross-validation on the training split.
cv_scores = cross_val_score(decision_tree, train_X, train_y, cv=5)
cv_scores.mean()

In [None]:
# Predict the test split with the baseline tree and print its classification report.
ypred = decision_tree.predict(test_X)
baseline_report = classification_report(test_y, ypred)
print(baseline_report)

In [None]:
# Hyper-parameter grid for GridSearchCV, used to find the best estimator
# settings and improve model performance.
param_grid = dict(
    max_depth=[3, 5, 7, 10, None],
    criterion=['gini', 'entropy'],
    min_samples_leaf=[3, 5, 7, 9, 10, 20],
)

In [None]:
# Exhaustive 5-fold grid search over `param_grid`, starting from the baseline tree.
gscv = GridSearchCV(estimator=decision_tree, param_grid=param_grid, cv=5, verbose=1)
gscv.fit(train_X, train_y)

In [None]:
# Hyper-parameter combination selected by the grid search.
gscv.best_params_

In [None]:
# Estimator object the grid search selected as best.
gscv.best_estimator_

In [None]:
# Mean score of the grid search's best estimator over 5-fold cross-validation on the training split.
best_cv_scores = cross_val_score(gscv.best_estimator_, train_X, train_y, cv=5)
best_cv_scores.mean()

In [None]:
# Final tree, built from the hyper-parameters the grid search actually selected.
# Hard-coding them here could silently disagree with `gscv.best_params_`
# whenever the search result changes.
best_params = dict(gscv.best_params_)
# Fix the estimator's randomness unless the search itself tuned random_state.
best_params.setdefault("random_state", 42)
clf = DecisionTreeClassifier(**best_params)
clf.fit(train_X, train_y)

In [None]:
# Score the tuned tree on both splits.
tuned_train_score = clf.score(train_X, train_y)
tuned_test_score = clf.score(test_X, test_y)
print('Train Score: {}'.format(tuned_train_score))
print('Test Score: {}'.format(tuned_test_score))

In [None]:
# Predictions of the tuned tree `clf` on the held-out test features.
pred_y = clf.predict(test_X)

In [None]:
# Classification report for the tuned tree on the test split.
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and reports support of the predictions.
print(classification_report(test_y, pred_y))

In [None]:
# Accuracy of the tuned tree on the test split, printed as a percentage.
accuracy = accuracy_score(y_true=test_y, y_pred=pred_y)
accuracy_pct = accuracy * 100
print("Test Accuracy of Decision Tree Classifier : {}".format(accuracy_pct))

In [None]:
# Visualizing the fitted tree `clf`
from sklearn import tree
fig = plt.figure(figsize=(15,10))
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index here.
t = tree.plot_tree(clf, filled=True, feature_names=list(X.columns))
# Render the figure explicitly, as the other plotting cells do.
plt.show()