<a href="https://colab.research.google.com/github/chandanbn07/task3/blob/main/task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset with proper delimiter
df = pd.read_csv('/content/banking.csv', sep=';')

# Strip column names to avoid whitespace issues
df.columns = df.columns.str.strip()

# Ensure target column 'y' exists
if 'y' not in df.columns:
    df.rename(columns={df.columns[-1]: 'y'}, inplace=True)

# Check unique values in 'y' before encoding
print("Unique values in 'y' before encoding:", df['y'].unique())

# Convert 'y' to binary format (1 = 'yes', 0 = 'no')
df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Drop rows where 'y' is NaN (if any conversion issue happened)
df = df.dropna(subset=['y'])

# Replace 'unknown' values in categorical columns with the most frequent value (mode)
for col in df.select_dtypes(include=['object']).columns:
    df[col].replace('unknown', np.nan, inplace=True)
    df[col].fillna(df[col].mode()[0], inplace=True)

# Encode categorical variables using Label Encoding
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=['y'])
y = df['y']

# Ensure 'y' is binary
assert len(y.unique()) == 2, "Error: More than two unique values found in 'y'."

# Train-test split (no stratify if it causes errors)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Decision Tree Classifier
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate model
print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
plt.figure(figsize=(6, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='Blues', fmt='d')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Visualize Decision Tree
plt.figure(figsize=(12, 6))
plot_tree(clf, feature_names=X.columns, class_names=['No', 'Yes'], filled=True)
plt.title('Decision Tree Visualization')
plt.show()

print("Decision Tree Classifier built and evaluated successfully!")

Unique values in 'y' before encoding: ['44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,210,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0'
 '53,technician,married,unknown,no,no,no,cellular,nov,fri,138,1,999,0,nonexistent,-0.1,93.2,-42,4.021,5195.8,0'
 '28,management,single,university.degree,no,yes,no,cellular,jun,thu,339,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1'
 ...
 '42,admin.,single,university.degree,unknown,yes,yes,telephone,may,wed,62,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191,0'
 '48,technician,married,professional.course,no,no,yes,telephone,oct,tue,200,2,999,0,nonexistent,-3.4,92.431,-26.9,0.742,5017.5,0'
 '25,student,single,high.school,no,no,no,telephone,may,fri,112,4,999,0,nonexistent,1.1,93.994,-36.4,4.859,5191,0']


AssertionError: Error: More than two unique values found in 'y'.