<a href="https://colab.research.google.com/github/chandanbn07/task3/blob/main/task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read the semicolon-delimited banking CSV into a DataFrame
df = pd.read_csv('/content/banking.csv', delimiter=';')

# Preview the first rows to confirm the delimiter split the columns correctly
print("First few rows of the dataset:", df.head(), sep="\n")

# Show the parsed header so a wrong delimiter is easy to spot
print("\nColumn names detected:", df.columns, sep="\n")

def _target_rank(column_name):
    """Rank how strongly a column name looks like the target (lower is better).

    Returns 0 when the whole name is 'y' or 'target', 1 when the name
    contains 'target', 2 when 'y' appears as a standalone token
    (e.g. 'y_yes'), and None when the name does not look like a target.
    Matching on tokens avoids treating every name that merely contains the
    letter 'y' (such as 'day' or 'pdays') as a candidate.
    """
    # str() keeps the check safe for non-string column labels.
    name = str(column_name).strip().lower()
    # Split on anything that is not a letter or digit to obtain word tokens.
    tokens = ''.join(ch if ch.isalnum() else ' ' for ch in name).split()
    if tokens == ['y'] or tokens == ['target']:
        return 0
    if 'target' in name:
        return 1
    if 'y' in tokens:
        return 2
    return None


# Explicitly define the target column name
target_column = 'y'

# Fall back to a name-based search only when the expected column is missing
if target_column not in df.columns:
    ranked = [(_target_rank(col), col) for col in df.columns]
    # sorted() is stable, so columns with the same rank keep their file order
    possible_targets = [
        col
        for rank, col in sorted(
            (pair for pair in ranked if pair[0] is not None),
            key=lambda pair: pair[0],
        )
    ]
    if possible_targets:
        target_column = possible_targets[0]
        print(f"Warning: Using '{target_column}' as target column. Please confirm if this is correct.")
    else:
        raise KeyError("Error: Target column ('y' or similar) not found. Please check the dataset.")

# Treat the literal 'unknown' label as a missing value
df.replace('unknown', np.nan, inplace=True)

# Impute missing values in each text column with that column's most frequent value
for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # mode() ignores NaN, so it is empty when the whole column is missing;
    # in that case restore the 'unknown' label instead of indexing an empty Series.
    fill_value = modes.iloc[0] if not modes.empty else 'unknown'
    df[col] = df[col].fillna(fill_value)

# Integer-encode every text column, keeping each fitted encoder in label_encoders
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    # Overwrite the text values in place with their integer codes
    df[col] = le.fit_transform(df[col])

# Separate the label column from the predictor columns
y = df[target_column]
X = df.drop(target_column, axis=1)

# Show which label values are present
print("Unique values in 'y':", y.unique())

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
# (the split is not stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: learn the scaling from the training rows only,
# then apply that same scaling to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit an entropy-based decision tree limited to depth 5
clf = DecisionTreeClassifier(
    max_depth=5,
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = clf.predict(X_test)

# Report overall accuracy, then the per-class metrics
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Draw the confusion matrix as an annotated heatmap of integer counts
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Visualize Decision Tree

# plot_tree documents feature_names as a list of str; pass a plain list rather
# than a pandas Index (NOTE(review): some scikit-learn releases may reject an
# Index outright - confirm against the installed version).
feature_names = [str(name) for name in X.columns]

# Derive the class labels from the fitted model instead of assuming a binary
# No/Yes target, so each label lines up with the order of clf.classes_.
target_encoder = label_encoders.get(target_column)
if target_encoder is not None:
    # The target was label-encoded: map the integer classes back to their text
    class_names = [
        str(label).capitalize()
        for label in target_encoder.inverse_transform(clf.classes_)
    ]
elif len(clf.classes_) == 2:
    # Numeric binary target: keep the original No/Yes display names
    class_names = ['No', 'Yes']
else:
    class_names = [str(label) for label in clf.classes_]

plt.figure(figsize=(12, 6))
plot_tree(clf, feature_names=feature_names, class_names=class_names, filled=True)
plt.title('Decision Tree Visualization')
plt.show()

print("✅ Decision Tree Classifier built and evaluated successfully!")