In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print one path per line (same traversal order as os.walk yields them).
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")


In [None]:
# Location of the churn dataset inside the Kaggle input directory
churn_csv_path = '/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv'

# Load the CSV into the DataFrame used by the rest of the notebook
df = pd.read_csv(filepath_or_buffer=churn_csv_path)

In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets the
# notebook render it as a rich table instead of the plain-text dump print() produces.
df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line to the output.
df.info()

In [None]:
# Count the missing values in each column (isna is the same check as isnull)
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Statistical summary.
# Shown as the cell's last expression so the notebook renders a full rich table;
# print() falls back to a plain-text layout that wraps or truncates wide frames.
df.describe()

# Data Cleaning and Preprocessing

In [None]:
# Drop columns not needed for modeling.
# Rebinding `df` with errors='ignore' (instead of an in-place drop) means running
# this cell a second time no longer raises KeyError for the already-removed columns.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

# Encode categorical variables.
# Each column gets its own LabelEncoder, kept in `label_encoders`, so the
# category <-> integer mapping of every column stays available through
# `classes_` / `inverse_transform`. Previously a single encoder was refit on
# Geography, which discarded the Gender mapping.
label_encoders = {}
for column in ('Gender', 'Geography'):
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])
    label_encoders[column] = labelencoder
# After the loop `labelencoder` is the Geography encoder, as in the original cell.

# Confirm changes (bare last expression -> rich table display in the notebook)
df.head()


# Define Features and Target

In [None]:
# Column the model is trained to predict
target_column = 'Exited'

# Target variable (dependent variable)
y = df[target_column]

# Features (independent variables): every column except the target
X = df.drop(columns=[target_column])


# Split Data into Training and Testing Sets

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the proportion of each `Exited` class the same in the train and
# test splits, so the evaluation metrics are not skewed by an uneven random draw.
# NOTE(review): churn targets are usually imbalanced - confirm with y.value_counts().
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y
)


# Train the Model

In [None]:
# Build a Random Forest with a fixed seed and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Accuracy on the same rows the model was trained on
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print("Training Accuracy: {:.2f}%".format(100 * train_accuracy))


# Make Predictions

In [None]:
# Class predictions for the held-out test rows
y_pred = model.predict(X_test)

# Fraction of test rows whose predicted class matches the actual class
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
test_accuracy_message = "Test Accuracy: {:.2f}%".format(100 * test_accuracy)
print(test_accuracy_message)


# Evaluate the Model

* Assess the model's performance using metrics such as the classification report, confusion matrix, and ROC curve.

# Classification Report

In [None]:
# Print the heading and the report with separate calls. Passing both to a single
# print() inserted its separator space right after the "\n", which pushed the first
# line of the report one column out of alignment with the rows below it.
print("Classification Report:")
print(classification_report(y_test, y_pred))


* The classification report provides precision, recall, F1-score, and support for each class (0 and 1).

# Confusion Matrix

In [None]:
# Tabulate actual vs. predicted classes on the test set
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the counts as an annotated heatmap on an explicitly created Axes
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


* A confusion matrix visualizes the model’s performance by showing the counts of true positives, true negatives, false positives, and false negatives.

# ROC Curve

In [None]:
# Predicted probability for the second class in model.classes_
# (presumably Exited = 1 - verify against model.classes_)
y_prob = model.predict_proba(X_test)[:, 1]

# ROC points for every decision threshold, and the area under that curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(fpr, tpr, color='blue', lw=2, label="AUC = {:.2f}".format(roc_auc))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0, 1], [0, 1], color='black', linestyle='--')
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
plt.show()


* The ROC curve shows the tradeoff between the true positive rate (sensitivity) and the false positive rate.
* AUC (area under the curve) quantifies the model’s ability to distinguish between classes.