# Customer Churn Prediction using Random Forest

This notebook trains a Random Forest classifier to predict customer churn, evaluates the model, and analyzes feature importance.

In [None]:

# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Step 2: Load Dataset
# Keep the input location in one named constant so it is easy to spot and change.
DATA_PATH = 'data_for_predictions.csv'
data = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell's displayed output.
data.head()


In [None]:

# Step 3: Inspect Dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
data.info()

# Summary statistics for the columns describe() covers by default.
print(data.describe())


In [None]:

# Step 4: Preprocessing
# Encode a text-valued target as 0/1. The guard accepts the legacy `object`
# dtype as well as pandas' dedicated string dtypes, so the encoding is not
# skipped when the CSV reader infers a string dtype for the column.
churn_raw = data['Churn']
if pd.api.types.is_object_dtype(churn_raw) or pd.api.types.is_string_dtype(churn_raw):
    churn_encoded = churn_raw.map({'No':0, 'Yes':1})

    # Series.map() turns every label that is absent from the dict into NaN.
    # Report such labels here rather than letting NaN targets flow into the
    # split/training cells. Values that were already missing are left as-is.
    unmapped = churn_raw[churn_raw.notna() & churn_encoded.isna()]
    if not unmapped.empty:
        raise ValueError(
            f"Unexpected values in 'Churn' column: {sorted(unmapped.astype(str).unique())}"
        )
    data['Churn'] = churn_encoded

# Drop unnecessary columns like Customer_ID
if 'Customer_ID' in data.columns:
    data = data.drop('Customer_ID', axis=1)

# Check for missing values (per-column count of nulls)
print(data.isnull().sum())


In [None]:

# Step 5: Separate the target column from the predictor columns
y = data['Churn']
X = data.drop(columns='Churn')

# Step 6: Hold out 20% of the rows for testing; stratify on the target and
# fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:

# Step 7: Train Random Forest Classifier
# Hyper-parameters are gathered in one dict: 100 trees and a fixed seed.
forest_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**forest_params)

# fit() returns the estimator, which the notebook displays as the cell output.
rf_model.fit(X_train, y_train)


In [None]:

# Step 8: Make Predictions
# Score the held-out rows with the fitted forest.
y_pred = rf_model.predict(X_test)

# Confusion Matrix
# Draw the counts as an annotated heatmap and label the Axes that
# seaborn returns, instead of going through the pyplot state machine.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# Classification Report (text summary of per-class metrics)
report = classification_report(y_test, y_pred)
print(report)

# Accuracy, printed to two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


In [None]:

# Step 9: Feature Importance
# Pair each importance score from the fitted forest with its column name and
# rank the features from most to least important.
importances = rf_model.feature_importances_
feature_names = X.columns
feat_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Plot Feature Importance
# Scale the figure height with the number of features (about 0.3 in per bar,
# never below the original 6 in) so the y-axis tick labels stay legible.
fig_height = max(6, 0.3 * len(feat_importances))
fig, ax = plt.subplots(figsize=(10, fig_height))
sns.barplot(x=feat_importances, y=feat_importances.index, ax=ax)

# The Series has no name, so seaborn cannot infer axis labels; set them explicitly.
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title("Feature Importance from Random Forest")
plt.show()



## Discussion

- **Evaluation Metrics:** Accuracy, Precision, Recall, and F1-score were chosen to balance overall correctness and the cost of false positives/negatives.  
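- **Model Performance:** Compare the confusion matrix with the per-class precision, recall, and F1-score in the classification report to decide whether performance is satisfactory; if churners are a minority class, accuracy alone can look high even when few churners are correctly identified.  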
- **Feature Importance:** Highlights which features contribute most to predicting churn, informing potential business interventions.
