TASK - 3 CUSTOMER CHURN PREDICTION

Develop a model to predict customer churn for a subscription- based
service or business. Use historical customer data, including features like
usage behavior and customer demographics, and try algorithms like
Logistic Regression, Random Forests, or Gradient Boosting to predict
churn.

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# Load your historical customer data
# Replace 'your_dataset.csv' with the actual file path
dataset_path = 'C:/Users/Dharsana/Downloads/archive (4)/Churn_Modelling.csv'
df = pd.read_csv(dataset_path)

# Identify the target variable (Exited) and features
target = 'Exited'
non_numeric_columns = ['Surname', 'Geography', 'Gender']
features = df.drop(columns=[target] + non_numeric_columns)  # Exclude target and non-numeric columns

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Define features and target variable for training and testing sets
X_train, y_train = train_data[features.columns], train_data[target]
X_test, y_test = test_data[features.columns], test_data[target]

# Define transformers for numeric and categorical columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_features = X_train.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Choose a classifier (Logistic Regression, Random Forest, or Gradient Boosting)
# For simplicity, let's use all three for comparison
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier()
}

# Train and evaluate each classifier
for name, classifier in classifiers.items():
    print(f"Training {name}...")
    model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', classifier)])
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Print results
    print(f"\nResults for {name}:")
    print(f"Accuracy: {accuracy}")
    print("Confusion Matrix:\n", confusion_mat)
    print("Classification Report:\n", classification_rep)
    print("=" * 50)


Training Logistic Regression...

Results for Logistic Regression:
Accuracy: 0.807
Confusion Matrix:
 [[1555   52]
 [ 334   59]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.97      0.89      1607
           1       0.53      0.15      0.23       393

    accuracy                           0.81      2000
   macro avg       0.68      0.56      0.56      2000
weighted avg       0.77      0.81      0.76      2000

Training Random Forest...

Results for Random Forest:
Accuracy: 0.859
Confusion Matrix:
 [[1553   54]
 [ 228  165]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      1607
           1       0.75      0.42      0.54       393

    accuracy                           0.86      2000
   macro avg       0.81      0.69      0.73      2000
weighted avg       0.85      0.86      0.84      2000

Training Gradient Boosting...

Results for Gradient Bo