In [72]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [73]:
!pip install xgboost




In [74]:
# Import custom modules
import sys
import logging
sys.path.append('../src')

from utils import PipelineUtils
from data_processing import DataProcessor
from clustering import CustomerClustering
from model_training import ModelTrainer

In [76]:
# Initialize utilities
utils = PipelineUtils()
utils.setup_logging(log_level=logging.INFO)

2025-09-22 02:17:13,700 - utils - INFO - Logging setup completed


In [77]:
# Set style
plt.style.use('ggplot')
sns.set_palette("viridis")

In [79]:
# Initialize data processor
processor = DataProcessor()

In [82]:
import os
import sys

# Change the current working directory to the project's root.
# This is a robust way to ensure all file paths are correct.
try:
    os.chdir(r'C:\Users\GCV\churn_prediction')
    print(f"Working directory changed to: {os.getcwd()}")
except FileNotFoundError:
    print("Warning: Could not change directory. Please check the path.")

# Add the project root to the Python path
sys.path.append(os.getcwd())

# Now, your relative paths will work correctly.
from src.data_processing import DataProcessor


df = processor.load_data_from_local('data/marketing_campaign.csv')
df_clean = processor.clean_data()

2025-09-22 02:29:02,504 - data_processing - INFO - Loading data from local file: data/marketing_campaign.csv
2025-09-22 02:29:02,517 - data_processing - INFO - Data loaded successfully. Shape: (2240, 29)
2025-09-22 02:29:02,518 - data_processing - INFO - Starting data cleaning...
2025-09-22 02:29:02,518 - data_processing - INFO - Original data shape: (2240, 29)
2025-09-22 02:29:02,521 - data_processing - INFO - Original missing values:
ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4  

Working directory changed to: C:\Users\GCV\churn_prediction


In [83]:
# Create RFM features
print(" STEP 2: Creating RFM features...")
rfm_df = processor.create_rfm_features()

2025-09-22 02:29:03,475 - data_processing - INFO - Creating RFM features...
2025-09-22 02:29:03,477 - data_processing - INFO - Snapshot date for RFM: 2014-06-30 00:00:00


 STEP 2: Creating RFM features...


2025-09-22 02:29:03,695 - data_processing - INFO - RFM features created. Shape: (2240, 4)
2025-09-22 02:29:03,716 - data_processing - INFO - RFM stats:
                 ID      Recency  Frequency       Monetary
count   2240.000000  2240.000000     2240.0    2240.000000
mean    5592.159821   354.582143        1.0   52237.975446
std     3246.662198   202.122512        0.0   25037.955891
min        0.000000     1.000000        1.0    1730.000000
25%     2828.250000   181.750000        1.0   35538.750000
50%     5458.500000   356.500000        1.0   51381.500000
75%     8427.750000   530.000000        1.0   68289.750000
max    11191.000000   700.000000        1.0  666666.000000


In [84]:
# Create churn labels
print(" STEP 3: Creating churn labels...")
rfm_with_churn = processor.create_churn_label(recency_threshold=90)
print(" Data processing completed!")
print(f"Final dataset shape: {rfm_with_churn.shape}")
print(f"Churn distribution:\n{rfm_with_churn['churn'].value_counts()}")

2025-09-22 02:29:04,657 - data_processing - INFO - Creating churn label with threshold: 90 days
2025-09-22 02:29:04,661 - data_processing - INFO - Churn distribution:
churn
1    1960
0     280
Name: count, dtype: int64
2025-09-22 02:29:04,663 - data_processing - INFO - Churn rate: 87.50%


 STEP 3: Creating churn labels...
 Data processing completed!
Final dataset shape: (2240, 5)
Churn distribution:
churn
1    1960
0     280
Name: count, dtype: int64


In [85]:
# Initialize clustering
clustering = CustomerClustering(random_state=42)

In [87]:
# Prepare data for clustering
print(" STEP 4: Preparing data for clustering...")
X_scaled = clustering.prepare_data(rfm_df, scale_features=True)

2025-09-22 02:29:21,441 - clustering - INFO - Preparing data for clustering...
2025-09-22 02:29:21,443 - clustering - INFO - Data shape: (2240, 3)
2025-09-22 02:29:21,446 - clustering - INFO - Features scaled using StandardScaler


 STEP 4: Preparing data for clustering...


In [88]:
# Find optimal number of clusters
print(" STEP 5: Finding optimal clusters...")
optimal_info = clustering.find_optimal_clusters(X_scaled, max_clusters=8)

2025-09-22 02:29:23,840 - clustering - INFO - Finding optimal number of clusters...
2025-09-22 02:29:23,842 - clustering - INFO - Testing k = 2...
2025-09-22 02:29:23,999 - clustering - INFO - Testing k = 3...


 STEP 5: Finding optimal clusters...


2025-09-22 02:29:24,180 - clustering - INFO - Testing k = 4...
2025-09-22 02:29:24,337 - clustering - INFO - Testing k = 5...
2025-09-22 02:29:24,503 - clustering - INFO - Testing k = 6...
2025-09-22 02:29:24,666 - clustering - INFO - Testing k = 7...
2025-09-22 02:29:24,837 - clustering - INFO - Testing k = 8...
2025-09-22 02:29:25,017 - clustering - INFO - Optimal number of clusters: 2
2025-09-22 02:29:25,018 - clustering - INFO - Best silhouette score: 0.3904


In [89]:
# Perform clustering with optimal k
print(" STEP 6: Performing clustering...")
cluster_labels = clustering.perform_clustering(X_scaled, n_clusters=optimal_info['optimal_k'])

2025-09-22 02:29:27,004 - clustering - INFO - Performing K-means clustering with 2 clusters...
2025-09-22 02:29:27,144 - clustering - INFO - Clustering completed. Silhouette score: 0.3904
2025-09-22 02:29:27,146 - clustering - INFO - Cluster distribution:
1    1135
0    1105
Name: count, dtype: int64


 STEP 6: Performing clustering...


In [90]:
# Analyze clusters
print(" STEP 7: Analyzing clusters...")
clustered_data, cluster_stats, cluster_sizes = clustering.analyze_clusters(rfm_df, cluster_labels)

2025-09-22 02:29:30,504 - clustering - INFO - Analyzing cluster characteristics...
2025-09-22 02:29:30,514 - clustering - INFO - Cluster statistics:
2025-09-22 02:29:30,519 - clustering - INFO - 
        Recency                   Frequency               Monetary            \
           mean     std  min  max      mean  std min max      mean       std   
Cluster                                                                        
0        177.61  101.48    1  393       1.0  0.0   1   1  53396.86  28500.39   
1        526.87  102.03  335  700       1.0  0.0   1   1  51109.73  21080.81   

                           
            min       max  
Cluster                    
0        1730.0  666666.0  
1        2447.0  160803.0  
2025-09-22 02:29:30,521 - clustering - INFO - 
Cluster sizes:
         Size  Percentage
Cluster                  
1        1135       50.67
0        1105       49.33


 STEP 7: Analyzing clusters...


In [94]:
import os
from src.clustering import CustomerClustering
import pandas as pd
import numpy as np

# Change to the project root directory if needed
try:
    os.chdir(r'C:\Users\GCV\churn_prediction')
    print(f"Working directory changed to: {os.getcwd()}")
except FileNotFoundError:
    print("Warning: Could not change directory. Please check the path.")

# Create a dummy RFM DataFrame for demonstration
np.random.seed(42)
n_samples = 1000
rfm_data = pd.DataFrame({
    'Recency': np.random.gamma(2, 20, n_samples),
    'Frequency': np.random.poisson(3, n_samples),
    'Monetary': np.random.lognormal(3, 1, n_samples)
})
rfm_data.index.name = 'CustomerID'

# Instantiate the clustering class
clustering = CustomerClustering(random_state=42)

# Prepare and find optimal clusters
X_scaled = clustering.prepare_data(rfm_data, scale_features=True)
optimal_info = clustering.find_optimal_clusters(X_scaled, max_clusters=8)
optimal_k = optimal_info['optimal_k']

# Perform clustering and analyze results
cluster_labels = clustering.perform_clustering(X_scaled, n_clusters=optimal_k)
clustered_data, cluster_stats, cluster_sizes = clustering.analyze_clusters(rfm_data, cluster_labels)

# Save the clustering results to the correct output directory
clustering.save_clustering_results(clustered_data, output_dir="clustering_results")

print("✅ Clustering process completed and results saved.")

Error: Required files not found. Please ensure `clustering.py` has run correctly.


FileNotFoundError: [Errno 2] No such file or directory: 'clustering_results/clustered_customers.csv'

In [91]:
# Now import the selector
import os
import sys

# Change the current working directory to the project's root.
# This assumes your notebook is in a subdirectory (e.g., 'notebooks/').
# The '../' moves up one directory.
# If your notebook is in the root, this line is not needed.
os.chdir(r'C:\Users\GCV\churn_prediction')
print(f"Current working directory: {os.getcwd()}")

# Now that we're in the right directory, import the module.
from src.model_selection import ModelSelector

Current working directory: C:\Users\GCV\churn_prediction


In [19]:


# Step 2: Initialize the selector
selector = ModelSelector(config_path="configs/model_config.json")  # specify your config path

# Step 3: Select the best model
best_name, best_model, all_results = selector.select_best_model()

# Step 4: Save the results and best model
selector.save_results()

# Step 5: Inspect results
print(f"Best model: {best_name}")
print(f"AUC Score: {all_results[best_name]['test_auc']:.4f}")

# Optional: See all evaluation metrics
for model, metrics in all_results.items():
    print(f"\nModel: {model}")
    for metric, value in metrics.items():
        if metric != 'model':
            print(f"  {metric}: {value}")



2025-09-22 00:49:43,868 - src.model_selection - INFO - Loaded data: 2240 samples, 3 features
2025-09-22 00:49:43,874 - src.model_selection - INFO - 🔍 Evaluating RandomForest...


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


2025-09-22 00:53:31,225 - src.model_selection - INFO - RandomForest best params: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
2025-09-22 00:53:32,219 - src.model_selection - INFO -     RandomForest: AUC = 1.0000 ± 0.0000
2025-09-22 00:53:32,220 - src.model_selection - INFO - 🔍 Evaluating XGBoost...


Fitting 5 folds for each of 81 candidates, totalling 405 fits


2025-09-22 00:53:38,437 - src.model_selection - INFO - XGBoost best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
2025-09-22 00:53:38,569 - src.model_selection - INFO -     XGBoost: AUC = 1.0000 ± 0.0000
2025-09-22 00:53:38,571 - src.model_selection - INFO - 🔍 Evaluating LogisticRegression...


Fitting 5 folds for each of 20 candidates, totalling 100 fits


2025-09-22 00:53:41,516 - src.model_selection - INFO - LogisticRegression best params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
2025-09-22 00:53:41,601 - src.model_selection - INFO -     LogisticRegression: AUC = 1.0000 ± 0.0001
2025-09-22 00:53:41,602 - src.model_selection - INFO - 🏆 BEST MODEL: RandomForest (AUC: 1.0000)
2025-09-22 00:53:41,672 - src.model_selection - INFO - ✅ Saved RandomForest as best model


Best model: RandomForest
AUC Score: 1.0000

Model: RandomForest
  cv_mean: 1.0
  cv_std: 0.0
  test_auc: 1.0
  test_accuracy: 1.0
  test_f1: 1.0
  test_precision: 1.0
  test_recall: 1.0

Model: XGBoost
  cv_mean: 1.0
  cv_std: 0.0
  test_auc: 1.0
  test_accuracy: 0.875
  test_f1: 0.9333333333333333
  test_precision: 0.875
  test_recall: 1.0

Model: LogisticRegression
  cv_mean: 0.9999005078973944
  cv_std: 0.00010587305099591048
  test_auc: 1.0
  test_accuracy: 1.0
  test_f1: 1.0
  test_precision: 1.0
  test_recall: 1.0


In [20]:
# Correct import statement to get the function
from src.model_optimizers.random_forest_optimizer import optimize_random_forest

# You no longer need to instantiate a class, as the logic is in a function.
# The function requires a model_path and a config dictionary.
# You need to define these variables before calling the function.
model_path = "models/Best_Churn_Model.pkl"  # Example path
config = {
    "optimization": {
        "max_trees_production": 50  # Example config value
    }
}

print("⚡ STEP 11: Optimizing Random Forest for production...")

# Correct function call
optimized_rf = optimize_random_forest(model_path, config)

print("✅ Optimization complete.")

⚡ STEP 11: Optimizing Random Forest for production...
✅ Optimization complete.


In [22]:
# 1. Load the data using the ModelSelector instance
# Assuming 'selector' has already been instantiated. If not, do so first.
# from src.model_selection import ModelSelector
# selector = ModelSelector(config_path="configs/model_config.json")

X, y = selector.load_data()

# 2. Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, # Or use the value from your config
    random_state=42, # Or use the value from your config
    stratify=y
)

# 3. Now, you can safely evaluate the optimized model
optimized_results = selector.evaluate_model(
    model=optimized_rf,
    model_name='RandomForestOptimized',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test
)

print("✅ Model selection and optimization completed!")

2025-09-22 00:56:30,275 - src.model_selection - INFO - Loaded data: 2240 samples, 3 features


✅ Model selection and optimization completed!


In [23]:
# 1. Import the necessary class
from src.model_selection import ModelSelector
import pandas as pd

# 2. Instantiate the ModelSelector class
selector = ModelSelector(config_path="configs/model_config.json")

# 3. Call the method to get the results and assign them to a variable
best_model_name, best_model, all_results = selector.select_best_model()

# 4. Get feature names from the loaded data for the feature importance display
X, _ = selector.load_data()
feature_names = X.columns

print("=== MODEL PERFORMANCE ===")
for model_name, metrics in all_results.items():
    print(f"\n{model_name.upper()}:")
    # Use the correct keys from the metrics dictionary
    print(f" Accuracy: {metrics['test_accuracy']:.4f}")
    print(f" Precision: {metrics['test_precision']:.4f}")
    print(f" Recall: {metrics['test_recall']:.4f}")
    print(f" F1-Score: {metrics['test_f1']:.4f}")
    print(f" ROC-AUC: {metrics['test_auc']:.4f}")
    
    # Show feature importance for relevant models
    if 'RandomForest' in model_name:
        # Check if the model has feature_importances_ and if feature_names are defined
        if hasattr(metrics.get('model'), 'feature_importances_') and feature_names is not None:
            # Create a list of (feature, importance) tuples and sort by importance
            feature_importances = list(zip(feature_names, metrics['model'].feature_importances_))
            feature_importances.sort(key=lambda x: x[1], reverse=True)
            print(" Top Features:")
            for feature, importance in feature_importances[:3]:
                print(f"   - {feature}: {importance:.4f}")


2025-09-22 00:56:35,340 - src.model_selection - INFO - Loaded data: 2240 samples, 3 features
2025-09-22 00:56:35,347 - src.model_selection - INFO - 🔍 Evaluating RandomForest...


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


2025-09-22 01:00:18,393 - src.model_selection - INFO - RandomForest best params: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
2025-09-22 01:00:19,464 - src.model_selection - INFO -     RandomForest: AUC = 1.0000 ± 0.0000
2025-09-22 01:00:19,465 - src.model_selection - INFO - 🔍 Evaluating XGBoost...


Fitting 5 folds for each of 81 candidates, totalling 405 fits


2025-09-22 01:00:23,756 - src.model_selection - INFO - XGBoost best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
2025-09-22 01:00:23,877 - src.model_selection - INFO -     XGBoost: AUC = 1.0000 ± 0.0000
2025-09-22 01:00:23,879 - src.model_selection - INFO - 🔍 Evaluating LogisticRegression...


Fitting 5 folds for each of 20 candidates, totalling 100 fits


2025-09-22 01:00:26,419 - src.model_selection - INFO - LogisticRegression best params: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
2025-09-22 01:00:26,500 - src.model_selection - INFO -     LogisticRegression: AUC = 1.0000 ± 0.0001
2025-09-22 01:00:26,502 - src.model_selection - INFO - 🏆 BEST MODEL: RandomForest (AUC: 1.0000)
2025-09-22 01:00:26,511 - src.model_selection - INFO - Loaded data: 2240 samples, 3 features


=== MODEL PERFORMANCE ===

RANDOMFOREST:
 Accuracy: 1.0000
 Precision: 1.0000
 Recall: 1.0000
 F1-Score: 1.0000
 ROC-AUC: 1.0000
 Top Features:
   - Recency: 0.9794
   - Monetary: 0.0206
   - Frequency: 0.0000

XGBOOST:
 Accuracy: 0.8750
 Precision: 0.8750
 Recall: 1.0000
 F1-Score: 0.9333
 ROC-AUC: 1.0000

LOGISTICREGRESSION:
 Accuracy: 1.0000
 Precision: 1.0000
 Recall: 1.0000
 F1-Score: 1.0000
 ROC-AUC: 1.0000


In [47]:
# Generate business insights
from src.insights.business_insights import BusinessInsights
print("💡 STEP 12: Generating business insights...")
insights = BusinessInsights()

💡 STEP 12: Generating business insights...


In [92]:
import os
import sys

# Change to the project root directory
try:
    os.chdir(r'C:\Users\GCV\churn_prediction')
    print(f"Working directory changed to: {os.getcwd()}")
except FileNotFoundError:
    print("Warning: Could not change directory. Please check the path.")

# Add the project root to the Python path
sys.path.append(os.getcwd())

# Import the necessary class
from src.insights.business_insights import BusinessInsights

# Instantiate the BusinessInsights class
insights = BusinessInsights()

# Load the model information
insights.load_model_info()

# Calculate and get the feature importance DataFrame
feature_importance_df = insights.calculate_feature_importance()

# Plot the feature importance and save the file
insights.plot_feature_importance(feature_importance_df, 'output/feature_importance.png')

# Display a message to confirm the plot was created
print("Feature importance plot has been generated.")



2025-09-22 02:30:54,149 - src.insights.business_insights - INFO - Generating insights for RandomForest model


Working directory changed to: C:\Users\GCV\churn_prediction


2025-09-22 02:30:54,515 - src.insights.business_insights - INFO - Feature importance plot saved to output/feature_importance.png


Feature importance plot has been generated.


In [93]:
import os
import sys
import pandas as pd
from src.insights.business_insights import BusinessInsights
from src.model_selection import ModelSelector
from sklearn.cluster import KMeans

# Change to the project root directory
try:
    os.chdir(r'C:\Users\GCV\churn_prediction')
    print(f"Working directory changed to: {os.getcwd()}")
except FileNotFoundError:
    print("Warning: Could not change directory. Please check the path.")

# Add the project root to the Python path
sys.path.append(os.getcwd())

# 1. Load Data
selector = ModelSelector(config_path="configs/model_config.json")
X, y = selector.load_data()
X = X.reset_index(drop=True)
customer_ids = X.index.values # Assuming the index is the customer_id
feature_names = ['Recency', 'Frequency', 'Monetary']
X_features = X[feature_names]

# 2. Perform K-Means Clustering for Segmentation
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X_features)
X_features['segment'] = kmeans.labels_

# 3. Prepare data for analysis
clustered_data = X_features.copy()
clustered_data['customer_id'] = customer_ids

# 4. Instantiate BusinessInsights and load the model
insights = BusinessInsights()
insights.load_model_info()

# 5. Analyze the segments using the new method
# 'optimized_rf' and 'X' should be defined in a previous cell
# 'X' here is used as the features for the model, as per your code
# You'll need to make sure 'optimized_rf' is available
segmentation_insights = insights.analyze_segments(clustered_data, insights.model, feature_names)

# Display the results
print("\n=== Customer Segmentation Insights ===")
print(segmentation_insights)

2025-09-22 02:31:01,659 - src.model_selection - INFO - Loaded data: 2240 samples, 3 features
2025-09-22 02:31:01,831 - src.insights.business_insights - INFO - Generating insights for RandomForest model


Working directory changed to: C:\Users\GCV\churn_prediction


AttributeError: 'BusinessInsights' object has no attribute 'analyze_segments'