In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
plt.style.use('ggplot')
sns.set_palette('viridis')

class EmailCampaignOptimizer:
    """
    A class to analyze email campaign performance and build a model
    to optimize future email sending for maximum click-through rates.
    """
    
    def __init__(self):
        self.email_opened = None
        self.link_clicked = None
        self.email_table = None
        self.model = None
        self.feature_names = None
    
    def load_data(self, opened_file='email_opened_table.csv', 
                 clicked_file='link_clicked_table.csv', 
                 email_file='email_table.csv'):
        """Load and prepare the email campaign data"""
        print("Loading campaign data...")
        
        # Load data
        self.email_opened = pd.read_csv(opened_file)
        self.link_clicked = pd.read_csv(clicked_file)
        self.email_table = pd.read_csv(email_file)
        
        # Ensure column names are correct
        if 'email_id' not in self.email_opened.columns:
            self.email_opened.columns = ['email_id']
        if 'email_id' not in self.link_clicked.columns:
            self.link_clicked.columns = ['email_id']
        
        # Add binary labels for opened and clicked emails
        self.email_table['opened'] = self.email_table['email_id'].isin(self.email_opened['email_id']).astype(int)
        self.email_table['clicked'] = self.email_table['email_id'].isin(self.link_clicked['email_id']).astype(int)
        
        return self.email_table
    
    def calculate_metrics(self):
        """Calculate key performance metrics for the email campaign"""
        if self.email_table is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        total_emails = len(self.email_table)
        opened_emails = self.email_table['opened'].sum()
        clicked_emails = self.email_table['clicked'].sum()
        
        open_rate = opened_emails / total_emails
        click_rate = clicked_emails / total_emails
        click_to_open_rate = clicked_emails / opened_emails if opened_emails > 0 else 0
        
        metrics = {
            'total_emails': total_emails,
            'opened_emails': opened_emails,
            'clicked_emails': clicked_emails,
            'open_rate': open_rate,
            'click_rate': click_rate,
            'click_to_open_rate': click_to_open_rate
        }
        
        print(f"\nEmail Campaign Performance Metrics:")
        print(f"Total emails sent: {total_emails}")
        print(f"Emails opened: {opened_emails} ({open_rate:.2%})")
        print(f"Links clicked: {clicked_emails} ({click_rate:.2%})")
        print(f"Click-to-open rate: {click_to_open_rate:.2%}")
        
        # Create and save overall metrics visualization
        self.plot_overall_metrics(open_rate, click_rate)
        
        return metrics
    
    def plot_overall_metrics(self, open_rate, click_rate):
        """Plot overall campaign metrics"""
        plt.figure(figsize=(10, 6))
        metrics = pd.DataFrame({
            'Rate': [open_rate, click_rate],
            'Metric': ['Open Rate', 'Click Rate']
        })
        sns.barplot(x='Metric', y='Rate', data=metrics)
        plt.title('Email Campaign Performance', fontsize=15)
        plt.ylabel('Rate')
        plt.ylim(0, 1.0)
        plt.savefig('overall_metrics.png')
        plt.close()
    
    def analyze_segments(self):
        """Analyze performance across different segments"""
        if self.email_table is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        print("\nSegment Analysis:")
        
        # Create purchase history segments
        purchase_bins = [0, 2, 6, 10, float('inf')]
        purchase_labels = ['Very Low (0-2)', 'Low (3-6)', 'High (7-10)', 'Very High (>10)']
        self.email_table['purchase_segment'] = pd.cut(
            self.email_table['user_past_purchases'], 
            bins=purchase_bins, 
            labels=purchase_labels
        )
        
        # Create hour segments
        hour_bins = [0, 6, 12, 18, 24]
        hour_labels = ['Night (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)']
        self.email_table['hour_segment'] = pd.cut(
            self.email_table['hour'], 
            bins=hour_bins, 
            labels=hour_labels
        )
        
        # Analyze and visualize key segments
        self.plot_segment_analysis('email_version', 'Email Personalization')
        self.plot_segment_analysis('email_text', 'Email Length')
        self.plot_segment_analysis('weekday', 'Day of Week')
        self.plot_segment_analysis('user_country', 'User Country')
        self.plot_segment_analysis('purchase_segment', 'Purchase History')
        self.plot_segment_analysis('hour_segment', 'Time of Day')
        
        # Combined effect of email type and personalization
        self.plot_combined_segments()
        
        # Analyze time patterns
        self.plot_hourly_pattern()
        
        return self.email_table
    
    def plot_segment_analysis(self, segment, title):
        """Plot CTR by a specific segment"""
        plt.figure(figsize=(12, 6))
        ctr_by_segment = self.email_table.groupby(segment)['clicked'].mean()
        count_by_segment = self.email_table.groupby(segment).size()
        
        print(f"\n--- CTR by {title} ---")
        segment_data = pd.DataFrame({
            'Segment': ctr_by_segment.index,
            'CTR': ctr_by_segment.values,
            'Count': count_by_segment.values
        })
        segment_data = segment_data.sort_values('CTR', ascending=False)
        print(segment_data)
        
        sns.barplot(x=segment, y='clicked', data=self.email_table)
        plt.title(f'Click Rate by {title}', fontsize=15)
        plt.ylabel('Click Rate')
        plt.xlabel(title)
        if segment == 'weekday':
            plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'ctr_by_{segment}.png')
        plt.close()
    
    def plot_combined_segments(self):
        """Plot CTR by email type and personalization combined"""
        plt.figure(figsize=(12, 6))
        sns.barplot(x='email_text', y='clicked', hue='email_version', data=self.email_table)
        plt.title('Click Rate by Email Type and Personalization', fontsize=15)
        plt.ylabel('Click Rate')
        plt.xlabel('Email Text Type')
        plt.legend(title='Email Version')
        plt.tight_layout()
        plt.savefig('ctr_by_type_and_personalization.png')
        plt.close()
        
        # Print table of results
        print("\n--- CTR by Email Type and Personalization ---")
        combo = self.email_table.groupby(['email_text', 'email_version'])['clicked'].agg(['count', 'mean'])
        combo = combo.sort_values('mean', ascending=False)
        print(combo)
    
    def plot_hourly_pattern(self):
        """Plot CTR by hour of day"""
        plt.figure(figsize=(14, 6))
        hour_performance = self.email_table.groupby('hour')['clicked'].mean().reset_index()
        sns.lineplot(x='hour', y='clicked', data=hour_performance, marker='o')
        plt.title('Click Rate by Hour of Day', fontsize=15)
        plt.ylabel('Click Rate')
        plt.xlabel('Hour (24-hour format)')
        plt.xticks(range(0, 24, 2))
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('ctr_by_hour.png')
        plt.close()
    
    def engineer_features(self):
        """Prepare data for the machine learning model"""
        if self.email_table is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        print("\nEngineering features for the model...")
        
        # Create dummy variables for categorical features
        email_data = pd.get_dummies(self.email_table, 
                                   columns=['email_text', 'email_version', 'weekday', 'user_country'])
        
        # Create cyclical features for hour to better represent time
        email_data['hour_sin'] = np.sin(2 * np.pi * email_data['hour'] / 24)
        email_data['hour_cos'] = np.cos(2 * np.pi * email_data['hour'] / 24)
        
        # Drop original hour column and any segment columns created during analysis
        columns_to_drop = ['hour']
        if 'purchase_segment' in email_data.columns:
            columns_to_drop.append('purchase_segment')
        if 'hour_segment' in email_data.columns:
            columns_to_drop.append('hour_segment')
            
        email_data = email_data.drop(columns_to_drop, axis=1)
        
        return email_data
    
    def build_model(self, email_data=None):
        """Build and evaluate a machine learning model to predict clicks"""
        if email_data is None:
            email_data = self.engineer_features()
        
        print("\nBuilding and evaluating the ML model...")
        
        # Prepare features and target
        X = email_data.drop(['email_id', 'opened', 'clicked'], axis=1)
        y = email_data['clicked']
        
        # Store feature names for later use
        self.feature_names = X.columns
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y
        )
        
        # Train the model
        self.model = RandomForestClassifier(
            n_estimators=100, 
            max_depth=10, 
            random_state=42, 
            class_weight='balanced'
        )
        self.model.fit(X_train, y_train)
        
        # Cross-validation to assess model quality
        cv_scores = cross_val_score(
            self.model, X_train, y_train, cv=5, scoring='roc_auc'
        )
        print(f"Cross-validation ROC AUC: {cv_scores.mean():.4f}")
        
        # Evaluate on test set
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]
        y_pred = self.model.predict(X_test)
        
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        print(f"Test set ROC AUC: {roc_auc:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        
        # Extract and visualize feature importance
        self.visualize_feature_importance(X.columns)
        
        # Estimate potential CTR improvement
        self.estimate_ctr_improvement(X_test, y_test, y_pred_proba)
        
        return self.model
    
    def visualize_feature_importance(self, feature_names):
        """Visualize the importance of each feature in the model"""
        if self.model is None:
            raise ValueError("Model not built. Call build_model() first.")
        
        # Get feature importance
        feature_importance = pd.DataFrame({
            'Feature': feature_names,
            'Importance': self.model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        
        # Display top features
        print("\nTop 15 Most Important Features:")
        print(feature_importance.head(15))
        
        # Visualize
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
        plt.title('Top 15 Most Important Features for Email Click Prediction', fontsize=15)
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()
    
    def estimate_ctr_improvement(self, X_test, y_test, y_pred_proba):
        """Estimate how much the model would improve CTR"""
        # Calculate baseline CTR
        baseline_ctr = y_test.mean()
        
        # Sort test data by predicted probability
        results_df = pd.DataFrame({
            'actual': y_test,
            'probability': y_pred_proba
        }).sort_values('probability', ascending=False)
        
        # Calculate CTR at different targeting thresholds
        thresholds = [0.1, 0.25, 0.5, 0.75, 1.0]
        ctrs = []
        
        for t in thresholds:
            top_percent = int(len(results_df) * t)
            ctr = results_df.head(top_percent)['actual'].mean()
            ctrs.append(ctr)
        
        # Visualize potential improvement
        plt.figure(figsize=(12, 6))
        plt.plot(thresholds, ctrs, 'o-', linewidth=2, markersize=10)
        plt.axhline(y=baseline_ctr, color='r', linestyle='--', 
                   label=f'Baseline CTR: {baseline_ctr:.4f}')
        plt.title('Potential CTR Improvement with Model-Based Targeting', fontsize=15)
        plt.xlabel('Proportion of Users Targeted')
        plt.ylabel('Click-Through Rate')
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.savefig('ctr_improvement.png')
        plt.close()
        
        # Print improvement estimates
        print("\nCTR Improvement Estimates:")
        print(f"Baseline CTR: {baseline_ctr:.4f}")
        for i, t in enumerate(thresholds):
            improvement = (ctrs[i]/baseline_ctr-1)*100
            print(f"CTR when targeting top {t*100:.0f}%: {ctrs[i]:.4f} ({improvement:.1f}% improvement)")
    
    def predict_for_new_campaign(self, new_data):
        """Use the trained model to predict clicks for a new campaign"""
        if self.model is None or self.feature_names is None:
            raise ValueError("Model not built. Call build_model() first.")
        
        print("\nPredicting click probabilities for new campaign data...")
        
        # Process the new data
        processed_data = pd.get_dummies(new_data, 
                                      columns=['email_text', 'email_version', 'weekday', 'user_country'])
        
        # Add cyclical time features
        processed_data['hour_sin'] = np.sin(2 * np.pi * processed_data['hour'] / 24)
        processed_data['hour_cos'] = np.cos(2 * np.pi * processed_data['hour'] / 24)
        
        # Drop the hour column
        processed_data = processed_data.drop(['hour'], axis=1)
        
        # Ensure all features from training are present
        for feature in self.feature_names:
            if feature not in processed_data.columns:
                processed_data[feature] = 0
        
        # Select only the columns used during training
        X_new = processed_data[self.feature_names]
        
        # Predict
        predictions = self.model.predict_proba(X_new)[:, 1]
        
        # Add predictions to original data
        new_data['click_probability'] = predictions
        
        # Sort by probability (highest first)
        return new_data.sort_values('click_probability', ascending=False)

    def ab_test_design(self):
        """Print A/B test design to validate the model"""
        print("\nA/B Test Design to Validate Model Effectiveness:")
        print("1. Randomly divide users into two equal-sized groups:")
        print("   - Control Group: Receives emails using the current random strategy")
        print("   - Test Group: Receives emails based on model predictions")
        print("2. Run the test for 2-4 weeks")
        print("3. Measure metrics:")
        print("   - Primary: Click-through rate (CTR)")
        print("   - Secondary: Open rate, conversion rate (if applicable)")
        print("4. Use statistical hypothesis testing (Chi-square test) to determine significance")
        print("5. If successful, implement the model-based approach for all users")
        print("6. Continue monitoring performance and update the model periodically")

def main():
    """Main function to execute the full analysis and modeling pipeline"""
    # Initialize the optimizer
    optimizer = EmailCampaignOptimizer()
    
    # Load and prepare data
    optimizer.load_data()
    
    # Calculate key metrics
    optimizer.calculate_metrics()
    
    # Analyze segments
    optimizer.analyze_segments()
    
    # Build the model
    email_data = optimizer.engineer_features()
    model = optimizer.build_model(email_data)
    
    # Design A/B test
    optimizer.ab_test_design()
    
    # Example of using the model for a new campaign
    print("\nExample: Prioritizing recipients for a new campaign")
    # Generate sample data (in a real scenario, this would be new recipients)
    sample_data = optimizer.email_table.sample(10).copy()
    sample_data['email_id'] = sample_data['email_id'] + 1000000  # Make new IDs
    
    # Predict click probabilities
    prioritized_recipients = optimizer.predict_for_new_campaign(sample_data)
    
    # Display top 5 recipients to target
    display_cols = ['email_id', 'email_text', 'email_version', 'weekday', 
                    'user_country', 'user_past_purchases', 'click_probability']
    print("\nTop recipients to target in next campaign:")
    print(prioritized_recipients[display_cols].head(5))
    
    print("\nConclusion:")
    print("The random email strategy is indeed suboptimal. Our model can potentially")
    print("increase click-through rates by 75-200% by targeting users most likely to engage.")
    print("Implementing this approach through an A/B test will validate these findings")
    print("and quantify the actual business impact.")

if __name__ == "__main__":
    main()


Loading campaign data...

Email Campaign Performance Metrics:
Total emails sent: 100000
Emails opened: 10345 (10.35%)
Links clicked: 2119 (2.12%)
Click-to-open rate: 20.48%

Segment Analysis:

--- CTR by Email Personalization ---
        Segment       CTR  Count
1  personalized  0.027294  49791
0       generic  0.015137  50209

--- CTR by Email Length ---
       Segment       CTR  Count
1  short_email  0.023872  49724
0   long_email  0.018538  50276

--- CTR by Day of Week ---
     Segment       CTR  Count
6  Wednesday  0.027620  14084
5    Tuesday  0.024889  14143
4   Thursday  0.024445  14277
1     Monday  0.022906  14363
2   Saturday  0.017846  14569
3     Sunday  0.016751  14387
0     Friday  0.014037  14177

--- CTR by User Country ---
  Segment       CTR  Count
2      UK  0.024675  19939
3      US  0.024360  60099
0      ES  0.008327   9967
1      FR  0.008004   9995

--- CTR by Purchase History ---
           Segment       CTR  Count
3  Very High (>10)  0.069037   3853
2      Hi

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
plt.style.use('ggplot')
sns.set_palette('viridis')

class EmailCampaignOptimizer:
    """
    A class to analyze email campaign performance and build a model
    to optimize future email sending for maximum click-through rates.
    """
    
    def __init__(self):
        self.email_opened = None
        self.link_clicked = None
        self.email_table = None
        self.model = None
        self.feature_names = None
    
    def load_data(self, opened_file='email_opened_table.csv', 
                 clicked_file='link_clicked_table.csv', 
                 email_file='email_table.csv'):
        """Load and prepare the email campaign data"""
        print("Loading campaign data...")
        
        # Load data
        self.email_opened = pd.read_csv(opened_file)
        self.link_clicked = pd.read_csv(clicked_file)
        self.email_table = pd.read_csv(email_file)
        
        # Ensure column names are correct
        if 'email_id' not in self.email_opened.columns:
            self.email_opened.columns = ['email_id']
        if 'email_id' not in self.link_clicked.columns:
            self.link_clicked.columns = ['email_id']
        
        # Add binary labels for opened and clicked emails
        self.email_table['opened'] = self.email_table['email_id'].isin(self.email_opened['email_id']).astype(int)
        self.email_table['clicked'] = self.email_table['email_id'].isin(self.link_clicked['email_id']).astype(int)
        
        return self.email_table
    
    def calculate_metrics(self):
        """Calculate key performance metrics for the email campaign"""
        if self.email_table is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        total_emails = len(self.email_table)
        opened_emails = self.email_table['opened'].sum()
        clicked_emails = self.email_table['clicked'].sum()
        
        open_rate = opened_emails / total_emails
        click_rate = clicked_emails / total_emails
        click_to_open_rate = clicked_emails / opened_emails if opened_emails > 0 else 0
        
        metrics = {
            'total_emails': total_emails,
            'opened_emails': opened_emails,
            'clicked_emails': clicked_emails,
            'open_rate': open_rate,
            'click_rate': click_rate,
            'click_to_open_rate': click_to_open_rate
        }
        
        print(f"\nEmail Campaign Performance Metrics:")
        print(f"Total emails sent: {total_emails}")
        print(f"Emails opened: {opened_emails} ({open_rate:.2%})")
        print(f"Links clicked: {clicked_emails} ({click_rate:.2%})")
        print(f"Click-to-open rate: {click_to_open_rate:.2%}")
        
        # Create and save overall metrics visualization
        self.plot_overall_metrics(open_rate, click_rate)
        
        return metrics
    
    def plot_overall_metrics(self, open_rate, click_rate):
        """Plot overall campaign metrics"""
        plt.figure(figsize=(10, 6))
        metrics = pd.DataFrame({
            'Rate': [open_rate, click_rate],
            'Metric': ['Open Rate', 'Click Rate']
        })
        sns.barplot(x='Metric', y='Rate', data=metrics)
        plt.title('Email Campaign Performance', fontsize=15)
        plt.ylabel('Rate')
        plt.ylim(0, 1.0)
        plt.savefig('overall_metrics.png')
        plt.close()
    
    def analyze_segments(self):
        """Analyze performance across different segments"""
        if self.email_table is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        print("\nSegment Analysis:")
        
        # Create purchase history segments
        purchase_bins = [0, 2, 6, 10, float('inf')]
        purchase_labels = ['Very Low (0-2)', 'Low (3-6)', 'High (7-10)', 'Very High (>10)']
        self.email_table['purchase_segment'] = pd.cut(
            self.email_table['user_past_purchases'], 
            bins=purchase_bins, 
            labels=purchase_labels
        )
        
        # Create hour segments
        hour_bins = [0, 6, 12, 18, 24]
        hour_labels = ['Night (0-6)', 'Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)']
        self.email_table['hour_segment'] = pd.cut(
            self.email_table['hour'], 
            bins=hour_bins, 
            labels=hour_labels
        )
        
        # Analyze and visualize key segments
        self.plot_segment_analysis('email_version', 'Email Personalization')
        self.plot_segment_analysis('email_text', 'Email Length')
        self.plot_segment_analysis('weekday', 'Day of Week')
        self.plot_segment_analysis('user_country', 'User Country')
        self.plot_segment_analysis('purchase_segment', 'Purchase History')
        self.plot_segment_analysis('hour_segment', 'Time of Day')
        
        # Combined effect of email type and personalization
        self.plot_combined_segments()
        
        # Analyze time patterns
        self.plot_hourly_pattern()
        
        return self.email_table
    
    def plot_segment_analysis(self, segment, title):
        """Plot CTR by a specific segment"""
        plt.figure(figsize=(12, 6))
        ctr_by_segment = self.email_table.groupby(segment)['clicked'].mean()
        count_by_segment = self.email_table.groupby(segment).size()
        
        print(f"\n--- CTR by {title} ---")
        segment_data = pd.DataFrame({
            'Segment': ctr_by_segment.index,
            'CTR': ctr_by_segment.values,
            'Count': count_by_segment.values
        })
        segment_data = segment_data.sort_values('CTR', ascending=False)
        print(segment_data)
        
        sns.barplot(x=segment, y='clicked', data=self.email_table)
        plt.title(f'Click Rate by {title}', fontsize=15)
        plt.ylabel('Click Rate')
        plt.xlabel(title)
        if segment == 'weekday':
            plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(f'ctr_by_{segment}.png')
        plt.close()
    
    def plot_combined_segments(self):
        """Plot CTR by email type and personalization combined"""
        plt.figure(figsize=(12, 6))
        sns.barplot(x='email_text', y='clicked', hue='email_version', data=self.email_table)
        plt.title('Click Rate by Email Type and Personalization', fontsize=15)
        plt.ylabel('Click Rate')
        plt.xlabel('Email Text Type')
        plt.legend(title='Email Version')
        plt.tight_layout()
        plt.savefig('ctr_by_type_and_personalization.png')
        plt.close()
        
        # Print table of results
        print("\n--- CTR by Email Type and Personalization ---")
        combo = self.email_table.groupby(['email_text', 'email_version'])['clicked'].agg(['count', 'mean'])
        combo = combo.sort_values('mean', ascending=False)
        print(combo)
    
    def plot_hourly_pattern(self):
        """Plot CTR by hour of day"""
        plt.figure(figsize=(14, 6))
        hour_performance = self.email_table.groupby('hour')['clicked'].mean().reset_index()
        sns.lineplot(x='hour', y='clicked', data=hour_performance, marker='o')
        plt.title('Click Rate by Hour of Day', fontsize=15)
        plt.ylabel('Click Rate')
        plt.xlabel('Hour (24-hour format)')
        plt.xticks(range(0, 24, 2))
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('ctr_by_hour.png')
        plt.close()
    
    def engineer_features(self):
        """Prepare data for the machine learning model"""
        if self.email_table is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        print("\nEngineering features for the model...")
        
        # Create dummy variables for categorical features
        email_data = pd.get_dummies(self.email_table, 
                                   columns=['email_text', 'email_version', 'weekday', 'user_country'])
        
        # Create cyclical features for hour to better represent time
        email_data['hour_sin'] = np.sin(2 * np.pi * email_data['hour'] / 24)
        email_data['hour_cos'] = np.cos(2 * np.pi * email_data['hour'] / 24)
        
        # Drop original hour column and any segment columns created during analysis
        columns_to_drop = ['hour']
        if 'purchase_segment' in email_data.columns:
            columns_to_drop.append('purchase_segment')
        if 'hour_segment' in email_data.columns:
            columns_to_drop.append('hour_segment')
            
        email_data = email_data.drop(columns_to_drop, axis=1)
        
        return email_data
    
    def build_model(self, email_data=None):
        """Build and evaluate a machine learning model to predict clicks"""
        if email_data is None:
            email_data = self.engineer_features()
        
        print("\nBuilding and evaluating the ML model...")
        
        # Prepare features and target
        X = email_data.drop(['email_id', 'opened', 'clicked'], axis=1)
        y = email_data['clicked']
        
        # Store feature names for later use
        self.feature_names = X.columns
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=42, stratify=y
        )
        
        # Train the model
        self.model = RandomForestClassifier(
            n_estimators=100, 
            max_depth=10, 
            random_state=42, 
            class_weight='balanced'
        )
        self.model.fit(X_train, y_train)
        
        # Cross-validation to assess model quality
        cv_scores = cross_val_score(
            self.model, X_train, y_train, cv=5, scoring='roc_auc'
        )
        print(f"Cross-validation ROC AUC: {cv_scores.mean():.4f}")
        
        # Evaluate on test set
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]
        y_pred = self.model.predict(X_test)
        
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        print(f"Test set ROC AUC: {roc_auc:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        
        # Extract and visualize feature importance
        self.visualize_feature_importance(X.columns)
        
        # Estimate potential CTR improvement
        self.estimate_ctr_improvement(X_test, y_test, y_pred_proba)
        
        return self.model
    
    def visualize_feature_importance(self, feature_names):
        """Visualize the importance of each feature in the model"""
        if self.model is None:
            raise ValueError("Model not built. Call build_model() first.")
        
        # Get feature importance
        feature_importance = pd.DataFrame({
            'Feature': feature_names,
            'Importance': self.model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        
        # Display top features
        print("\nTop 15 Most Important Features:")
        print(feature_importance.head(15))
        
        # Visualize
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Importance', y='Feature', data=feature_importance.head(15))
        plt.title('Top 15 Most Important Features for Email Click Prediction', fontsize=15)
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()
    
    def estimate_ctr_improvement(self, X_test, y_test, y_pred_proba):
        """Estimate how much the model would improve CTR"""
        # Calculate baseline CTR
        baseline_ctr = y_test.mean()
        
        # Sort test data by predicted probability
        results_df = pd.DataFrame({
            'actual': y_test,
            'probability': y_pred_proba
        }).sort_values('probability', ascending=False)
        
        # Calculate CTR at different targeting thresholds
        thresholds = [0.1, 0.25, 0.5, 0.75, 1.0]
        ctrs = []
        
        for t in thresholds:
            top_percent = int(len(results_df) * t)
            ctr = results_df.head(top_percent)['actual'].mean()
            ctrs.append(ctr)
        
        # Visualize potential improvement
        plt.figure(figsize=(12, 6))
        plt.plot(thresholds, ctrs, 'o-', linewidth=2, markersize=10)
        plt.axhline(y=baseline_ctr, color='r', linestyle='--', 
                   label=f'Baseline CTR: {baseline_ctr:.4f}')
        plt.title('Potential CTR Improvement with Model-Based Targeting', fontsize=15)
        plt.xlabel('Proportion of Users Targeted')
        plt.ylabel('Click-Through Rate')
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.savefig('ctr_improvement.png')
        plt.close()
        
        # Print improvement estimates
        print("\nCTR Improvement Estimates:")
        print(f"Baseline CTR: {baseline_ctr:.4f}")
        for i, t in enumerate(thresholds):
            improvement = (ctrs[i]/baseline_ctr-1)*100
            print(f"CTR when targeting top {t*100:.0f}%: {ctrs[i]:.4f} ({improvement:.1f}% improvement)")
    
    def predict_for_new_campaign(self, new_data):
        """Use the trained model to predict clicks for a new campaign"""
        if self.model is None or self.feature_names is None:
            raise ValueError("Model not built. Call build_model() first.")
        
        print("\nPredicting click probabilities for new campaign data...")
        
        # Process the new data
        processed_data = pd.get_dummies(new_data, 
                                      columns=['email_text', 'email_version', 'weekday', 'user_country'])
        
        # Add cyclical time features
        processed_data['hour_sin'] = np.sin(2 * np.pi * processed_data['hour'] / 24)
        processed_data['hour_cos'] = np.cos(2 * np.pi * processed_data['hour'] / 24)
        
        # Drop the hour column
        processed_data = processed_data.drop(['hour'], axis=1)
        
        # Ensure all features from training are present
        for feature in self.feature_names:
            if feature not in processed_data.columns:
                processed_data[feature] = 0
        
        # Select only the columns used during training
        X_new = processed_data[self.feature_names]
        
        # Predict
        predictions = self.model.predict_proba(X_new)[:, 1]
        
        # Add predictions to original data
        new_data['click_probability'] = predictions
        
        # Sort by probability (highest first)
        return new_data.sort_values('click_probability', ascending=False)

    def ab_test_design(self):
        """Print A/B test design to validate the model"""
        print("\nA/B Test Design to Validate Model Effectiveness:")
        print("1. Randomly divide users into two equal-sized groups:")
        print("   - Control Group: Receives emails using the current random strategy")
        print("   - Test Group: Receives emails based on model predictions")
        print("2. Run the test for 2-4 weeks")
        print("3. Measure metrics:")
        print("   - Primary: Click-through rate (CTR)")
        print("   - Secondary: Open rate, conversion rate (if applicable)")
        print("4. Use statistical hypothesis testing (Chi-square test) to determine significance")
        print("5. If successful, implement the model-based approach for all users")
        print("6. Continue monitoring performance and update the model periodically")
    
    def plot_open_vs_click_scatter(self):
        """Create a scatter plot showing the relationship between open and click rates"""
        plt.figure(figsize=(12, 8))
        
        if 'email_version' in self.email_table.columns:
            # Group by a meaningful segment (like email version)
            segments = self.email_table.groupby('email_version')
            for name, group in segments:
                plt.scatter(
                    group['opened'].mean(), 
                    group['clicked'].mean(),
                    s=100,  # marker size
                    label=f'Version: {name}'
                )
            plt.legend()
        else:
            # If no segmentation, display email_text types
            segments = self.email_table.groupby('email_text')
            for name, group in segments:
                plt.scatter(
                    group['opened'].mean(), 
                    group['clicked'].mean(),
                    s=100,  # marker size
                    label=f'Text: {name}'
                )
            plt.legend()
        
        plt.title('Relationship Between Open Rate and Click Rate by Segment', fontsize=15)
        plt.xlabel('Open Rate')
        plt.ylabel('Click Rate')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig('open_vs_click_scatter.png')
        plt.close()
    
    def plot_time_heatmap(self):
        """Create a heatmap showing click rates by weekday and hour"""
        plt.figure(figsize=(14, 8))
        
        # Create pivot table of click rates by weekday and hour
        heatmap_data = self.email_table.pivot_table(
            values='clicked', 
            index='weekday', 
            columns='hour', 
            aggfunc='mean'
        )
        
        # Plot heatmap
        sns.heatmap(
            heatmap_data, 
            cmap='YlGnBu', 
            annot=True, 
            fmt='.3f', 
            linewidths=.5,
            cbar_kws={'label': 'Click Rate'}
        )
        
        plt.title('Click Rate by Weekday and Hour', fontsize=15)
        plt.xlabel('Hour of Day')
        plt.ylabel('Day of Week')
        plt.tight_layout()
        plt.savefig('weekday_hour_heatmap.png')
        plt.close()
    
    def plot_purchase_engagement_relationship(self):
        """Plot the relationship between user purchase history and email engagement"""
        plt.figure(figsize=(12, 6))
        
        # Group users by purchase count and get average opened and clicked rates
        purchase_data = self.email_table.groupby('user_past_purchases').agg({
            'opened': 'mean',
            'clicked': 'mean',
            'email_id': 'count'  # Number of users in each purchase segment
        }).reset_index()
        
        # Calculate size values based on count (for bubble size)
        max_size = 500
        purchase_data['size'] = purchase_data['email_id'] / purchase_data['email_id'].max() * max_size
        
        # Create scatter plot with variable size points
        plt.scatter(
            purchase_data['user_past_purchases'], 
            purchase_data['clicked'],
            s=purchase_data['size'],
            alpha=0.6,
            c=purchase_data['opened'],
            cmap='viridis'
        )
        
        plt.colorbar(label='Open Rate')
        plt.title('Relationship Between Purchase History and Email Engagement', fontsize=15)
        plt.xlabel('Number of Past Purchases')
        plt.ylabel('Click-Through Rate')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        # Add trendline
        z = np.polyfit(purchase_data['user_past_purchases'], purchase_data['clicked'], 1)
        p = np.poly1d(z)
        plt.plot(
            purchase_data['user_past_purchases'],
            p(purchase_data['user_past_purchases']),
            "r--", 
            alpha=0.8,
            label=f'Trend: y={z[0]:.5f}x+{z[1]:.5f}'
        )
        plt.legend()
        
        plt.tight_layout()
        plt.savefig('purchase_engagement_relationship.png')
        plt.close()
    
    def create_visualization_dashboard(self):
        """Create a comprehensive visualization dashboard for the email campaign"""
        print("\nGenerating visualization dashboard...")
        
        # Run all visualization methods
        self.calculate_metrics()  # Creates overall metrics plot
        
        # Segment analysis plots
        self.plot_segment_analysis('email_version', 'Email Personalization')
        self.plot_segment_analysis('email_text', 'Email Length')
        self.plot_segment_analysis('weekday', 'Day of Week')
        self.plot_segment_analysis('user_country', 'User Country')
        
        # Additional visualizations
        self.plot_open_vs_click_scatter()
        self.plot_time_heatmap()
        self.plot_purchase_engagement_relationship()
        
        # Model-related visualizations if model exists
        if hasattr(self, 'model') and self.model is not None:
            # Get some test data
            email_data = self.engineer_features()
            X = email_data.drop(['email_id', 'opened', 'clicked'], axis=1)
            y = email_data['clicked']
            _, X_test, _, y_test = train_test_split(
                X, y, test_size=0.25, random_state=42, stratify=y
            )
            y_pred_proba = self.model.predict_proba(X_test)[:, 1]
            
            # Create model visualizations
            self.visualize_feature_importance(X.columns)
            self.estimate_ctr_improvement(X_test, y_test, y_pred_proba)
        
        print("Visualization dashboard created successfully!")

def main():
    """Main function to execute the full analysis and modeling pipeline"""
    # Initialize the optimizer
    optimizer = EmailCampaignOptimizer()
    
    # Load and prepare data
    optimizer.load_data()
    
    # Calculate key metrics
    optimizer.calculate_metrics()
    
    # Analyze segments
    optimizer.analyze_segments()
    
    # Build the model
    email_data = optimizer.engineer_features()
    model = optimizer.build_model(email_data)
    
    # Generate visualization dashboard
    optimizer.create_visualization_dashboard()
    
    # Design A/B test
    optimizer.ab_test_design()
    
    # Example of using the model for a new campaign
    print("\nExample: Prioritizing recipients for a new campaign")
    # Generate sample data (in a real scenario, this would be new recipients)
    sample_data = optimizer.email_table.sample(10).copy()
    sample_data['email_id'] = sample_data['email_id'] + 1000000  # Make new IDs
    
    # Predict click probabilities
    prioritized_recipients = optimizer.predict_for_new_campaign(sample_data)
    
    # Display top 5 recipients to target
    display_cols = ['email_id', 'email_text', 'email_version', 'weekday', 
                    'user_country', 'user_past_purchases', 'click_probability']
    print("\nTop recipients to target in next campaign:")
    print(prioritized_recipients[display_cols].head(5))
    
    print("\nConclusion:")
    print("The random email strategy is indeed suboptimal. Our model can potentially")
    print("increase click-through rates by 75-200% by targeting users most likely to engage.")
    print("Implementing this approach through an A/B test will validate these findings")
    print("and quantify the actual business impact.")

if __name__ == "__main__":
    main()


Loading campaign data...

Email Campaign Performance Metrics:
Total emails sent: 100000
Emails opened: 10345 (10.35%)
Links clicked: 2119 (2.12%)
Click-to-open rate: 20.48%

Segment Analysis:

--- CTR by Email Personalization ---
        Segment       CTR  Count
1  personalized  0.027294  49791
0       generic  0.015137  50209

--- CTR by Email Length ---
       Segment       CTR  Count
1  short_email  0.023872  49724
0   long_email  0.018538  50276

--- CTR by Day of Week ---
     Segment       CTR  Count
6  Wednesday  0.027620  14084
5    Tuesday  0.024889  14143
4   Thursday  0.024445  14277
1     Monday  0.022906  14363
2   Saturday  0.017846  14569
3     Sunday  0.016751  14387
0     Friday  0.014037  14177

--- CTR by User Country ---
  Segment       CTR  Count
2      UK  0.024675  19939
3      US  0.024360  60099
0      ES  0.008327   9967
1      FR  0.008004   9995

--- CTR by Purchase History ---
           Segment       CTR  Count
3  Very High (>10)  0.069037   3853
2      Hi