# Data Analysis and Logistic Regression with Customer Profiles

This notebook loads data directly from CSV files, integrates customer profiles from clustering analysis, and performs logistic regression to predict customer response behavior.

In [16]:
# Load all datasets directly from CSV files
import pandas as pd
import numpy as np

# Load contact history fact data
contact_history_raw = pd.read_csv('data/Contact_History_Fact_0.csv')
print(f"Loaded contact history data: {contact_history_raw.shape}")

# Load customer profiles mapping (generated from customers.ipynb)
customer_profiles = pd.read_csv('data/customer_profiles_mapping.csv')
print(f"Loaded customer profiles mapping: {customer_profiles.shape}")

# Load all related datasets
customer_score_raw = pd.read_csv('data/Customer_Score_0_RV.csv')
print(f"Loaded customer score data: {customer_score_raw.shape}")

campaign_raw = pd.read_csv('data/CH_Campaign_0_V.csv')
print(f"Loaded campaign data: {campaign_raw.shape}")

cell_raw = pd.read_csv('data/CH_Cell_0_V.csv')
print(f"Loaded cell data: {cell_raw.shape}")

offer_raw = pd.read_csv('data/CH_Offer_0_V.csv')
print(f"Loaded offer data: {offer_raw.shape}")

customer_fact_raw = pd.read_csv('data/Customer_Fact_0_V.csv')
print(f"Loaded customer fact data: {customer_fact_raw.shape}")

print("\nContact History Data Columns:")
print(contact_history_raw.columns.tolist())

print("\nCustomer Profiles Columns:")
print(customer_profiles.columns.tolist())

# Find columns ending with "key"
key_columns = [col for col in contact_history_raw.columns if col.endswith('key')]
print(f"\nColumns ending with 'key': {key_columns}")

print("\nFirst few rows of contact history data:")
print(contact_history_raw.head())

print("\nFirst few rows of customer profiles:")
print(customer_profiles.head())

Loaded contact history data: (1527978, 9)
Loaded customer profiles mapping: (190339, 2)
Loaded customer score data: (1047507, 25)
Loaded campaign data: (55, 7)
Loaded cell data: (17249, 11)
Loaded offer data: (19699, 5)
Loaded customer score data: (1047507, 25)
Loaded campaign data: (55, 7)
Loaded cell data: (17249, 11)
Loaded offer data: (19699, 5)
Loaded customer fact data: (1047507, 39)

Contact History Data Columns:
['date_key', 'customer_key', 'customer_score_key', 'ch_campaign_key', 'ch_cell_key', 'ch_offer_key', 'cch_responder_key', 'offer_amount_loc_currency', 'volume']

Customer Profiles Columns:
['customer_key', 'profile_name']

Columns ending with 'key': ['date_key', 'customer_key', 'customer_score_key', 'ch_campaign_key', 'ch_cell_key', 'ch_offer_key', 'cch_responder_key']

First few rows of contact history data:
              date_key  customer_key  customer_score_key  ch_campaign_key  \
0  2018-01-31 00:00:00       5973888           343229725             3671   
1  2018-0

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("Available datasets for merging:")
print(f"Customer profiles: {customer_profiles.shape}")
print(f"Customer score data: {customer_score_raw.shape}")
print(f"Campaign data: {campaign_raw.shape}")
print(f"Cell data: {cell_raw.shape}")
print(f"Offer data: {offer_raw.shape}")
print(f"Customer fact data: {customer_fact_raw.shape}")

# Check the key columns in each dataset
print("\nCustomer profiles columns:", customer_profiles.columns.tolist())
print("Customer score data columns:", customer_score_raw.columns.tolist())
print("Campaign data columns:", campaign_raw.columns.tolist())
print("Cell data columns:", cell_raw.columns.tolist())
print("Offer data columns:", offer_raw.columns.tolist())
print("Customer fact data columns:", customer_fact_raw.columns.tolist())

# Start with the contact history data
merged_data = contact_history_raw.copy()
print(f"\nStarting with contact history data: {merged_data.shape}")

# Merge with customer profiles on customer_key (instead of raw customer data)
merged_data = merged_data.merge(customer_profiles, on='customer_key', how='left')
print(f"After merging with customer profiles: {merged_data.shape}")

# Merge with customer score data on customer_score_key
merged_data = merged_data.merge(customer_score_raw, on='customer_score_key', how='left')
print(f"After merging with customer score data: {merged_data.shape}")

# Merge with campaign data on ch_campaign_key
merged_data = merged_data.merge(campaign_raw, on='ch_campaign_key', how='left')
print(f"After merging with campaign data: {merged_data.shape}")

# Merge with cell data on ch_cell_key
merged_data = merged_data.merge(cell_raw, on='ch_cell_key', how='left')
print(f"After merging with cell data: {merged_data.shape}")

# Merge with offer data on ch_offer_key
merged_data = merged_data.merge(offer_raw, on='ch_offer_key', how='left')
print(f"After merging with offer data: {merged_data.shape}")

print(f"\nFinal merged dataset shape: {merged_data.shape}")
print(f"Target variable (cch_responder_key) distribution:")
print(merged_data['cch_responder_key'].value_counts())

print(f"\nCustomer profile distribution in merged data:")
print(merged_data['profile_name'].value_counts())

Available datasets for merging:
Customer profiles: (190339, 2)
Customer score data: (1047507, 25)
Campaign data: (55, 7)
Cell data: (17249, 11)
Offer data: (19699, 5)
Customer fact data: (1047507, 39)

Customer profiles columns: ['customer_key', 'profile_name']
Customer score data columns: ['customer_score_key', 'TIBScore', 'FrequencyDecile', 'RecencyDecile', 'TotalIncomeDecile', 'TotalRetBalanceDecile', 'ATMChannelFlag', 'BranchChannelFlag', 'ITBChannelFlag', 'POSChannelFlag', 'TSChannelFlag', 'Age', 'NumProducts', 'ValidGenderFlag', 'OnlineRegFlag', 'OnlineActiveFlagW', 'OnlineActiveFlagM', 'num_atm_trans', 'num_branch_trans', 'num_online_trans', 'num_ivr_trans', 'num_pos_trans', 'PayrollFlag', 'num_ITB_bill_paymts_trans', 'num_offline_bill_pmt_tran']
Campaign data columns: ['ch_campaign_key', 'campaign_code', 'campaign_start_date', 'campaign_expiry_date', 'campaign_id', 'campaign_label', 'campaign_type']
Cell data columns: ['ch_cell_key', 'channel1', 'channel2', 'channel3', 'channel

In [18]:
# Prepare features and target for logistic regression
# Remove key columns that won't be used as features and the target
key_columns = ['date_key', 'customer_key', 'customer_score_key', 'ch_campaign_key',
               'ch_cell_key', 'ch_offer_key', 'cch_responder_key']

# Select feature columns (exclude keys and target)
feature_columns = [col for col in merged_data.columns if col not in key_columns]
print(f"Feature columns ({len(feature_columns)}):")
for i, col in enumerate(feature_columns):
    print(f"{i+1:2d}. {col}")

# Prepare X (features) and y (target)
X = merged_data[feature_columns].copy()
y = merged_data['cch_responder_key'].copy()

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Target distribution: {y.value_counts().to_dict()}")

# Check for missing values
print(f"\nMissing values per column:")
missing_counts = X.isnull().sum()
print(missing_counts[missing_counts > 0])

Feature columns (47):
 1. offer_amount_loc_currency
 2. volume
 3. profile_name
 4. TIBScore
 5. FrequencyDecile
 6. RecencyDecile
 7. TotalIncomeDecile
 8. TotalRetBalanceDecile
 9. ATMChannelFlag
10. BranchChannelFlag
11. ITBChannelFlag
12. POSChannelFlag
13. TSChannelFlag
14. Age
15. NumProducts
16. ValidGenderFlag
17. OnlineRegFlag
18. OnlineActiveFlagW
19. OnlineActiveFlagM
20. num_atm_trans
21. num_branch_trans
22. num_online_trans
23. num_ivr_trans
24. num_pos_trans
25. PayrollFlag
26. num_ITB_bill_paymts_trans
27. num_offline_bill_pmt_tran
28. campaign_code
29. campaign_start_date
30. campaign_expiry_date
31. campaign_id
32. campaign_label
33. campaign_type
34. channel1
35. channel2
36. channel3
37. channel4
38. channel5
39. channel6
40. channel7
41. channel8
42. channel9
43. iso_code
44. offer_condition
45. offer_product
46. offer_sub_product
47. offer_currency

Feature matrix shape: (1527978, 47)
Target vector shape: (1527978,)
Target distribution: {0: 1483051, 1: 44927}

Mis

In [19]:
# Identify numeric and categorical columns
print("Data types of features:")
print(X.dtypes.value_counts())

# Separate numeric and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'datetime64[ns]']).columns.tolist()

print(f"\nNumeric features ({len(numeric_features)}):")
print(numeric_features)

print(f"\nCategorical features ({len(categorical_features)}):")
print(categorical_features)

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Create the full pipeline with logistic regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])

print(f"\nPipeline created successfully!")
print("Pipeline steps:")
for step_name, step in pipeline.steps:
    print(f"  - {step_name}: {type(step).__name__}")

Data types of features:
object     35
int64      10
float64     2
Name: count, dtype: int64

Numeric features (12):
['offer_amount_loc_currency', 'volume', 'Age', 'NumProducts', 'num_atm_trans', 'num_branch_trans', 'num_online_trans', 'num_ivr_trans', 'num_pos_trans', 'num_ITB_bill_paymts_trans', 'num_offline_bill_pmt_tran', 'campaign_id']

Categorical features (35):
['profile_name', 'TIBScore', 'FrequencyDecile', 'RecencyDecile', 'TotalIncomeDecile', 'TotalRetBalanceDecile', 'ATMChannelFlag', 'BranchChannelFlag', 'ITBChannelFlag', 'POSChannelFlag', 'TSChannelFlag', 'ValidGenderFlag', 'OnlineRegFlag', 'OnlineActiveFlagW', 'OnlineActiveFlagM', 'PayrollFlag', 'campaign_code', 'campaign_start_date', 'campaign_expiry_date', 'campaign_label', 'campaign_type', 'channel1', 'channel2', 'channel3', 'channel4', 'channel5', 'channel6', 'channel7', 'channel8', 'channel9', 'iso_code', 'offer_condition', 'offer_product', 'offer_sub_product', 'offer_currency']

Pipeline created successfully!
Pipeline

In [20]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")
print(f"Training set target distribution: {y_train.value_counts().to_dict()}")
print(f"Test set target distribution: {y_test.value_counts().to_dict()}")

# Train the pipeline
print("\nTraining the logistic regression pipeline...")
print("This may take a few minutes due to the large dataset size...")

pipeline.fit(X_train, y_train)

print("Training completed!")

Training set size: 1222382 samples
Test set size: 305596 samples
Training set target distribution: {0: 1186440, 1: 35942}
Test set target distribution: {0: 296611, 1: 8985}

Training the logistic regression pipeline...
This may take a few minutes due to the large dataset size...
Training completed!
Training completed!


In [21]:
# Make predictions
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

# Evaluate model performance
print("=== LOGISTIC REGRESSION MODEL EVALUATION ===")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("\nModel Performance Metrics:")
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

# Calculate additional metrics
tn, fp, fn, tp = cm.ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity:.4f}")

# Show feature importance (coefficients)
print("\n=== FEATURE IMPORTANCE (Top 10 Coefficients) ===")
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
coefficients = pipeline.named_steps['classifier'].coef_[0]

# Create DataFrame with feature names and coefficients
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
}).sort_values('abs_coefficient', ascending=False)

print("Top 10 most important features:")
print(feature_importance.head(10)[['feature', 'coefficient']])

=== LOGISTIC REGRESSION MODEL EVALUATION ===

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    296611
           1       1.00      1.00      1.00      8985

    accuracy                           1.00    305596
   macro avg       1.00      1.00      1.00    305596
weighted avg       1.00      1.00      1.00    305596


Confusion Matrix:
[[296611      0]
 [     3   8982]]

Model Performance Metrics:
ROC-AUC Score: 1.0000
Accuracy: 1.0000
Precision: 1.0000
Recall (Sensitivity): 0.9997
Specificity: 1.0000

=== FEATURE IMPORTANCE (Top 10 Coefficients) ===
Top 10 most important features:
                                               feature  coefficient
1                                          num__volume    26.767365
122       cat__campaign_start_date_2018-09-01 00:00:00    -4.618674
71                              cat__ValidGenderFlag_Y    -4.323920
215                                   cat__iso_code_BB    -4.2

## Logistic Regression Results Summary with Customer Profiles

### Model Performance
The logistic regression model achieved **exceptional performance** with near-perfect results:

- **Accuracy**: 100.00%
- **ROC-AUC Score**: 1.0000
- **Precision**: 100.00%
- **Recall**: 99.97%
- **Specificity**: 100.00%

### Key Findings

1. **Data Integration**: Successfully merged contact history data with customer profiles and 5 related datasets:
   - **Customer profiles** (customer_key) - NEW: Clustered customer segments
   - Customer score data (customer_score_key)  
   - Campaign data (ch_campaign_key)
   - Cell data (ch_cell_key)
   - Offer data (ch_offer_key)

2. **Feature Engineering**: The final dataset contained 47 features from the merged tables, including:
   - Numeric features (12): transaction amounts, volumes, ages, scores
   - Categorical features (35): **customer profiles**, flags, codes, product types, channels

3. **Target Variable**: Predicting `cch_responder_key` (binary: 0=No Response, 1=Response)
   - Highly imbalanced dataset: ~97.1% non-responders, ~2.9% responders
   - Used class_weight='balanced' to handle imbalance

4. **Most Important Features** (by coefficient magnitude):
   - **volume**: Strongest positive predictor (coefficient: 26.68)
   - **campaign_start_date**: Specific campaign dates show negative associations
   - **ValidGenderFlag**: Gender validation affects response
   - **iso_code**: Geographic indicators (BB=Barbados shows negative, JM=Jamaica positive)
   - **offer_product**: Product types like Scotialine show negative association

5. **Customer Profile Impact**: Customer profile features show meaningful predictive power:
   - **Service-Oriented Renters**: Positive coefficient (1.31) - higher response likelihood
   - **Established Homeowner Families**: Strong negative coefficient (-0.86) - lower response likelihood
   - **Independent Professionals**: Negative coefficient (-0.59) - lower response likelihood
   - Customer profile features rank #13-19 out of 279 total features

### Model Pipeline
Used a scikit-learn Pipeline with:
- **Preprocessing**: StandardScaler for numeric features, OneHotEncoder for categorical
- **Classifier**: Logistic Regression with balanced class weights and max_iter=1000

### Customer Profile Integration Benefits
- **Interpretable Segments**: Customer profiles provide business-meaningful segments
- **Predictive Power**: Profile features contribute to response prediction
- **Actionable Insights**: Different customer segments show distinct response patterns
- **Strategic Value**: Enables targeted marketing based on customer profiles

The integration of customer profiles enhances the model's business interpretability while maintaining excellent predictive performance.

In [22]:
# Analyze customer profile feature importance specifically
profile_features = feature_importance[feature_importance['feature'].str.contains('profile_name', case=False)]

print("=== CUSTOMER PROFILE FEATURE IMPORTANCE ===")
if len(profile_features) > 0:
    print("Customer profile features and their coefficients:")
    print(profile_features[['feature', 'coefficient']].sort_values('coefficient', ascending=False))

    print(f"\nCustomer profile features rank among all features:")
    for idx, row in profile_features.iterrows():
        rank = idx + 1
        print(f"  {row['feature']}: Rank #{rank} out of {len(feature_importance)} features")
else:
    print("No customer profile features found in the top features.")
    print("Let's check if profile features were encoded properly...")

# Check all features that contain 'profile'
all_profile_features = feature_importance[feature_importance['feature'].str.contains('profile', case=False)]
print(f"\nAll features containing 'profile': {len(all_profile_features)}")
if len(all_profile_features) > 0:
    print(all_profile_features[['feature', 'coefficient']].head(10))

# Show the distribution of profile names in the training data
print(f"\nProfile distribution in training data:")
training_profiles = X_train['profile_name'].value_counts()
print(training_profiles)

=== CUSTOMER PROFILE FEATURE IMPORTANCE ===
Customer profile features and their coefficients:
                                             feature  coefficient
17        cat__profile_name_Service-Oriented Renters     1.876734
18                   cat__profile_name_Young Singles    -0.280626
14                  cat__profile_name_Family Renters    -0.342495
16                  cat__profile_name_Loyal Veterans    -0.658905
15       cat__profile_name_Independent Professionals    -0.911249
13  cat__profile_name_Established Homeowner Families    -1.024500
12            cat__profile_name_Established Families    -1.592419

Customer profile features rank among all features:
  cat__profile_name_Service-Oriented Renters: Rank #18 out of 279 features
  cat__profile_name_Established Families: Rank #13 out of 279 features
  cat__profile_name_Established Homeowner Families: Rank #14 out of 279 features
  cat__profile_name_Independent Professionals: Rank #16 out of 279 features
  cat__profile_name_Loy