# Synthetic Brand Generation with GANs

## Project Goal
Generate synthetic brand data to address class imbalance in hierarchical clustering using:
- **CTGAN**: For generating realistic brand features (ESG metrics, demographics, business characteristics)
- **DistilGPT2**: For generating realistic brand names

## Dataset
- **Source**: `data/raw/brand_information.csv`
- **Size**: 3,605 brands with 80+ features
- **Problem**: Hierarchical clustering produces only 2 clusters (severe imbalance)

## Notebook Structure
1. **Phase 1**: Data Preparation & Exploration
2. **Phase 2**: CTGAN Training (Tabular Features)
3. **Phase 3**: Brand Name Generation (DistilGPT2)
4. **Phase 4**: Synthetic Data Generation
5. **Phase 5**: Evaluation & Clustering Comparison

## Setup & Installation

In [None]:
!git clone https://github.com/dyegofern/csca5642-deep-learning.git
#!pip install -q sdv transformers torch pandas numpy scikit-learn matplotlib seaborn plotly scipy

# Import libraries
import sys
import os
from google.colab import drive

# Root of the repository checkout created by the `git clone` step in this cell.
MAPPED_DIR = '/content/csca5642-deep-learning'

# Mount Google Drive (skipped when the mount point already exists, e.g. on re-run)
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Raw brand dataset inside the cloned repository
DATA_PATH = os.path.join(MAPPED_DIR, 'data', 'raw', 'brand_information.csv')

# Set output and model directories to Google Drive
DRIVE_OUTPUT_BASE = '/content/drive/MyDrive/Colab_Output/SyntheticBrandGeneration'
OUTPUT_DIR = os.path.join(DRIVE_OUTPUT_BASE, 'outputs')
MODEL_DIR = os.path.join(DRIVE_OUTPUT_BASE, 'models')

# Later cells write CSV files, the evaluation report and model files into
# these folders; create them up front so a fresh Drive does not make those
# writes fail because the directory is missing.
for _required_dir in (OUTPUT_DIR, MODEL_DIR):
    os.makedirs(_required_dir, exist_ok=True)

# Add src to path (for local imports)
src_path = MAPPED_DIR + '/src'
if src_path not in sys.path:
    sys.path.append(src_path)

# Add data to path (for local imports)
data_path = MAPPED_DIR + '/data'
if data_path not in sys.path:
    sys.path.append(data_path)

In [None]:
# Import our custom modules
from data_processor import BrandDataProcessor
from tabular_gan import TabularBrandGAN
from brand_name_generator import BrandNameGenerator
from evaluator import BrandDataEvaluator

# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Default plot style and figure size for the notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print(" All modules loaded successfully!")

## Configuration

In [None]:
# ------------------------------------------------------------
# Notebook-wide settings
# ------------------------------------------------------------

# True: the training cells load a saved model when one exists at the paths
# below (and fall back to training otherwise). False: always train from scratch.
FROM_PRETRAINED = True

# Training hyper-parameters
CTGAN_EPOCHS, CTGAN_BATCH_SIZE = 300, 500
GPT2_EPOCHS, GPT2_BATCH_SIZE = 3, 8

# Generation settings
MIN_BRANDS_PER_COMPANY = 100   # minimum brands per company
DIVERSITY_TEMPERATURE = 0.7    # brand name temperature (0.6-0.8 recommended)
ADD_DIVERSITY_NOISE = True     # add 2% noise to tabular features

# Locations of the saved models inside MODEL_DIR
ctgan_model_path = os.path.join(MODEL_DIR, 'ctgan_brand_model.pkl')
gpt2_output_dir = os.path.join(MODEL_DIR, 'brand_name_generator')

# Echo the effective settings in a single write
print(
    f"Data path: {DATA_PATH}\n"
    f"Output directory: {OUTPUT_DIR}\n"
    f"Model directory: {MODEL_DIR}\n"
    " Configuration:\n"
    f"  Load from pretrained: {FROM_PRETRAINED}\n"
    f"  Min brands per company: {MIN_BRANDS_PER_COMPANY}\n"
    f"  Brand name temperature: {DIVERSITY_TEMPERATURE}\n"
    f"  Add diversity noise: {ADD_DIVERSITY_NOISE}"
)

---
# Phase 1: Data Preparation & Exploration

Load and explore the brand dataset, then prepare it for GAN training.

## 1.1 Load Data

In [None]:
# Initialize data processor with the path to the raw brand CSV (DATA_PATH)
processor = BrandDataProcessor(DATA_PATH)

# Load data; `df` is reused by later cells (e.g. per-company brand counts)
df = processor.load_data()
# Last expression of the cell, so the notebook renders this preview
df.head()

## 1.2 Exploratory Data Analysis

In [None]:
# Summary statistics from the processor; only 'brands_per_company' is used here
stats = processor.explore_data()

# Brands-per-company as a Series, plus its first 20 entries for the chart
# (the "Top 20" title assumes the entries arrive sorted by count - TODO confirm)
brand_counts = pd.Series(stats['brands_per_company'])
top_companies = brand_counts.iloc[:20]

plt.figure(figsize=(14, 6))
top_companies.plot.bar()
plt.title('Top 20 Companies by Number of Brands')
plt.xlabel('Company')
plt.ylabel('Number of Brands')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# How many companies fall into each portfolio-size bucket
print(
    f"\nCompanies with 1 brand: {brand_counts.eq(1).sum()}\n"
    f"Companies with 20+ brands: {brand_counts.ge(20).sum()}\n"
    f"Companies with 50+ brands: {brand_counts.ge(50).sum()}"
)

In [None]:
# Companies with at least 10 brands (used later as generation test subjects)
multi_brand_companies = processor.get_multi_brand_companies(min_brands=10)

print(
    f"\nIdentified {len(multi_brand_companies)} companies with 10+ brands\n"
    "\nExample companies:"
)
# Show the first ten together with the size of each brand portfolio
for company in multi_brand_companies[:10]:
    print(f"  {company}: {len(processor.get_company_brands(company))} brands")

## 1.3 Data Cleaning & Preprocessing

In [None]:
# Run the processor's cleaning step with text-heavy columns dropped
df_clean = processor.clean_data(drop_text_heavy=True)

# Per-column null counts; only columns that still contain nulls are reported
missing = df_clean.isna().sum()
print(f"\nRemaining missing values: {missing.loc[missing > 0]}")

In [None]:
# Split the prepared data into training and validation frames (25% held out)
train_df, val_df = processor.prepare_for_gan(test_size=0.25)

print(
    f"\nTraining set shape: {train_df.shape}\n"
    f"Validation set shape: {val_df.shape}\n"
    f"\nFeatures for GAN: {train_df.columns.tolist()}"
)

---
# Phase 2: CTGAN Training (Tabular Features)

Train CTGAN to generate realistic brand features conditioned on company name.

## 2.1 Initialize and Train CTGAN

In [None]:
# Build the CTGAN wrapper with the hyper-parameters from the Configuration cell
ctgan = TabularBrandGAN(epochs=CTGAN_EPOCHS, batch_size=CTGAN_BATCH_SIZE, verbose=True)

# Discrete columns: the processor's categorical features plus company_name,
# restricted to columns that are actually present in the training frame
discrete_cols = [
    col
    for col in [*processor.categorical_features, 'company_name']
    if col in train_df.columns
]

print(f"Discrete columns for CTGAN: {discrete_cols}")

In [None]:
# Train or load CTGAN model.
# A saved model is reused only when FROM_PRETRAINED is set AND the file exists;
# in every other case a new model is trained and saved to ctgan_model_path.
if FROM_PRETRAINED and os.path.exists(ctgan_model_path):
    print("Loading pre-trained CTGAN model...")
    ctgan.load_model(ctgan_model_path)
else:
    if FROM_PRETRAINED:
        print("Pre-trained model not found, training from scratch...")
    else:
        print("Training new CTGAN model...")

    # Create the target folder BEFORE the long training run, so a missing
    # directory cannot cause the save to fail after training has finished.
    os.makedirs(os.path.dirname(ctgan_model_path), exist_ok=True)

    # Train CTGAN (this may take 10-30 minutes depending on GPU)
    ctgan.train(train_df, discrete_columns=discrete_cols)

    # Save the trained model
    ctgan.save_model(ctgan_model_path)

## 2.2 Test CTGAN Generation

In [None]:
# Smoke test: sample 5 synthetic rows conditioned on one multi-brand company
test_company = multi_brand_companies[0]

# Encode the single company name; the one-element result is unpacked directly
(test_company_encoded,) = processor.label_encoders['company_name'].transform([test_company])

print(f"Testing generation for: {test_company} (encoded: {test_company_encoded})")

test_synthetic = ctgan.generate(
    n_samples=5, condition_column='company_name', condition_value=test_company_encoded
)

# Turn the encoded categoricals back into labels and preview the result
test_decoded = processor.decode_categorical(test_synthetic)
test_decoded.head()

---
# Phase 3: Brand Name Generation (DistilGPT2)

Fine-tune DistilGPT2 to generate realistic brand names.

## 3.1 Prepare Brand Name Training Data

In [None]:
# Keep only the three text columns used for name generation and drop any row
# that is missing one of them
brand_name_df = df_clean.loc[:, ['brand_name', 'company_name', 'industry_name']].dropna()

print(
    f"Brand name training data: {len(brand_name_df)} examples\n"
    "\nExample training data:"
)
brand_name_df.head(10)

## 3.2 Fine-tune DistilGPT2

In [None]:
# Initialize brand name generator on the 'distilgpt2' base model
name_generator = BrandNameGenerator(model_name='distilgpt2')

# Prepare model (runs before the train-or-load cell below;
# what it sets up is defined in brand_name_generator - not visible here)
name_generator.prepare_model()

In [None]:
# Reuse a saved name generator when allowed and present; otherwise fine-tune.
# os.path.exists is only consulted when FROM_PRETRAINED is set.
if not (FROM_PRETRAINED and os.path.exists(gpt2_output_dir)):
    print(
        "Pre-trained model not found, training from scratch..."
        if FROM_PRETRAINED
        else "Training new brand name generator..."
    )

    # Fine-tune on brand names (this may take 15-30 minutes)
    name_generator.fine_tune(
        brands_df=brand_name_df,
        epochs=GPT2_EPOCHS,
        batch_size=GPT2_BATCH_SIZE,
        output_dir=gpt2_output_dir,
    )
else:
    print("Loading pre-trained brand name generator...")
    name_generator.load_model(gpt2_output_dir)

## 3.3 Test Brand Name Generation

In [None]:
# Smoke test: (company, industry) pairs to generate sample names for
test_companies = [
    ('PepsiCo, Inc.', 'Non-Alcoholic Beverages'),
    ('Mars, Incorporated', 'Processed Foods'),
    ('Nestle', 'Processed Foods'),
]

for company, industry in test_companies:
    # Banner identifying the company/industry being sampled
    print(f"\n{'='*60}\nCompany: {company}\nIndustry: {industry}\n{'='*60}")

    generated_names = name_generator.generate_brand_names(
        company_name=company, industry_name=industry, n_names=10, temperature=0.8
    )

    # Numbered list of the generated candidates (1-based)
    print("\nGenerated brand names:")
    for i, name in enumerate(generated_names, start=1):
        print(f"  {i}. {name}")

---
# Phase 4: Synthetic Data Generation

Generate synthetic brands combining CTGAN features + DistilGPT2 names.

In [None]:
from tabular_gan import calculate_generation_targets

# Calculate how many brands to generate for each company.
# MIN_BRANDS_PER_COMPANY (Configuration cell) is passed as the per-company
# minimum; presumably the helper tops each company in train_df up to that
# count - confirm against tabular_gan.calculate_generation_targets.
generation_targets = calculate_generation_targets(
    data=train_df,
    company_column='company_name',
    min_brands_per_company=MIN_BRANDS_PER_COMPANY
)

# generation_targets is used as a mapping here: its values are summed
print(f"Total synthetic brands to generate: {sum(generation_targets.values())}")

In [None]:
# Number of brands each company has in the raw dataset
brand_counts = df['company_name'].value_counts()

# Identify companies with few brands (candidates for augmentation)
companies_needing_brands = brand_counts[brand_counts < MIN_BRANDS_PER_COMPANY].index.tolist()

# Cap on how many companies are augmented in the per-company generation cell.
# Named once so the message below and the slice can never disagree.
MAX_TARGET_COMPANIES = 150
target_companies = companies_needing_brands[:MAX_TARGET_COMPANIES]

# Fixed number of synthetic brands requested for each target company
brands_per_company = 10

print(f"Companies with < {MIN_BRANDS_PER_COMPANY} brands: {len(companies_needing_brands)}")
print(f"\nI'll generate synthetic brands for {len(target_companies)} companies")

In [None]:
# Generate synthetic features using efficient stratified generation
print(" Generating synthetic brand features...")

import time

# perf_counter is monotonic, so the measured duration cannot go negative
# if the system clock is adjusted while generation is running.
start_time = time.perf_counter()

synthetic_features = ctgan.generate_stratified(
    company_distribution=generation_targets,
    verbose=True
)

elapsed = time.perf_counter() - start_time
print(f"Generation completed in {elapsed/60:.2f} minutes")
# Guard the per-brand average: an empty result would otherwise divide by zero
if len(synthetic_features) > 0:
    print(f"  Average: {elapsed/len(synthetic_features):.3f}s per brand")
else:
    print("  Average: n/a (no brands were generated)")

# Add diversity noise if enabled
if ADD_DIVERSITY_NOISE:
    print("Adding diversity noise (2%)...")
    synthetic_features = ctgan.add_diversity_noise(
        synthetic_features,
        noise_level=0.02,
        numerical_cols=processor.numerical_features
    )

print(f"Generated {len(synthetic_features)} synthetic brand feature sets")
synthetic_features.head()

In [None]:
# Encode target companies with the fitted company_name encoder
target_companies_encoded = [
    processor.label_encoders['company_name'].transform([comp])[0]
    for comp in target_companies
]

# Generate synthetic features for each company.
# NOTE(review): this reassigns `synthetic_features`, replacing the output of
# the earlier `ctgan.generate_stratified` cell - confirm which generation
# path is the intended one.
synthetic_features = ctgan.generate_for_companies(
    companies=target_companies_encoded,
    n_per_company=brands_per_company
)

print(f"\nGenerated {len(synthetic_features)} synthetic brand feature sets")

# Decode categorical features first (so we have company/industry names for generation)
synthetic_decoded = processor.decode_categorical(synthetic_features.copy())

# Generate brand names with quality controls.
# The message is a single-line literal with explicit "\n" escapes: a plain
# quoted string cannot span physical lines in Python.
print("\nGenerating brand names with quality controls...\n")

synthetic_with_names = name_generator.generate_for_dataframe(
    synthetic_df=synthetic_decoded,
    n_names_per_brand=3,  # Generate 3 candidates, pick best
    temperature=DIVERSITY_TEMPERATURE,
    verbose=True
)

print("\nSample synthetic brands with CLEAN names:")
print(synthetic_with_names[['brand_name', 'company_name', 'industry_name']].head(20))

In [None]:
# Decode categorical features first (so we have company/industry names for generation)
synthetic_decoded = processor.decode_categorical(synthetic_features.copy())

# Generate brand names (3 candidates requested per synthetic brand).
# The sampling temperature comes from DIVERSITY_TEMPERATURE so this cell
# honours the value set - and printed - in the Configuration cell instead of
# silently using a different hard-coded one.
print("\nGenerating brand names for synthetic data...")
synthetic_with_names = name_generator.generate_for_dataframe(
    synthetic_df=synthetic_decoded,
    n_names_per_brand=3,
    temperature=DIVERSITY_TEMPERATURE
)

print("\nSample synthetic brands:")
synthetic_with_names[['brand_name', 'company_name', 'industry_name']].head(20)

## 4.4 Save Synthetic Data

In [None]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing anything into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save synthetic brands
synthetic_path = os.path.join(OUTPUT_DIR, 'synthetic_brands.csv')
synthetic_with_names.to_csv(synthetic_path, index=False)
print(f"Synthetic brands saved to: {synthetic_path}")

# Create augmented dataset (original + synthetic); the index is rebuilt so
# rows from the two frames do not share index labels.
augmented_df = pd.concat([df_clean, synthetic_with_names], ignore_index=True)
augmented_path = os.path.join(OUTPUT_DIR, 'augmented_brands.csv')
augmented_df.to_csv(augmented_path, index=False)

print(f"\nAugmented dataset saved to: {augmented_path}")
print(f"Original brands: {len(df_clean)}")
print(f"Synthetic brands: {len(synthetic_with_names)}")
print(f"Total augmented: {len(augmented_df)}")

---
# Phase 5: Evaluation & Clustering Comparison

Evaluate synthetic data quality and compare clustering results.

## 5.1 Statistical Validation

In [None]:
# Initialize evaluator
evaluator = BrandDataEvaluator()

# Compare distributions on the first 10 numerical features only;
# `numerical_cols` is reused by the evaluation cells that follow.
numerical_cols = processor.numerical_features[:10]  # Sample of numerical features

# Real (cleaned) data vs. synthetic data on the selected columns.
# NOTE(review): the name `ks_results` suggests per-feature KS tests - confirm in evaluator.
ks_results = evaluator.compare_distributions(
    real_data=df_clean,
    synthetic_data=synthetic_with_names,
    numerical_cols=numerical_cols
)

In [None]:
# Compare correlations between real and synthetic data on `numerical_cols`.
# The two returned objects are passed to plot_correlation_heatmaps in a later cell.
real_corr, synth_corr = evaluator.compare_correlations(
    real_data=df_clean,
    synthetic_data=synthetic_with_names,
    numerical_cols=numerical_cols
)

In [None]:
# Visualize distribution comparisons, limited to the first 6 of the
# selected numerical columns
evaluator.plot_distribution_comparison(
    real_data=df_clean,
    synthetic_data=synthetic_with_names,
    features=numerical_cols[:6]
)

In [None]:
# Visualize correlation heatmaps from the real/synthetic correlation results
# returned by evaluator.compare_correlations
evaluator.plot_correlation_heatmaps(real_corr, synth_corr)

## 5.2 PCA Visualization

In [None]:
# PCA comparison of the cleaned original data and the synthetic data,
# using the same `numerical_cols` subset as the statistical checks
evaluator.plot_pca_comparison(
    original_data=df_clean,
    synthetic_data=synthetic_with_names,
    numerical_cols=numerical_cols
)

## 5.3 Clustering Comparison (Main Goal)

In [None]:
# Compare clustering: Original vs Augmented (original + synthetic rows).
# Later cells read the 'original' and 'augmented' entries (with
# 'cluster_distribution', 'n_clusters', 'silhouette_score') and
# 'silhouette_improvement' from the returned mapping.
clustering_comparison = evaluator.compare_clustering(
    original_data=df_clean,
    augmented_data=augmented_df,
    numerical_cols=numerical_cols
)

In [None]:
# Side-by-side bar charts of cluster sizes: original (left) vs augmented (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: clusters found in the original data
orig_dist = clustering_comparison['original']['cluster_distribution']
ax1.bar(orig_dist.keys(), orig_dist.values())
ax1.set(
    title='Original Data: Cluster Sizes',
    xlabel='Cluster',
    ylabel='Number of Brands',
)

# Right panel: clusters found in the augmented data
aug_dist = clustering_comparison['augmented']['cluster_distribution']
ax2.bar(aug_dist.keys(), aug_dist.values())
ax2.set(
    title='Augmented Data: Cluster Sizes',
    xlabel='Cluster',
    ylabel='Number of Brands',
)

plt.tight_layout()
plt.show()

## 5.4 Generate Final Report

In [None]:
# Generate and display report
report = evaluator.generate_report()
print(report)

# Save report.
# The folder is created if missing, and the file is written as UTF-8 so the
# result does not depend on the platform's default encoding (the report text
# may contain non-ASCII characters).
os.makedirs(OUTPUT_DIR, exist_ok=True)
report_path = os.path.join(OUTPUT_DIR, 'evaluation_report.txt')
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)
print(f"\nReport saved to: {report_path}")

---
# Summary & Conclusions

In [None]:
# Header banner
print("=" * 70, "SYNTHETIC BRAND GENERATION: FINAL SUMMARY", "=" * 70, sep="\n")

# Row counts of the original, synthetic and combined datasets
print(
    "\nDataset Statistics:",
    f"  Original brands: {len(df_clean)}",
    f"  Synthetic brands generated: {len(synthetic_with_names)}",
    f"  Augmented dataset total: {len(augmented_df)}",
    sep="\n",
)
# Synthetic rows as a percentage of the original rows
print(f"  Augmentation ratio: {len(synthetic_with_names)/len(df_clean)*100:.1f}%")

# Clustering metrics before and after augmentation
print("\nClustering Results:")
orig = clustering_comparison['original']
aug = clustering_comparison['augmented']
print(
    f"  Original - Clusters: {orig['n_clusters']}, Silhouette: {orig['silhouette_score']:.4f}",
    f"  Augmented - Clusters: {aug['n_clusters']}, Silhouette: {aug['silhouette_score']:.4f}",
    f"  Improvement: {clustering_comparison['silhouette_improvement']:+.4f}",
    sep="\n",
)

# Where each artifact produced by the notebook was written
print(
    "\nOutput Files:",
    f"  Synthetic brands: {synthetic_path}",
    f"  Augmented dataset: {augmented_path}",
    f"  CTGAN model: {ctgan_model_path}",
    f"  Name generator: {gpt2_output_dir}",
    f"  Evaluation report: {report_path}",
    sep="\n",
)

print("\nPipeline completed successfully!", "=" * 70, sep="\n")

---
## Optional: Load Pre-trained Models

If you want to skip training and load previously saved models: