# Synthetic Brand Generation V2 - Enhanced with Ensemble Methods

### University of Colorado Boulder - Introduction to Deep Learning
---
#### Dyego Fernandes de Sousa
---

### Improvements over V1

This notebook implements the following enhancements:

1. **TVAE (Tabular Variational Autoencoder)**: Alternative to CTGAN for better continuous distributions
2. **Gaussian Copula**: For better correlation structure preservation
3. **Larger Language Models**: GPT-2 Medium and Flan-T5 by default, with Phi-2 and TinyLlama as optional additions when memory allows, for improved brand name generation
4. **Ensemble Methods**: Voting/averaging across multiple generators

### Notebook Structure
1. **Phase 1**: Setup & Data Preparation
2. **Phase 2**: Tabular Ensemble Training (CTGAN + TVAE + Gaussian Copula)
3. **Phase 3**: LLM Ensemble Training (GPT-2 Medium + Flan-T5)
4. **Phase 4**: Synthetic Data Generation with Ensembles
5. **Phase 5**: Quality Evaluation & Comparison (V1 vs V2)

## Phase 1: Setup & Installation

### Optimized for Google Colab Pro (~15GB RAM, ~16GB VRAM)

In [None]:
# Clone repository and install dependencies
!git clone https://github.com/dyegofern/csca5642-deep-learning.git
!pip install -q sdv transformers torch pandas numpy scikit-learn matplotlib seaborn plotly scipy
!pip install -q peft bitsandbytes accelerate sentencepiece  # Additional V2 dependencies

import sys
import os
from google.colab import drive

MAPPED_DIR = '/content/csca5642-deep-learning'

# Mount Google Drive.
# Probe the MyDrive folder rather than the bare mount point: a leftover
# /content/drive directory does not prove that Drive is attached, and skipping
# the mount in that case would make the makedirs() calls below create the
# output folders on the VM's local disk instead of in Drive.
DRIVE_ROOT = '/content/drive/MyDrive'
if os.path.isdir(DRIVE_ROOT):
    print("Google Drive already mounted")
else:
    print("Mounting Google Drive...")
    drive.mount('/content/drive')

# Raw CSV inside the repository checkout at MAPPED_DIR
DATA_PATH = MAPPED_DIR + '/data/raw/brand_information.csv'

# Set output and model directories to Google Drive
DRIVE_OUTPUT_BASE = os.path.join(DRIVE_ROOT, 'Colab_Output', 'SyntheticBrandGeneration_V2')
OUTPUT_DIR = os.path.join(DRIVE_OUTPUT_BASE, 'outputs')
MODEL_DIR = os.path.join(DRIVE_OUTPUT_BASE, 'models')

# Create directories (exist_ok=True keeps this cell safe to re-run)
print("\nCreating directories in Google Drive...")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")
print(f"Model directory: {MODEL_DIR}")

# Add src to path so the modules under <repo>/src can be imported by name;
# the membership test avoids duplicate entries when the cell is re-run.
src_path = MAPPED_DIR + '/src'
if src_path not in sys.path:
    sys.path.append(src_path)

print("\nSetup complete!")

In [None]:
# Import V2 modules
from data_processor import BrandDataProcessor
from tabular_gan_v2 import (
    EnsembleSynthesizer,
    CTGANSynthesizerWrapper,
    TVAESynthesizerWrapper,
    GaussianCopulaSynthesizerWrapper,
    calculate_generation_targets
)
from brand_name_generator_v2 import BrandNameGeneratorV2
from evaluator import BrandDataEvaluator

# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import gc

# Plot defaults: seaborn grid theme first, then the default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Report the accelerator; device details are only queried when CUDA is present
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {total_bytes / 1e9:.1f} GB")

print("\nAll V2 modules loaded successfully!")

## Configuration

In [None]:
# Configuration for V2
# When True, the training cells load saved models instead of training.
FROM_PRETRAINED = False

# --- Tabular ensemble ---
CTGAN_EPOCHS = 300
TVAE_EPOCHS = 300
BATCH_SIZE = 500
ENSEMBLE_WEIGHTS = dict(ctgan=0.40, tvae=0.35, gaussian_copula=0.25)

# --- LLM ensemble ---
# 'phi-2' and 'tinyllama' can be appended if memory allows.
LLM_MODELS = ['gpt2-medium', 'flan-t5-base']
LLM_EPOCHS = 3

# --- Generation ---
MIN_BRANDS_PER_COMPANY = 10
DIVERSITY_TEMPERATURE = 0.7
ADD_DIVERSITY_NOISE = True

# Echo the key settings so the run log records what was used
print("Configuration:")
print("\n".join(
    f"  {label}: {setting}"
    for label, setting in (
        ("FROM_PRETRAINED", FROM_PRETRAINED),
        ("CTGAN_EPOCHS", CTGAN_EPOCHS),
        ("TVAE_EPOCHS", TVAE_EPOCHS),
        ("LLM_MODELS", LLM_MODELS),
        ("ENSEMBLE_WEIGHTS", ENSEMBLE_WEIGHTS),
    )
))

## Phase 1: Data Preparation

In [None]:
# Load and process data
# Build the processor around the CSV at DATA_PATH, then load it. The loaded
# table is kept in raw_data and its row and column counts are reported.
processor = BrandDataProcessor(DATA_PATH)
raw_data = processor.load_data()
print(f"Loaded {len(raw_data)} brands with {len(raw_data.columns)} features")

In [None]:
# Clean data
# clean_data() takes no arguments here, so it operates on state the processor
# already holds; presumably the table loaded by load_data() - confirm in
# data_processor. The result is stored in cleaned_data and its size reported.
cleaned_data = processor.clean_data()
print(f"\nCleaned data: {len(cleaned_data)} rows, {len(cleaned_data.columns)} columns")

In [None]:
# Prepare for GAN training
# prepare_for_gan() hands back a (train, validation) pair; test_size=0.2 is
# passed straight through to it.
train_df, val_df = processor.prepare_for_gan(test_size=0.2)

print(f"\nTraining set: {len(train_df)} brands")
print(f"Validation set: {len(val_df)} brands")

# Get column types
# Categorical columns come from the processor. A column counts as binary when
# it has exactly two distinct values and every value it holds is 0 or 1.
# Everything that is neither categorical nor binary is treated as numerical.
discrete_cols = processor.categorical_features
binary_cols = [
    name
    for name in train_df.columns
    if train_df[name].nunique() == 2 and set(train_df[name].unique()) <= {0, 1}
]
numerical_cols = [
    name
    for name in train_df.columns
    if not (name in discrete_cols or name in binary_cols)
]

print("\nColumn types:")
print(f"  Numerical: {len(numerical_cols)}")
print(f"  Categorical: {len(discrete_cols)}")
print(f"  Binary: {len(binary_cols)}")

## Phase 2: Tabular Ensemble Training

Training CTGAN, TVAE, and Gaussian Copula models

In [None]:
# Initialize Ensemble Synthesizer
# Constructor arguments are collected in one mapping so the settings read as
# a table. CTGAN and TVAE share the same BATCH_SIZE.
ensemble_settings = {
    'ctgan_epochs': CTGAN_EPOCHS,
    'ctgan_batch_size': BATCH_SIZE,
    'tvae_epochs': TVAE_EPOCHS,
    'tvae_batch_size': BATCH_SIZE,
    'gc_default_distribution': 'beta',
    'weights': ENSEMBLE_WEIGHTS,
    'verbose': True,
    'cuda': True,
}
tabular_ensemble = EnsembleSynthesizer(**ensemble_settings)

# Summarise what was configured
print("\n".join([
    "Tabular Ensemble initialized with:",
    f"  - CTGAN (epochs={CTGAN_EPOCHS})",
    f"  - TVAE (epochs={TVAE_EPOCHS})",
    "  - Gaussian Copula (distribution=beta)",
]))

In [None]:
# Train the tabular ensemble, or restore it when FROM_PRETRAINED is set.
# Both branches use the same directory under MODEL_DIR, so it is built once.
tabular_model_dir = os.path.join(MODEL_DIR, 'tabular_ensemble')

if FROM_PRETRAINED:
    # Load pre-trained models
    print("Loading pre-trained tabular ensemble...")
    tabular_ensemble.load_models(tabular_model_dir)
else:
    # Train all models
    print("Training tabular ensemble (this will take ~30-60 minutes)...")
    training_times = tabular_ensemble.train(
        data=train_df,
        discrete_columns=discrete_cols,
        binary_columns=binary_cols
    )

    # Save models
    tabular_ensemble.save_models(tabular_model_dir)

    print("\nTraining times:")
    # The loop names deliberately avoid `time`: variables assigned in a cell
    # are notebook globals, and a float called `time` would hide the
    # standard-library module of that name for every later cell.
    for model_name, elapsed_seconds in training_times.items():
        print(f"  {model_name}: {elapsed_seconds:.1f} seconds")

In [None]:
# Compare individual model quality
# Asks the ensemble for a comparison against train_df with n_samples=1000 and
# renders the returned object with display().
# NOTE(review): the metrics inside comparison_df are defined in tabular_gan_v2
# and are not visible from this notebook - confirm there before interpreting.
print("Evaluating individual model quality...")
comparison_df = tabular_ensemble.compare_all_models(train_df, n_samples=1000)
print("\nModel Comparison:")
display(comparison_df)

In [None]:
# Optionally optimize weights based on quality
# optimized_weights = tabular_ensemble.optimize_weights(train_df, n_eval_samples=1000)
# print(f"Optimized weights: {optimized_weights}")

## Phase 3: LLM Ensemble Training

Fine-tuning GPT-2 Medium and Flan-T5 for brand name generation

In [None]:
# Prepare brand name training data
# Keep only the three text fields from the processor's raw table and drop any
# row that is missing one of them.
name_fields = ['brand_name', 'company_name', 'industry_name']
brands_df = processor.raw_data[name_fields].dropna()
print(f"Brand name training data: {len(brands_df)} examples")
# Last expression of the cell, so the notebook renders the preview
brands_df.head()

In [None]:
# Initialize LLM Ensemble Generator
# The model list comes from LLM_MODELS; memory_efficient and verbose are both
# switched on.
# NOTE(review): what memory_efficient actually changes is decided inside
# brand_name_generator_v2 - confirm there.
llm_generator = BrandNameGeneratorV2(
    models=LLM_MODELS,
    memory_efficient=True,
    verbose=True
)

print(f"LLM Ensemble initialized with models: {LLM_MODELS}")

In [None]:
# Fine-tune the LLM ensemble, or restore it when FROM_PRETRAINED is set.
# Loading, fine-tuning output and saving all use the same directory.
llm_model_dir = os.path.join(MODEL_DIR, 'llm_ensemble')

if not FROM_PRETRAINED:
    # Fine-tune all models
    print(f"Fine-tuning LLM ensemble (epochs={LLM_EPOCHS})...")
    print("This will train each model sequentially to save memory.")

    llm_generator.fine_tune(
        brands_df=brands_df,
        epochs=LLM_EPOCHS,
        output_dir=llm_model_dir,
    )

    # Save ensemble config
    llm_generator.save_model(llm_model_dir)
else:
    # Load pre-trained models
    print("Loading pre-trained LLM ensemble...")
    llm_generator.load_model(llm_model_dir)

In [None]:
# Test LLM generation
# Smoke test: request three names for a few hand-picked (company, industry)
# pairs and print what comes back.
print("Testing LLM ensemble generation...")
llm_generator.prepare_model()

test_companies = [
    ("Apple", "Technology"),
    ("Nike", "Apparel"),
    ("Nestle", "Food & Beverage"),
]

for firm, sector in test_companies:
    suggestions = llm_generator.generate_brand_names(firm, sector, n_names=3)
    print(f"\n{firm} ({sector}): {suggestions}")

## Phase 4: Synthetic Data Generation

In [None]:
# Calculate generation targets
# Targets are derived from train_df, using 'company_name' as the company
# column and MIN_BRANDS_PER_COMPANY as the minimum-per-company setting.
# NOTE(review): the exact rule and the shape of the returned object are
# defined in tabular_gan_v2 - confirm there.
generation_targets = calculate_generation_targets(
    data=train_df,
    company_column='company_name',
    min_brands_per_company=MIN_BRANDS_PER_COMPANY
)

In [None]:
# Generate synthetic tabular features using ensemble
# generate_stratified() returns a pair: the synthetic feature table and a
# collection of companies for which generation failed. The failure count is
# printed only when that collection is non-empty.
print("Generating synthetic features with ensemble...")
synthetic_features, failed_companies = tabular_ensemble.generate_stratified(
    company_distribution=generation_targets,
    verbose=True
)

print(f"\nGenerated {len(synthetic_features)} synthetic brand features")
if failed_companies:
    print(f"Failed companies: {len(failed_companies)}")

In [None]:
# Add diversity noise if enabled
# Runs only when ADD_DIVERSITY_NOISE is truthy; synthetic_features is then
# rebound to whatever add_diversity_noise() returns.
# NOTE(review): whether noise_level=0.02 is absolute or relative to each
# column's scale is decided inside tabular_gan_v2 - confirm there.
if ADD_DIVERSITY_NOISE:
    print("Adding diversity noise to numerical features...")
    synthetic_features = tabular_ensemble.add_diversity_noise(
        synthetic_features,
        noise_level=0.02
    )

In [None]:
# Decode categorical features back to original values
# The decoded result is stored under a new name (synthetic_decoded); the name
# synthetic_features is not rebound in this cell.
print("Decoding categorical features...")
synthetic_decoded = processor.decode_categorical(synthetic_features)
print(f"Decoded {len(synthetic_decoded)} synthetic brands")

In [None]:
# Generate brand names using LLM ensemble
# The uniqueness tracker is reset first, then names are generated for
# synthetic_decoded with temperature=DIVERSITY_TEMPERATURE.
# NOTE(review): presumably the reset discards names recorded by earlier
# generate_brand_names() calls - confirm in brand_name_generator_v2.
print("\nGenerating brand names with LLM ensemble...")
llm_generator.reset_uniqueness_tracker()

synthetic_with_names = llm_generator.generate_for_dataframe(
    synthetic_df=synthetic_decoded,
    temperature=DIVERSITY_TEMPERATURE,
    verbose=True
)

print(f"\nFinal synthetic dataset: {len(synthetic_with_names)} brands")

In [None]:
# Preview synthetic data
# Show the first 20 rows, restricted to the three identifying columns.
print("\nSample of generated synthetic brands:")
preview_columns = ['company_name', 'industry_name', 'brand_name']
brand_preview = synthetic_with_names[preview_columns].head(20)
display(brand_preview)

In [None]:
# Save synthetic data
def _write_csv(frame, filename):
    """Write *frame* to OUTPUT_DIR/filename without the index; return the path."""
    target = os.path.join(OUTPUT_DIR, filename)
    frame.to_csv(target, index=False)
    return target


synthetic_path = _write_csv(synthetic_with_names, 'synthetic_brands_v2.csv')
print(f"Synthetic data saved to {synthetic_path}")

# Create augmented dataset
# The training rows are decoded the same way the synthetic rows were, then
# both are stacked under a fresh 0..n-1 index.
original_decoded = processor.decode_categorical(train_df)
augmented_df = pd.concat([original_decoded, synthetic_with_names], ignore_index=True)

augmented_path = _write_csv(augmented_df, 'augmented_brands_v2.csv')
print(f"Augmented data saved to {augmented_path}")
print(f"Total augmented size: {len(augmented_df)} brands")

## Phase 5: Quality Evaluation & Comparison

In [None]:
# Initialize evaluator
evaluator = BrandDataEvaluator()

# Get numerical columns for evaluation
# Only columns present in BOTH frames can be compared; the order of
# numerical_cols is preserved.
shared_columns = set(synthetic_features.columns) & set(train_df.columns)
eval_numerical_cols = [name for name in numerical_cols if name in shared_columns]

In [None]:
# Distribution comparison
print("\n=== Distribution Comparison (KS Test) ===")
ks_results = evaluator.compare_distributions(train_df, synthetic_features, eval_numerical_cols)

# Count passes
# A feature passes when its recorded p-value is strictly above 0.05.
passing_features = [
    feature for feature, outcome in ks_results.items() if outcome['pvalue'] > 0.05
]
passes = len(passing_features)
print(f"\nKS Test Summary: {passes}/{len(ks_results)} features pass (p > 0.05)")

In [None]:
# Correlation comparison
# compare_correlations() returns two objects, unpacked here as real_corr and
# synth_corr. Both are computed over eval_numerical_cols.
# NOTE(review): presumably these are correlation matrices for the real and
# synthetic frames respectively - confirm in evaluator.
print("\n=== Correlation Comparison ===")
real_corr, synth_corr = evaluator.compare_correlations(train_df, synthetic_features, eval_numerical_cols)

In [None]:
# Visualize distributions
# Only the first six evaluation columns are plotted. The figure is written to
# OUTPUT_DIR before being shown.
print("\n=== Distribution Visualization ===")
distribution_png = os.path.join(OUTPUT_DIR, 'distribution_comparison_v2.png')
columns_to_plot = eval_numerical_cols[:6]
evaluator.plot_distribution_comparison(train_df, synthetic_features, columns_to_plot)
plt.tight_layout()
plt.savefig(distribution_png, dpi=150)
plt.show()

In [None]:
# Correlation heatmaps
# The figure is written to OUTPUT_DIR before being shown.
print("\n=== Correlation Heatmaps ===")
correlation_png = os.path.join(OUTPUT_DIR, 'correlation_comparison_v2.png')
evaluator.plot_correlation_heatmaps(train_df, synthetic_features, eval_numerical_cols)
plt.tight_layout()
plt.savefig(correlation_png, dpi=150)
plt.show()

In [None]:
# PCA visualization
# The figure is written to OUTPUT_DIR before being shown.
print("\n=== PCA Visualization ===")
pca_png = os.path.join(OUTPUT_DIR, 'pca_comparison_v2.png')
evaluator.plot_pca_comparison(train_df, synthetic_features, eval_numerical_cols)
plt.tight_layout()
plt.savefig(pca_png, dpi=150)
plt.show()

In [None]:
# Clustering evaluation
# The same evaluation runs twice over the same columns: once on the training
# frame and once on the augmented frame. Each result's 'silhouette' entry is
# reported.
print("\n=== Clustering Evaluation ===")
cluster_metrics = evaluator.evaluate_clustering(train_df, eval_numerical_cols)
original_silhouette = cluster_metrics['silhouette']
print(f"Original data clustering - Silhouette: {original_silhouette:.3f}")

augmented_cluster_metrics = evaluator.evaluate_clustering(augmented_df, eval_numerical_cols)
augmented_silhouette = augmented_cluster_metrics['silhouette']
print(f"Augmented data clustering - Silhouette: {augmented_silhouette:.3f}")

## Summary & Comparison

In [None]:
# Print final summary
# Reads results produced by earlier cells: train_df, synthetic_with_names,
# augmented_df, passes, ks_results and augmented_cluster_metrics.
banner = "=" * 60

# Guard the percentage: if no features were evaluated, len(ks_results) is 0
# and the division would raise ZeroDivisionError.
features_tested = len(ks_results)
if features_tested:
    ks_pass_rate = f"{passes}/{features_tested} ({100*passes/features_tested:.1f}%)"
else:
    ks_pass_rate = f"{passes}/0 (n/a - no features were evaluated)"

print(banner)
print("SYNTHETIC BRAND GENERATION V2 - SUMMARY")
print(banner)
print(f"\nOriginal dataset: {len(train_df)} brands")
print(f"Synthetic generated: {len(synthetic_with_names)} brands")
print(f"Augmented total: {len(augmented_df)} brands")
print("\nModels used:")
print("  Tabular: CTGAN + TVAE + Gaussian Copula (ensemble)")
print(f"  Text: {', '.join(LLM_MODELS)} (ensemble)")
print("\nQuality metrics:")
print(f"  KS Test pass rate: {ks_pass_rate}")
print(f"  Clustering silhouette (augmented): {augmented_cluster_metrics['silhouette']:.3f}")
print(banner)

## Clean Up

In [None]:
# Clear GPU memory
# Run Python's garbage collector first, then release PyTorch's cached CUDA
# memory. The success message is printed only when a CUDA device exists, so
# a CPU-only run does not claim to have cleared anything.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU memory cleared")
else:
    print("No CUDA device available; ran garbage collection only")