# Investor-Deal Data Generation
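This notebook generates synthetic investor features, deal features, and investor-deal interactions. Interactions are sampled with preference patterns (region match, investment-size fit, and risk/stage alignment) so the data resembles realistic investor behavior.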

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import os

# Set seed for reproducibility.
# This seeds NumPy's global random state, which every np.random.* call in this
# notebook draws from; results therefore also depend on the order in which the
# cells are executed after seeding.
np.random.seed(42)

## Define Parameters

In [None]:
# Dataset dimensions: how many investors, deals and interactions to generate.
n_investors, n_deals, n_interactions = 50, 1000, 500

# Feature categories.
# The generators store each category as its integer position in these lists,
# so the order of the entries is significant (e.g. risk code 0 is
# 'Conservative', and stage codes >= 3 are 'Growth' and 'Late').
investor_types = [
    'Equity',
    'Debt',
    'Infrastructure',
]
regions = [
    'North America',
    'Europe',
    'Asia',
    'Other',
]
sectors = [
    'Technology',
    'Healthcare',
    'Finance',
    'Energy',
    'Consumer',
]
stages = [
    'Seed',
    'Series A',
    'Series B',
    'Growth',
    'Late',
]
risk_profiles = [
    'Conservative',
    'Moderate',
    'Aggressive',
]

## Helper Functions

In [None]:
def generate_investor_features(n_investors):
    """Build a table of randomly generated investor profiles.

    The categorical attributes ``type``, ``region`` and ``risk`` are stored as
    integer positions into the module-level ``investor_types``, ``regions``
    and ``risk_profiles`` lists. The remaining attributes are drawn from fixed
    numeric ranges using NumPy's global random state.

    Args:
        n_investors: Number of investor rows to create.

    Returns:
        A DataFrame with one row per investor; ``investorId`` runs from 0 to
        ``n_investors - 1``.
    """
    def _random_investor(investor_id):
        # The dict entries are evaluated top to bottom, so the sequence of
        # random draws per investor is fixed for a given seed.
        return {
            'investorId': investor_id,
            'type': np.random.choice(range(len(investor_types))),
            'region': np.random.choice(range(len(regions))),
            'risk': np.random.choice(range(len(risk_profiles))),
            'min_investment': np.random.uniform(0.5, 5.0),
            'max_investment': np.random.uniform(5.0, 50.0),
            'experience_years': np.random.uniform(1, 20),
            'portfolio_size': np.random.randint(5, 100)
        }

    records = [_random_investor(investor_id) for investor_id in range(n_investors)]
    return pd.DataFrame(records)

In [None]:
def generate_deal_features(n_deals):
    """Build a table of randomly generated deal profiles.

    The categorical attributes ``sector``, ``stage`` and ``region`` are stored
    as integer positions into the module-level ``sectors``, ``stages`` and
    ``regions`` lists. The remaining attributes are drawn uniformly from fixed
    numeric ranges using NumPy's global random state.

    Args:
        n_deals: Number of deal rows to create.

    Returns:
        A DataFrame with one row per deal; ``dealId`` runs from 0 to
        ``n_deals - 1``.
    """
    def _random_deal(deal_id):
        # The dict entries are evaluated top to bottom, so the sequence of
        # random draws per deal is fixed for a given seed.
        return {
            'dealId': deal_id,
            'sector': np.random.choice(range(len(sectors))),
            'stage': np.random.choice(range(len(stages))),
            'region': np.random.choice(range(len(regions))),
            'deal_size': np.random.uniform(1, 100),
            'revenue_multiple': np.random.uniform(0.5, 10),
            'growth_rate': np.random.uniform(-0.2, 1.5),
            'profitability': np.random.uniform(-0.5, 0.3),
            'team_experience': np.random.uniform(0, 30),
            'market_size': np.random.uniform(0.1, 100)
        }

    records = [_random_deal(deal_id) for deal_id in range(n_deals)]
    return pd.DataFrame(records)

In [None]:
def generate_interactions_with_preferences(investor_df, deal_df, n_interactions, n_candidates=50):
    """Generate investor-deal interactions biased by investor preferences.

    For every interaction a random investor is drawn, a random subset of deals
    is taken as candidates, and one candidate is picked with probability
    proportional to a weight. Each weight starts at 1 and is multiplied by:

    * 2.0 when the deal's ``region`` equals the investor's ``region``;
    * 1.5 when ``deal_size`` lies within the investor's
      ``[min_investment, max_investment]`` range;
    * 1.8 when the deal ``stage`` suits the investor's ``risk`` code
      (risk 0 prefers stage >= 3, risk 2 prefers stage <= 1).

    Args:
        investor_df: DataFrame with columns ``investorId``, ``region``,
            ``risk``, ``min_investment`` and ``max_investment``.
        deal_df: DataFrame with columns ``dealId``, ``region``, ``stage`` and
            ``deal_size``.
        n_interactions: Number of interaction rows to generate.
        n_candidates: Maximum number of deals sampled as candidates for each
            interaction. Capped at ``len(deal_df)`` so small deal tables work.

    Returns:
        A DataFrame with columns ``investorId`` (int), ``dealId`` (int) and
        ``timestamp`` (a day in 2021). The columns are present even when
        ``n_interactions`` is 0.

    Raises:
        ValueError: If ``n_candidates`` is less than 1, or if interactions are
            requested while ``investor_df`` or ``deal_df`` is empty.
    """
    columns = ['investorId', 'dealId', 'timestamp']

    if n_candidates < 1:
        raise ValueError("n_candidates must be at least 1")
    if n_interactions > 0 and (len(investor_df) == 0 or len(deal_df) == 0):
        raise ValueError("investor_df and deal_df must be non-empty to generate interactions")

    # Sampling without replacement cannot draw more deals than exist.
    n_sampled = min(n_candidates, len(deal_df))
    start_date = pd.Timestamp('2021-01-01')
    interactions = []

    for _ in tqdm(range(n_interactions), desc="Generating interactions"):
        investor = investor_df.iloc[np.random.choice(len(investor_df))]

        # Sample subset of deals for efficiency
        candidate_rows = np.random.choice(len(deal_df), size=n_sampled, replace=False)
        candidates = deal_df.iloc[candidate_rows]

        # Plain NumPy arrays give positional boolean masks, independent of the
        # DataFrame index labels.
        deal_region = candidates['region'].to_numpy()
        deal_size = candidates['deal_size'].to_numpy()
        deal_stage = candidates['stage'].to_numpy()

        weights = np.ones(n_sampled)

        # Regional preference
        weights[deal_region == investor['region']] *= 2.0

        # Investment size preference
        size_match = (deal_size >= investor['min_investment']) & \
                     (deal_size <= investor['max_investment'])
        weights[size_match] *= 1.5

        # Risk alignment
        if investor['risk'] == 0:  # Conservative
            weights[deal_stage >= 3] *= 1.8  # Prefer later stages
        elif investor['risk'] == 2:  # Aggressive
            weights[deal_stage <= 1] *= 1.8  # Prefer early stages

        # Normalize weights into a probability distribution
        weights = weights / weights.sum()

        # Sample deal based on weights
        chosen = np.random.choice(n_sampled, p=weights)

        # Day offsets 0..364 cover every day of 2021.
        offset_days = int(np.random.randint(0, 365))

        interactions.append({
            # A row taken from a mixed int/float frame is upcast to float, so
            # the id must be cast back to int (otherwise it is emitted as 3.0).
            'investorId': int(investor['investorId']),
            'dealId': int(candidates['dealId'].iloc[chosen]),
            'timestamp': start_date + pd.Timedelta(days=offset_days)
        })

    # Explicit columns keep the schema stable when no rows were generated.
    return pd.DataFrame(interactions, columns=columns)

## Generate Data

In [None]:
# Build the investor table from the configured investor count and preview
# its first rows.
print("Generating investor features...")
investor_df = generate_investor_features(n_investors)
print(f"Generated {len(investor_df)} investors")
print(investor_df.head())

In [None]:
# Build the deal table from the configured deal count and preview its first rows.
print("\nGenerating deal features...")  
deal_df = generate_deal_features(n_deals)
print(f"Generated {len(deal_df)} deals")
print(deal_df.head())

In [None]:
# Sample preference-weighted interactions between the generated investors and
# deals, then preview the first rows.
print("\nGenerating interactions...")
interactions_df = generate_interactions_with_preferences(investor_df, deal_df, n_interactions)
print(f"Generated {len(interactions_df)} interactions")
print(interactions_df.head())

## Data Statistics

In [None]:
# Summarise how the generated interactions are spread over investors and deals.
print("Interaction Statistics:")
print(f"- Total interactions: {len(interactions_df)}")

# Number of distinct investors with at least one interaction; reused for the
# per-investor average below.
active_investor_count = interactions_df['investorId'].nunique()
print(f"- Unique investors: {active_investor_count}")
print(f"- Unique deals: {interactions_df['dealId'].nunique()}")
print(f"- Avg interactions per investor: {len(interactions_df) / active_investor_count:.2f}")

## Save Data

In [None]:
# Write the three generated tables to CSV files under ``data/``.
# Each file name is listed exactly once so the files written and the summary
# printed afterwards cannot drift apart.
output_dir = 'data'
outputs = [
    ('enhanced_interactions.csv', interactions_df),
    ('investor_features.csv', investor_df),
    ('deal_features.csv', deal_df),
]

# Create directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Save datasets (without the DataFrame index column)
saved_paths = []
for filename, frame in outputs:
    csv_path = f"{output_dir}/{filename}"
    frame.to_csv(csv_path, index=False)
    saved_paths.append(csv_path)

print("Data saved successfully!")
for csv_path in saved_paths:
    print(f"- {csv_path}")