In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm, multivariate_normal
import datetime

# Fix the state of NumPy's global RNG so every draw made below is repeatable.
np.random.seed(123123)

# Sample size: number of synthetic rows to generate.
N = 6_000_000

# (lower, upper) target bounds for the five market-floating variables.
# Insertion order is the variable order used by the correlation matrix:
# CDS, BestIVOL, S, bond_price, interest_rate.
ranges = dict(
    CDS=(40, 400),
    BestIVOL=(5, 50),
    S=(10, 100),
    bond_price=(50, 150),
    interest_rate=(0.01, 0.05),
)

# Observed pairwise correlations (based on market data), given once as the
# strict upper triangle; rows/columns follow the order of `ranges`.
_corr_upper = np.array([
    [0.0, 0.175292, -0.111677, -0.083193,  0.189044],
    [0.0, 0.0,      -0.103392,  0.313718,  0.174254],
    [0.0, 0.0,       0.0,       0.111398, -0.002671],
    [0.0, 0.0,       0.0,       0.0,      -0.046268],
    [0.0, 0.0,       0.0,       0.0,       0.0],
])

# Mirror the triangle and put ones on the diagonal to obtain the full,
# symmetric 5x5 correlation matrix.
corr_matrix = _corr_upper + _corr_upper.T + np.eye(len(ranges))

# Column order of the correlated draws; it must match the row/column order
# of corr_matrix.
market_vars = ["CDS", "BestIVOL", "S", "bond_price", "interest_rate"]

# Step 1: draw N zero-mean multivariate normal vectors whose covariance is the
# correlation matrix, so each marginal is standard normal.
z = multivariate_normal.rvs(mean=np.zeros(len(market_vars)), cov=corr_matrix, size=N)

# Step 2: push every coordinate through the standard normal CDF, which turns
# each marginal into a uniform on [0, 1] while keeping the dependence.
u = norm.cdf(z)

# Step 3: stretch each uniform column linearly onto its [lower, upper] range.
synthetic_market = {}
for name, uniform_col in zip(market_vars, u.T):
    lo, hi = ranges[name]
    synthetic_market[name] = lo + uniform_col * (hi - lo)

# Contract parameters that are drawn independently of the market variables.
# The draws are made in the same order as the columns appear, which keeps the
# random stream (and therefore the generated values) unchanged.

# Time to maturity is sampled in years (1 to 10) and then stored as whole days.
years_to_maturity = np.random.uniform(low=1, high=10, size=N)

df_other = pd.DataFrame({
    # Time to Maturity (TTM) in days, truncated to an integer
    "ttm_days": (years_to_maturity * 365).astype(int),
    # Coupon Rate: 0.3% to 6%
    "coupon_rate": np.random.uniform(low=0.003, high=0.06, size=N),
    # Coupon Payment Frequency: 2 per year with probability 0.8, 4 per year with 0.2
    "coupon_frequency": np.random.choice([2, 4], size=N, p=[0.8, 0.2]),
    # Conversion Ratio: 5 to 100
    "conversion_ratio": np.random.uniform(low=5, high=100, size=N),
    # Conversion Price: 15 to 175
    "conversion_price": np.random.uniform(low=15, high=175, size=N),
})

# Dividend: zero-inflated uniform.
# Each row pays a dividend with probability 0.7; payers get a value drawn
# uniformly from [0, 0.175), the remaining ~30% of rows keep a dividend of 0.
dividends = np.zeros(N)
mask = np.random.rand(N) < 0.7
n_payers = mask.sum()
# Draw exactly one value per paying row and write them into the masked slots.
payer_dividends = np.random.uniform(low=0, high=0.175, size=n_payers)
dividends[mask] = payer_dividends
df_other["dividend"] = dividends

# Issuance Date: the same fixed date (2020-01-01) for every row
df_other["issuance_date"] = pd.to_datetime("2020-01-01")

# First Coupon Date: each row picks one of five candidate dates with equal
# probability. The candidates lie 1 month, 6 months, 1 year, 2 years and
# 4 years after the issuance date.
possible_first_coupon_dates = [
    pd.to_datetime(date_str)
    for date_str in (
        "2020-02-01",  # 1 month later
        "2020-07-01",  # 6 months later
        "2021-01-01",  # 1 year later
        "2022-01-01",  # 2 years later
        "2024-01-01",  # 4 years later
    )
]
df_other["first_coupon_date"] = np.random.choice(possible_first_coupon_dates, size=N)

# Combine the correlated market-floating variables with the independently
# drawn parameters, side by side (market columns first).
df_market = pd.DataFrame(synthetic_market)
df_synth = pd.concat([df_market, df_other], axis=1)

# (Optional) Preview the first rows and the per-column summary statistics
print(df_synth.head())
print(df_synth.describe())

# Save the synthetic data to a CSV file.
# The path is kept in a single variable so the confirmation message always
# names the file that was actually written.
output_path = "synthetic_data_with_correlation_6M.csv"
df_synth.to_csv(output_path, index=False)
print(f"Synthetic data with meaningful correlation saved to '{output_path}'.")


          CDS   BestIVOL          S  bond_price  interest_rate  ttm_days  \
0  314.581058   8.663206  83.107546  132.008448       0.015593      3289   
1  283.305024  26.822203  72.210115   55.083915       0.038125      2055   
2  136.986243   7.626415  82.517493   93.928265       0.022480      1592   
3  159.496333  40.471264  54.524990  119.136006       0.026245      2384   
4  102.157419  21.851595  84.679622  128.223672       0.012185      1492   

   coupon_rate  coupon_frequency  conversion_ratio  conversion_price  \
0     0.055550                 2         44.458507         28.662565   
1     0.054504                 2         93.299437         83.059960   
2     0.057977                 4         65.023843        102.284363   
3     0.039899                 2         74.449632        133.355052   
4     0.057158                 2         60.943056         44.114785   

   dividend issuance_date first_coupon_date  
0  0.000000    2020-01-01        2020-07-01  
1  0.012672    202