In [1]:
## Create Dataset for Analysis
# This notebook demonstrates the creation of a synthetic dataset for analyzing financial market structures 
# and their relationship with economic indicators.

In [2]:
import numpy as np
import pandas as pd
import datetime
import os

# Set output directory.
# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
output_dir = "../data"
os.makedirs(output_dir, exist_ok=True)

In [3]:
## Create Synthetic Economic Indicators

def create_economic_indicators_data(start_date='1960-01-01', end_date='2010-12-31', seed=43):
    """Generate synthetic month-end economic indicators as random walks.

    Each indicator is a starting level plus the cumulative sum of normally
    distributed monthly steps:

    * ``household_income``: starts at 50000, steps ~ N(50, 200)
    * ``cpi``: starts at 100, steps ~ N(0.1, 0.5)
    * ``government_debt``: starts at 1e12, steps ~ N(1e9, 5e9)

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Bounds passed to ``pd.date_range``; one row is produced for every
        month end that falls inside the range.
    seed : int
        Seed for the random number generator, so the output is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``household_income``, ``cpi`` and ``government_debt``,
        indexed by month-end timestamps.
    """
    # A private legacy RandomState produces the same stream as
    # np.random.seed(seed) followed by np.random.normal(...), but it does not
    # reseed the process-wide generator that other cells may be relying on.
    rng = np.random.RandomState(seed)

    # Use the MonthEnd offset object rather than the 'M' string alias: newer
    # pandas releases deprecate 'M' (in favour of 'ME'), while the offset
    # object is accepted by old and new versions alike.
    months = pd.date_range(start_date, end_date, freq=pd.offsets.MonthEnd())
    num_months = len(months)

    # Generate economic indicators. The draw order (income, CPI, debt) is part
    # of the reproducible output and must not be rearranged.
    household_income = 50000 + np.cumsum(rng.normal(50, 200, num_months))
    cpi = 100 + np.cumsum(rng.normal(0.1, 0.5, num_months))
    gov_debt = 1e12 + np.cumsum(rng.normal(1e9, 5e9, num_months))

    df_econ = pd.DataFrame({
        'household_income': household_income,
        'cpi': cpi,
        'government_debt': gov_debt
    }, index=months)

    return df_econ

# Build the indicator table with the default date range and seed, write it
# into the output directory, and preview the first rows.
df_econ = create_economic_indicators_data()
econ_csv_path = os.path.join(output_dir, 'economic_indicators.csv')
df_econ.to_csv(econ_csv_path)
print("Saved economic_indicators.csv")
df_econ.head()

## Proper way to load this dataset
# df_loaded = pd.read_csv('../data/economic_indicators.csv', index_col=0, parse_dates=True)

# - economic_indicators.csv: monthly economic indicators (household_income, cpi, government_debt) from 1960-01-31 to 2010-12-31

Saved economic_indicators.csv


  months = pd.date_range(start_date, end_date, freq='M')


Unnamed: 0,household_income,cpi,government_debt
1960-01-31,50101.479985,99.937734,1006815000000.0
1960-02-29,49969.783699,99.939156,1006725000000.0
1960-03-31,49944.083077,100.024229,1005619000000.0
1960-04-30,49887.099958,100.300251,1009328000000.0
1960-05-31,50108.714627,100.119371,1012082000000.0


In [4]:
## Create Synthetic Stock Returns

def normalize(x):
    """Min-max scale ``x`` using its NaN-ignoring minimum and maximum.

    A small epsilon (1e-9) is added to the range so a constant input maps to
    zeros instead of dividing by zero. NaN entries in ``x`` stay NaN.
    """
    lowest = np.nanmin(x)
    highest = np.nanmax(x)
    span = highest - lowest + 1e-9
    return (x - lowest) / span

def create_stock_returns_data(df_econ, start_date='1960-02-01', end_date='2010-12-31', num_stocks=100, seed=42):
    """Generate synthetic business-day stock series tied to economic indicators.

    The economic indicators are aligned to a business-day calendar and
    min-max normalised. They modulate the volatility of four sector factors:

    * ``sector_A``: volatility falls as household income rises
    * ``sector_B``: volatility rises with government debt
    * ``sector_C``: volatility falls (half as strongly) as household income rises
    * ``sector_D``: volatility rises with CPI

    Every stock is ``0.3 * market trend + 0.5 * its sector factor`` plus
    idiosyncratic noise whose scale grows with CPI.

    NOTE(review): the market trend and sector factors are cumulative sums, so
    the generated series behave like drifting levels plus noise rather than
    independent per-day returns -- confirm this is what downstream analysis
    expects before changing it.

    Parameters
    ----------
    df_econ : pandas.DataFrame
        Must provide ``household_income``, ``cpi`` and ``government_debt``
        columns on a sorted datetime index (``reindex(method='ffill')``
        requires a monotonic index).
    start_date, end_date : str or datetime-like
        Bounds of the business-day calendar (``freq='B'``).
    num_stocks : int
        Number of stock columns. The first three sectors each receive
        ``num_stocks // 4`` stocks; ``sector_D`` takes the remainder.
    seed : int
        Seed for the random number generator, so the output is reproducible.

    Returns
    -------
    pandas.DataFrame
        One column per stock (``Stock_1`` ... ``Stock_<num_stocks>``), indexed
        by business day.
    """
    # A private legacy RandomState produces the same stream as
    # np.random.seed(seed) followed by np.random.normal(...), but it does not
    # reseed the process-wide generator.
    rng = np.random.RandomState(seed)
    dates = pd.date_range(start_date, end_date, freq='B')

    # Carry the latest monthly observation forward onto each business day.
    # Days that precede the first observation have nothing to carry forward;
    # back-fill them so NaN never reaches the volatility terms below (a single
    # NaN would poison every value through the cumulative sums).
    df_econ_daily = df_econ.reindex(dates, method='ffill').bfill()

    household_income = df_econ_daily['household_income'].values
    cpi = df_econ_daily['cpi'].values
    gov_debt = df_econ_daily['government_debt'].values

    h_income_norm = normalize(household_income)
    cpi_norm = normalize(cpi)
    debt_norm = normalize(gov_debt)

    stocks_per_sector = num_stocks // 4
    sectors = (
        ['sector_A'] * stocks_per_sector
        + ['sector_B'] * stocks_per_sector
        + ['sector_C'] * stocks_per_sector
        + ['sector_D'] * (num_stocks - 3 * stocks_per_sector)
    )

    num_days = len(dates)

    # The order of the random draws (market, A, B, C, D, then one noise vector
    # per stock) determines the reproducible output; do not rearrange it.
    base_market_trend = np.cumsum(rng.normal(0.0002, 0.01, num_days))

    sector_A_factor = np.cumsum(rng.normal(0.0001, 0.008*(1 - h_income_norm), num_days))
    sector_B_factor = np.cumsum(rng.normal(0.0001, 0.008*(1 + debt_norm), num_days))
    sector_C_factor = np.cumsum(rng.normal(0.0001, 0.008*(1 - h_income_norm*0.5), num_days))
    sector_D_factor = np.cumsum(rng.normal(0.0001, 0.008*(1 + cpi_norm), num_days))

    sector_factors = {
        'sector_A': sector_A_factor,
        'sector_B': sector_B_factor,
        'sector_C': sector_C_factor,
        'sector_D': sector_D_factor
    }

    # The market/sector mix depends only on the sector and the noise scale
    # only on CPI, so compute both once instead of once per stock.
    combined_by_sector = {
        name: base_market_trend * 0.3 + factor * 0.5
        for name, factor in sector_factors.items()
    }
    noise_scale = 0.005 + 0.002*cpi_norm

    returns_data = {}
    for i in range(num_stocks):
        noise = rng.normal(0, noise_scale, num_days)
        returns_data[f'Stock_{i+1}'] = combined_by_sector[sectors[i]] + noise

    df_returns = pd.DataFrame(returns_data, index=dates)
    return df_returns

# Derive the stock series from the economic indicators generated above,
# write them into the output directory, and preview the first rows.
df_returns = create_stock_returns_data(df_econ)
returns_csv_path = os.path.join(output_dir, 'stock_returns.csv')
df_returns.to_csv(returns_csv_path)
print("Saved stock_returns.csv")

df_returns.head()

## Proper way to load this dataset
# df_loaded = pd.read_csv('../data/stock_returns.csv', index_col=0, parse_dates=True)

# - stock_returns.csv: business-day returns of 100 synthetic stocks from 1960-02-01 to 2010-12-31

Saved stock_returns.csv


Unnamed: 0,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5,Stock_6,Stock_7,Stock_8,Stock_9,Stock_10,...,Stock_91,Stock_92,Stock_93,Stock_94,Stock_95,Stock_96,Stock_97,Stock_98,Stock_99,Stock_100
1960-02-01,0.004737,-0.007731,0.00265,-0.006843,0.002603,0.005599,-0.000249,0.012171,0.006131,-0.00649,...,4.5e-05,-0.00719,0.001179,-0.000717,0.003559,-0.005243,0.001707,0.005664,0.002348,0.005585
1960-02-02,0.013182,0.004764,0.009974,0.005031,0.009801,0.006647,0.004094,0.004177,-0.005372,0.001359,...,0.014416,-0.001579,-0.004739,0.008347,-0.001802,0.001831,0.004785,-0.003011,0.010786,0.003781
1960-02-03,0.00604,0.009065,0.016748,0.010452,0.00918,0.010431,0.011991,0.00682,0.007108,0.015192,...,0.008846,-0.003701,0.00478,0.003418,0.002533,0.005005,0.015376,0.00112,0.011902,0.000488
1960-02-04,0.007581,0.004363,0.016609,0.015009,0.018566,0.008029,0.012643,0.001856,0.018008,0.013046,...,0.005466,0.004073,0.006113,0.000271,0.00815,0.001033,0.008978,0.00486,0.003538,0.005467
1960-02-05,0.015811,0.013937,0.016879,0.009888,0.012851,0.017894,0.014917,0.014474,0.009288,0.00731,...,0.02228,0.015108,0.021407,0.013848,0.011812,0.013547,0.017625,0.009535,0.025477,0.02139
