In [1]:
import pandas as pd
import numpy as np

# Build a synthetic panel of daily price series (one row per entity, one
# column per calendar day) plus two imbalanced categorical attributes, and
# write it to 'timeseries_syntheticdata.csv'.
#
# The order of the np.random calls below determines the generated values for
# a given seed; reordering them changes the output file.

# Seed NumPy's global RNG so the generated data is reproducible
np.random.seed(42)

# Parameters
n_entities = 10000  # Number of simulated entities (one price series each)
start_date = '2023-01-01'
end_date = '2023-12-31'
regions = ['North America', 'Europe', 'Asia']  # Categorical attribute 1
genders = ['Male', 'Female']  # Categorical attribute 2
anomaly_fraction = 0.01  # Share of (date, entity) cells that receive a shock
anomaly_magnitude = 50  # Absolute size of an injected spike or drop

# Daily calendar, both endpoints included
dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Multiplicative random walk: each series starts from its own initial price
# and compounds normally distributed daily returns along the date axis.
initial_prices = np.random.uniform(low=50, high=100, size=n_entities)
price_changes = np.random.normal(loc=0.001, scale=0.02, size=(len(dates), n_entities))  # Daily returns
stock_prices = np.cumprod(1 + price_changes, axis=0) * initial_prices

# Additive unit-variance Gaussian noise on every observation
noise = np.random.normal(loc=0, scale=1, size=stock_prices.shape)
stock_prices += noise

# Inject point anomalies one draw at a time (date index, entity index, sign).
# Cells are sampled with replacement, so the same cell can be hit more than
# once. A downward shock is not clipped, so a price can end up below zero.
n_anomalies = int(anomaly_fraction * n_entities * len(dates))
for _ in range(n_anomalies):
    i, j = np.random.randint(0, len(dates)), np.random.randint(0, n_entities)
    stock_prices[i, j] += np.random.choice([-anomaly_magnitude, anomaly_magnitude])

# Static attributes, deliberately imbalanced (70/30 and 25/25/50)
gender_assignments = np.random.choice(genders, n_entities, p=[0.7, 0.3])
region_assignments = np.random.choice(regions, n_entities, p=[0.25, 0.25, 0.5])

# Entity labels shared by both frames so they align on concatenation
stock_names = [f'Stock_{i}' for i in range(n_entities)]

# Dynamic features: rows are dates, columns are entities
df_stock_prices = pd.DataFrame(stock_prices, index=dates, columns=stock_names)

# Static features: one row per entity
df_static = pd.DataFrame({'Gender': gender_assignments, 'Region': region_assignments}, index=stock_names)

# Wide table: one row per entity, one column per date, then Gender and Region
df_combined = pd.concat([df_stock_prices.T, df_static], axis=1)

df_combined.to_csv('timeseries_syntheticdata.csv')

# Keep the preview as the last expression so the notebook actually renders it
df_combined.head()


In [15]:
import pandas as pd
import numpy as np

# Build a synthetic table of hourly vital signs for a small patient cohort,
# attach per-patient demographics, and write it to 'vital_signs_data.csv'.

# No seed is set in this cell, so the draws depend on the state of NumPy's
# global RNG at execution time.
# NOTE(review): uncomment the next line if the CSV must be reproducible when
# this cell is re-run on its own.
# np.random.seed(42)

# Define simulation parameters
n_patients = 10
start_date = "2023-01-01"
end_date = "2023-02-12"  # 42 days (six weeks) after start_date
# Both endpoints are included, giving 42 * 24 + 1 = 1009 hourly timestamps.
hours = pd.date_range(start=start_date, end=end_date, freq='h')
n_readings = len(hours) * n_patients  # One row per (patient, hour)

# Generate demographic data (one row per patient)
demographics = pd.DataFrame({
    'Patient_ID': range(n_patients),
    'Age': np.random.randint(18, 85, size=n_patients),  # 18..84, upper bound exclusive
    'Gender': np.random.choice(['Male', 'Female'], size=n_patients),
    'Mortality': np.random.choice(['No', 'Yes'], size=n_patients, p=[0.7, 0.3])  # 30% mortality rate
})

# Generate hourly vital signs data. Every reading is an independent uniform
# integer draw; randint upper bounds are exclusive (e.g. saturation is 95..99).
vital_signs = pd.DataFrame({
    'Time': np.tile(hours, n_patients),  # Full hour range repeated per patient
    'Patient_ID': np.repeat(range(n_patients), len(hours)),  # Each ID repeated per hour
    'Heart_Rate': np.random.randint(60, 100, size=n_readings),
    'Blood_Pressure_Systolic': np.random.randint(90, 140, size=n_readings),
    'Blood_Pressure_Diastolic': np.random.randint(60, 90, size=n_readings),
    'Oxygen_Saturation': np.random.randint(95, 100, size=n_readings)
})

# Join demographics onto every reading. This is illustrative; in practice the
# two tables would usually be kept separate and joined as needed.
# validate='many_to_one' makes the merge fail if Patient_ID is not unique in
# demographics.
full_data = pd.merge(vital_signs, demographics, on='Patient_ID', validate='many_to_one')
assert len(full_data) == n_readings, "merge changed the number of readings"

# The exported file carries no timestamp column; rows stay in merge order.
full_data = full_data.drop(columns='Time')
full_data.to_csv('vital_signs_data.csv', index=False)

{'Yes', 'No'}
2018
1009
