In [1]:
import numpy as np
from datetime import datetime
import random
import pandas as pd
from synesis_data_structures.time_series.synthetic import TimeSeriesStructure, TimeSeriesAggregationStructure


def generate_synthetic_time_series_data(
    num_entities: int = 5,
    min_timestamps: int = 400,
    max_timestamps: int = 600,
    seed: int = 42
) -> TimeSeriesStructure:
    """
    Generate synthetic time series data following the new DataFrame structure.

    Returns a TimeSeriesStructure instance with:
    - time_series_data: MultiIndex DataFrame with entity and timestamp
    - time_series_entity_metadata: Entity-specific static metadata
    - feature_information: Feature descriptions and metadata
    """
    # Set random seed for reproducibility
    np.random.seed(seed)
    random.seed(seed)

    # Constants
    CITY_COUNTRY_MAP = {
        'New York': 'USA',
        'London': 'UK',
        'Tokyo': 'Japan',
        'Paris': 'France',
        'Berlin': 'Germany',
        'Sydney': 'Australia',
        'Singapore': 'Singapore',
        'Dubai': 'UAE',
        'Mumbai': 'India',
        'São Paulo': 'Brazil'
    }
    CITIES = list(CITY_COUNTRY_MAP.keys())
    DISTRICTS = ['Downtown', 'Suburban',
                 'Industrial', 'Residential', 'Commercial']
    ADMIN_CLASSES = ['national', 'provincial', 'municipal', 'village']

    # Generate entity names (roads)
    road_names = [f"Road_{i:03d}" for i in range(num_entities)]

    # Generate timestamps (hourly data up to now)
    end_time = datetime.now()

    # Create the time series data
    all_data = []
    all_entities = []
    all_timestamps = []

    for entity in road_names:
        # Random number of timestamps for this entity
        n_timestamps = random.randint(min_timestamps, max_timestamps)
        entity_timestamps = pd.date_range(
            end=end_time, periods=n_timestamps, freq='h')

        # Generate traffic data (cars per hour)
        base_traffic = np.random.randint(100, 1000)
        traffic = base_traffic + np.random.normal(0, 100, n_timestamps)
        traffic = np.maximum(0, traffic)
        traffic = traffic.astype(int)

        # Generate temperature data (in Celsius)
        base_temp = np.random.uniform(10, 25)
        temp = base_temp + np.random.normal(0, 5, n_timestamps)

        # Generate rain data (binary)
        rain_prob = np.random.uniform(0.1, 0.3)
        rain = np.random.binomial(1, rain_prob, n_timestamps)

        # Combine all features
        entity_data = np.column_stack([traffic, temp, rain])

        all_data.append(entity_data)
        all_entities.extend([entity] * n_timestamps)
        all_timestamps.extend(entity_timestamps)

    # Create the main time series DataFrame
    time_series_data = pd.DataFrame(
        np.vstack(all_data),
        index=pd.MultiIndex.from_arrays(
            [all_entities, all_timestamps],
            names=['entity', 'timestamp']
        ),
        columns=['cars_per_hour', 'temperature', 'is_raining']
    )

    # Create entity metadata
    time_series_entity_metadata = pd.DataFrame({
        'city': np.random.choice(CITIES, num_entities),
        'district': np.random.choice(DISTRICTS, num_entities),
        'district_population': np.random.randint(10000, 1000000, num_entities),
        'city_population': np.random.randint(100000, 10000000, num_entities),
        'administrative_class': np.random.choice(ADMIN_CLASSES, num_entities)
    }, index=road_names)
    time_series_entity_metadata.index.name = 'entity'

    # Map cities to their correct countries
    time_series_entity_metadata['country'] = time_series_entity_metadata['city'].map(
        CITY_COUNTRY_MAP)

    # Create mapping for administrative_class
    admin_mapping = {i: cls for i, cls in enumerate(ADMIN_CLASSES)}

    # Convert administrative_class to integers
    time_series_entity_metadata['administrative_class'] = time_series_entity_metadata['administrative_class'].map(
        {v: k for k, v in admin_mapping.items()}
    )

    # Create feature information
    feature_information = pd.DataFrame({
        'unit': ['cars/hour', 'celsius', 'count'],
        'description': [
            'Number of cars passing per hour',
            'Temperature in Celsius',
            'Binary indicator for rain (0=no rain, 1=rain)'
        ],
        'type': ['numerical', 'numerical', 'categorical'],
        'subtype': ['discrete', 'continuous', 'discrete'],
        'scale': ['ratio', 'interval', 'nominal'],
        'source': ['data', 'data', 'data'],
        'category_id': [pd.NA, pd.NA, 0]  # category_id 0 for rain mapping
    }, index=['cars_per_hour', 'temperature', 'is_raining'])
    feature_information.index.name = 'name'

    # Add metadata features to feature information
    metadata_features = pd.DataFrame({
        'unit': ['string', 'string', 'count', 'count', 'string', 'string'],
        'description': [
            'City name',
            'District name',
            'Population of the district',
            'Population of the city',
            'Administrative classification level',
            'Country name'
        ],
        'type': ['categorical', 'categorical', 'numerical', 'numerical', 'categorical', 'categorical'],
        'subtype': ['discrete', 'discrete', 'discrete', 'discrete', 'discrete', 'discrete'],
        'scale': ['nominal', 'nominal', 'ratio', 'ratio', 'nominal', 'nominal'],
        'source': ['metadata', 'metadata', 'metadata', 'metadata', 'metadata', 'metadata'],
        # Different category IDs for different categorical features
        'category_id': [1, 2, pd.NA, pd.NA, 3, 4]
    }, index=['city', 'district', 'district_population', 'city_population', 'administrative_class', 'country'])
    metadata_features.index.name = 'name'

    # Combine all feature information
    feature_information = pd.concat([feature_information, metadata_features])

    return TimeSeriesStructure(
        time_series_data=time_series_data,
        entity_metadata=time_series_entity_metadata,
        feature_information=feature_information
    )


def generate_synthetic_time_series_aggregation_data(
    num_aggregations: int = 15,
    seed: int = 42
) -> TimeSeriesAggregationStructure:
    """
    Generate synthetic time series aggregation data.
    This function internally generates time series data and then creates aggregations based on it.

    Args:
        num_aggregations: Number of aggregations to generate
        seed: Random seed for reproducibility

    Returns:
        A TimeSeriesAggregationStructure instance
    """
    # Set random seed
    np.random.seed(seed)
    random.seed(seed)

    # Generate the base time series data internally
    time_series_structure = generate_synthetic_time_series_data(seed=seed)

    # Extract the time series data
    ts_data = time_series_structure.time_series_data
    ts_metadata = time_series_structure.entity_metadata

    # Get unique entities and their data
    entities = ts_data.index.get_level_values('entity').unique()

    # Generate aggregation inputs
    aggregation_inputs = []
    aggregation_outputs = []
    aggregation_metadata = []

    for agg_id in range(num_aggregations):
        # Randomly select entity and feature
        entity = random.choice(entities)
        feature = random.choice(['cars_per_hour', 'temperature', 'is_raining'])

        # Get entity data
        entity_data = ts_data.loc[entity]

        # Generate random time window
        start_idx = random.randint(
            0, len(entity_data) - 24)  # At least 24 hours
        # At least 12 hours window
        end_idx = random.randint(start_idx + 12, len(entity_data))

        start_timestamp = entity_data.index[start_idx]
        end_timestamp = entity_data.index[end_idx]

        # Calculate aggregation based on feature type
        window_data = entity_data.iloc[start_idx:end_idx + 1][feature]

        if feature == 'cars_per_hour':
            # Calculate multiple statistics for traffic
            mean_traffic = window_data.mean()
            max_traffic = window_data.max()
            traffic_variance = window_data.var()
            peak_hour_traffic = window_data.quantile(0.95)

            # Add multiple outputs for this aggregation
            outputs = {
                'mean_traffic': mean_traffic,
                'max_traffic': max_traffic,
                'traffic_variance': traffic_variance,
                'peak_hour_traffic': peak_hour_traffic
            }

        elif feature == 'temperature':
            # Calculate temperature statistics
            mean_temp = window_data.mean()
            temp_range = window_data.max() - window_data.min()
            temp_std = window_data.std()

            outputs = {
                'mean_temperature': mean_temp,
                'temperature_range': temp_range,
                'temperature_std': temp_std
            }

        else:  # is_raining
            # Calculate rain statistics
            total_rain_hours = window_data.sum()
            rain_frequency = window_data.mean()
            longest_rain_streak = (window_data.astype(bool).astype(int).groupby(
                (window_data.astype(bool).astype(int) !=
                 window_data.astype(bool).astype(int).shift()).cumsum()
            ).sum().max())

            outputs = {
                'total_rain_hours': total_rain_hours,
                'rain_frequency': rain_frequency,
                'longest_rain_streak': longest_rain_streak
            }

        # Add aggregation inputs
        aggregation_inputs.append({
            'aggregation_id': agg_id,
            'time_series_id': entity,
            'input_feature_name': feature,
            'start_timestamp': start_timestamp,
            'end_timestamp': end_timestamp
        })

        # Add aggregation outputs
        for output_name, output_value in outputs.items():
            aggregation_outputs.append({
                'aggregation_id': agg_id,
                output_name: output_value
            })

        # Add aggregation metadata
        aggregation_metadata.append({
            'aggregation_id': agg_id,
            'aggregation_type': f'{feature}_statistics',
            'window_size_hours': (end_timestamp - start_timestamp).total_seconds() / 3600,
            'entity_city': ts_metadata.loc[entity, 'city'],
            'entity_district': ts_metadata.loc[entity, 'district']
        })

    # Create DataFrames
    aggregation_inputs_df = pd.DataFrame(aggregation_inputs)
    aggregation_outputs_df = pd.DataFrame(aggregation_outputs)
    aggregation_metadata_df = pd.DataFrame(aggregation_metadata)

    # Set index for outputs and metadata
    aggregation_outputs_df.set_index('aggregation_id', inplace=True)
    aggregation_metadata_df.set_index('aggregation_id', inplace=True)

    # Create feature information for aggregation outputs
    output_features = []

    # Traffic output features
    traffic_outputs = ['mean_traffic', 'max_traffic',
                       'traffic_variance', 'peak_hour_traffic']
    for feature in traffic_outputs:
        output_features.append({
            'name': feature,
            'unit': 'cars/hour',
            'description': f'{feature.replace("_", " ").title()} from traffic data aggregation',
            'type': 'numerical',
            'subtype': 'continuous',
            'scale': 'ratio',
            'source': 'data',
            'category_id': pd.NA
        })

    # Temperature output features
    temp_outputs = ['mean_temperature', 'temperature_range', 'temperature_std']
    for feature in temp_outputs:
        output_features.append({
            'name': feature,
            'unit': 'celsius',
            'description': f'{feature.replace("_", " ").title()} from temperature data aggregation',
            'type': 'numerical',
            'subtype': 'continuous',
            'scale': 'interval',
            'source': 'data',
            'category_id': pd.NA
        })

    # Rain output features
    rain_outputs = ['total_rain_hours',
                    'rain_frequency', 'longest_rain_streak']
    for feature in rain_outputs:
        unit = 'hours' if feature == 'total_rain_hours' else 'count'
        output_features.append({
            'name': feature,
            'unit': unit,
            'description': f'{feature.replace("_", " ").title()} from rain data aggregation',
            'type': 'numerical',
            'subtype': 'discrete',
            'scale': 'ratio',
            'source': 'data',
            'category_id': pd.NA
        })

    # Metadata features
    metadata_features = ['aggregation_type',
                         'window_size_hours', 'entity_city', 'entity_district']
    for feature in metadata_features:
        if feature == 'window_size_hours':
            output_features.append({
                'name': feature,
                'unit': 'hours',
                'description': 'Duration of the aggregation window',
                'type': 'numerical',
                'subtype': 'continuous',
                'scale': 'ratio',
                'source': 'metadata',
                'category_id': pd.NA
            })
        else:
            output_features.append({
                'name': feature,
                'unit': 'string',
                'description': f'{feature.replace("_", " ").title()} from aggregation metadata',
                'type': 'categorical',
                'subtype': 'discrete',
                'scale': 'nominal',
                'source': 'metadata',
                'category_id': 5  # New category ID for aggregation metadata
            })

    aggregation_feature_information = pd.DataFrame(output_features)
    aggregation_feature_information.set_index('name', inplace=True)

    return TimeSeriesAggregationStructure(
        time_series_aggregation_outputs=aggregation_outputs_df,
        time_series_aggregation_inputs=aggregation_inputs_df,
        entity_metadata=aggregation_metadata_df,
        feature_information=aggregation_feature_information
    )

In [2]:
object_group = generate_synthetic_time_series_data()
time_series = object_group.time_series_data
entity_metadata = object_group.entity_metadata
feature_information = object_group.feature_information

In [3]:
time_series.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2456 entries, ('Road_000', Timestamp('2025-09-14 11:17:24.734089')) to ('Road_004', Timestamp('2025-10-07 21:17:24.734089'))
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cars_per_hour  2456 non-null   float64
 1   temperature    2456 non-null   float64
 2   is_raining     2456 non-null   float64
dtypes: float64(3)
memory usage: 85.8+ KB


In [4]:
print(time_series.head().to_string())

                                     cars_per_hour  temperature  is_raining
entity   timestamp                                                         
Road_000 2025-09-14 11:17:24.734089          146.0    21.416886         0.0
         2025-09-14 12:17:24.734089          253.0    32.217965         0.0
         2025-09-14 13:17:24.734089          249.0    17.246501         0.0
         2025-09-14 14:17:24.734089          338.0    19.202538         0.0
         2025-09-14 15:17:24.734089          110.0    21.066240         1.0


In [15]:
time_series.loc[["Road_000"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,cars_per_hour,temperature,is_raining
entity,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Road_000,2025-09-14 11:17:24.734089,146.0,21.416886,0.0
Road_000,2025-09-14 12:17:24.734089,253.0,32.217965,0.0
Road_000,2025-09-14 13:17:24.734089,249.0,17.246501,0.0
Road_000,2025-09-14 14:17:24.734089,338.0,19.202538,0.0
Road_000,2025-09-14 15:17:24.734089,110.0,21.066240,1.0
Road_000,...,...,...,...
Road_000,2025-10-07 17:17:24.734089,29.0,17.880172,1.0
Road_000,2025-10-07 18:17:24.734089,326.0,17.149222,0.0
Road_000,2025-10-07 19:17:24.734089,408.0,17.584881,0.0
Road_000,2025-10-07 20:17:24.734089,7.0,28.847353,0.0


In [5]:
entity_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Road_000 to Road_004
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   city                  5 non-null      object
 1   district              5 non-null      object
 2   district_population   5 non-null      int64 
 3   city_population       5 non-null      int64 
 4   administrative_class  5 non-null      int64 
 5   country               5 non-null      object
dtypes: int64(3), object(3)
memory usage: 280.0+ bytes


In [6]:
entity_metadata

Unnamed: 0_level_0,city,district,district_population,city_population,administrative_class,country
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Road_000,São Paulo,Residential,365247,484533,1,Brazil
Road_001,Tokyo,Commercial,76446,4403829,0,Japan
Road_002,New York,Commercial,563360,3702294,3,USA
Road_003,Tokyo,Industrial,43717,1361668,1,Japan
Road_004,Berlin,Industrial,228417,3250374,3,Germany


In [7]:
feature_information.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, cars_per_hour to country
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   unit         9 non-null      object
 1   description  9 non-null      object
 2   type         9 non-null      object
 3   subtype      9 non-null      object
 4   scale        9 non-null      object
 5   source       9 non-null      object
 6   category_id  5 non-null      object
dtypes: object(7)
memory usage: 576.0+ bytes


In [8]:
feature_information

Unnamed: 0_level_0,unit,description,type,subtype,scale,source,category_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cars_per_hour,cars/hour,Number of cars passing per hour,numerical,discrete,ratio,data,
temperature,celsius,Temperature in Celsius,numerical,continuous,interval,data,
is_raining,count,"Binary indicator for rain (0=no rain, 1=rain)",categorical,discrete,nominal,data,0.0
city,string,City name,categorical,discrete,nominal,metadata,1.0
district,string,District name,categorical,discrete,nominal,metadata,2.0
district_population,count,Population of the district,numerical,discrete,ratio,metadata,
city_population,count,Population of the city,numerical,discrete,ratio,metadata,
administrative_class,string,Administrative classification level,categorical,discrete,nominal,metadata,3.0
country,string,Country name,categorical,discrete,nominal,metadata,4.0


In [9]:
time_series_aggregation = generate_synthetic_time_series_aggregation_data()
time_series_aggregation_outputs = time_series_aggregation.time_series_aggregation_outputs
time_series_aggregation_inputs = time_series_aggregation.time_series_aggregation_inputs

In [10]:
time_series_aggregation_outputs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52 entries, 0 to 14
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   mean_traffic         7 non-null      float64
 1   max_traffic          7 non-null      float64
 2   traffic_variance     7 non-null      float64
 3   peak_hour_traffic    7 non-null      float64
 4   total_rain_hours     4 non-null      float64
 5   rain_frequency       4 non-null      float64
 6   longest_rain_streak  4 non-null      float64
 7   mean_temperature     4 non-null      float64
 8   temperature_range    4 non-null      float64
 9   temperature_std      4 non-null      float64
dtypes: float64(10)
memory usage: 4.5 KB


In [11]:
time_series_aggregation_outputs.head()

Unnamed: 0_level_0,mean_traffic,max_traffic,traffic_variance,peak_hour_traffic,total_rain_hours,rain_frequency,longest_rain_streak,mean_temperature,temperature_range,temperature_std
aggregation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,722.769231,,,,,,,,,
0,,941.0,,,,,,,,
0,,,9167.024038,,,,,,,
0,,,,893.4,,,,,,
1,204.297521,,,,,,,,,


In [12]:
time_series_aggregation_inputs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   aggregation_id      15 non-null     int64         
 1   time_series_id      15 non-null     object        
 2   input_feature_name  15 non-null     object        
 3   start_timestamp     15 non-null     datetime64[ns]
 4   end_timestamp       15 non-null     datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 732.0+ bytes


In [13]:
time_series_aggregation_inputs.head()

Unnamed: 0,aggregation_id,time_series_id,input_feature_name,start_timestamp,end_timestamp
0,0,Road_001,cars_per_hour,2025-09-23 01:17:24.814467,2025-09-25 17:17:24.814467
1,1,Road_004,cars_per_hour,2025-09-30 22:17:24.814467,2025-10-05 22:17:24.814467
2,2,Road_000,cars_per_hour,2025-09-18 10:17:24.814467,2025-09-23 13:17:24.814467
3,3,Road_001,is_raining,2025-10-02 22:17:24.814467,2025-10-03 13:17:24.814467
4,4,Road_004,cars_per_hour,2025-10-03 14:17:24.814467,2025-10-07 13:17:24.814467
