## Converting time series format to DBN format 

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle 

In [3]:
import os
path = os.getcwd()
path

'/home/choudhar/Dyno/causalnex/causalnex/structure'

In [4]:
#defining folding functions 

def time_rename(dt):
    """
    Renames the columns in a DataFrame so that they end in '_t_0'.
    
    Parameters:
        dt (DataFrame): The DataFrame to be treated.
        
    Returns:
        DataFrame: The renamed DataFrame.
    """
    if not isinstance(dt.index, pd.DatetimeIndex):
        raise ValueError("DataFrame index should be a DatetimeIndex.")
    
    if any(col.endswith('_t_0') for col in dt.columns):
        print("One or more of the column names already ends in '_t_0'. No more suffixes will be added.")
    else:
        dt = dt.copy()
        for col in dt.columns:
            dt.rename(columns={col: f"{col}_t_0"}, inplace=True)
    
    return dt

def check_time0_formatted(obj):
    """
    Check if the column names end with '_t_0'.

    Parameters:
        obj (DataFrame or Series): The object to check.

    Returns:
        bool: True if all column names end with '_t_0', False otherwise.
    """
    if isinstance(obj, pd.DataFrame):
        return all(col.endswith('_t_0') for col in obj.columns)
    elif isinstance(obj, pd.Series):
        return obj.name.endswith('_t_0')
    else:
        raise ValueError("Input must be a DataFrame or Series.")
        
def fold_dt_rec(dt, n_prev, size, slice_=1):
    """
    Widens the dataset to take into account the t previous time slices.

    Parameters:
        dt (DataFrame): The DataFrame to be treated.
        n_prev (list): Names of the previous time slices.
        size (int): Number of time slices to unroll.
        slice_ (int): The current time slice being treated. Should not be modified when first calling.

    Returns:
        DataFrame: The extended DataFrame.
    """
    if size > slice_:
        n = [prev.replace(f"_t_{slice_-1}", f"_t_{slice_}") for prev in n_prev]
        dt[n] = dt[n_prev].shift(1)
        
        dt = dt.iloc[1:]
        dt = fold_dt_rec(dt, n, size, slice_ + 1)
        
    return dt

def fold_dt(dt, size):
    
    if not isinstance(dt, pd.DataFrame):
        dt = pd.DataFrame(dt)
    
    dt = dt.copy()
    
    if not check_time0_formatted(dt):
        dt = time_rename(dt)
    
    return fold_dt_rec(dt, dt.columns, size) 


In [5]:
#quick test 
#data = {'value': [10, 20, 30, 40, 50, 60, 70, 80, 90]}
#index = pd.date_range('2022-01-01', periods=9, freq='D')
#df = pd.DataFrame(data, index=index)
#fold_dt(df,4)

## synthetic data 

See https://github.com/causalens/cdml-neurips2020 for original data.

The data in the pickle file is List(pd.DataFrames) where each dataframe represnets a certain trajectory. 


In [6]:
filename = os.path.join(path, '../data/data_8n_100ts_30N_datasets.pickle')

with open(filename, 'rb') as f:
    data = pickle.load(f)

In [6]:

# for each element in the list 'data', set the index to datetime,
#then fold the dataset to get a DBN format 

size = 10 
folded_data_frames = []
for i in range(len(data)):
    df = pd.DataFrame(data[i])
    df.index = pd.to_datetime(df.index)
    folded_data = fold_dt(df, size)
    folded_data_frames.append(folded_data)
all_trajectories_folded = pd.concat(folded_data_frames,axis=0)
all_trajectories_folded.to_csv(os.path.join(path, 'synthetic_datasets/data_8n_100ts_30N.csv'),index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt[n] = dt[n_prev].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt[n] = dt[n_prev].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt[n] = dt[n_prev].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt[n] = dt[n_prev].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt[n] = dt[n_prev].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt[n] = dt[n_prev].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea