# Imports

In [96]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
print(os.getcwd())

/home/filip-marcus/models/notebooks


# Utility functions

In [97]:
def unix_to_datetime(unix_time):
    """
    Converts a unix timestamp to a datetime object with format yyyy-mm-dd.
    Note that UTC time is used.
    
    Args:
        unix_time: int of Unix timestamp
        
    Returns:
        datetime: a datetime object with format yyyy-mm-dd
    """
    
    timestamp = pd.to_datetime(unix_time, unit='s')

    return timestamp.strftime("%Y-%m-%d %H:%M:%S")

def get_event_sequence(data_df, first_date, last_date):
    """
    Fetches all events in data_df between first_date and last_date.
    data_df must contain columns 'date' and 'events'
    
    Args:
        data_df: pd.DataFrame input dataframe
        first_date: datetime of first date to be included in the event sequence
        last_data: datetime of last date to be included in the event sequence
        
    Returns:
        List of event sequence. First element is prediction time, then there is one element for each event.
        Each event element in the list can contain some other structure (list).
    """
    data_df['timestamp'] = pd.to_datetime(data_df['timestamp'], unit='s')
    data_df['timestamp'] = data_df['timestamp'].dt.to_pydatetime()
    
    filtered_df = data_df[(data_df['timestamp'] >= first_date) & (data_df['timestamp'] <= last_date)]
        
    event_sequence = []
    event_sequence.append(last_date)

    for _, row in filtered_df.iterrows():
        event = row['features']
        event_time = row['timestamp']
        event_class = row['PQ class']
        # event_class = 'simple_event'
        event_sequence.append([event_time, event_class, event])
        
    return event_sequence
        
    
def get_label(data_df, timestamp):
    """
    Returns the class label for the event at date
    """
    timestamp = pd.to_datetime(timestamp)
    
    df = data_df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp']).dt.floor('S')
    
    row = data_df[data_df['timestamp'] == timestamp]
    
    if not row.empty:
        return row['PQ class'].iloc[0]
    else:
        return 'Timestamp not found in dataframe'

def wide_to_long_df(df):

    long_format_data = []

    for idx, row in df.iterrows():
        
        for col in range(1,len(row)):
            event = row[col]
            
            if type(event) == float: # skip nan values
                continue
            
            else:
                # each event is of the form [timestamp, label, feature_vector]
                # print(event)
                timestamp, label, feature_vector = event

            event_dict = {
                "sequence_id": idx,
                "event_idx": col-1,
                "timestamp": timestamp,
                "event_label": label,
                "feature_vector": feature_vector
            }

            long_format_data.append(event_dict)

    long_df = pd.DataFrame(long_format_data)
    
    return long_df

    

# Eneryield

### Classification data

In [98]:
# Path to csv file containing event data
raw_cls_data_path = '/home/filip-marcus/data/db_access/data/J03/classification/2025_january_classification_model.csv'

# Read csv as pd.DataFrame
cls_data_df = pd.read_csv(raw_cls_data_path)
cls_data_df.head()

Unnamed: 0,sample_id,PQ class,Root Cause 1,Root Cause 2,timestamp
0,1,interruption,unknown,unknown,1601986000.0
1,2,VD,unknown,unknown,1601986000.0
2,3,unbalance_u,unknown,unknown,1601986000.0
3,4,unbalance_u,unknown,unknown,1601986000.0
4,5,unbalance_u,unknown,unknown,1601988000.0


In [99]:
no_rows_to_keep = cls_data_df.shape[0] # all rows

# keep only PQ class and timestamp
df_pqclass = cls_data_df.iloc[:no_rows_to_keep, [1,4]]

# create list of all unique classes in the selected data
unique_classes = df_pqclass['PQ class'].unique().tolist()   # filter out some classes? such as interruption/ongoing_interruption
print('Classes in this selection: ', unique_classes)

Classes in this selection:  ['interruption', 'VD', 'unbalance_u', 'transient', 'interruption end', 'normal', 'current_deviation', 'unbalance_i', 'ongoing interruption', 'harmonics_i', 'harmonics_u']


### Scalars

In [100]:
raw_scalars_data_path = '/home/filip-marcus/data/db_access/data/J03/scalars.csv'
df_scalars = pd.read_csv(raw_scalars_data_path)

# Keep some number of scalars
no_scalars_to_keep = 50
# df_scalars = df_scalars.iloc[:no_rows_to_keep, :no_scalars_to_keep + 4]

# Keep some subset of scalars
manual_small_collection = [
    "sample_id",
    "timestamp",
    "rms_I0_max",
    "rms_I1_max",
    "rms_I2_max",
    "rms_I3_max",
    "rms_U0_max",
    "rms_U1_max",
    "rms_U2_max",
    "rms_U3_max",
    "rms_I0_min",
    "rms_I1_min",
    "rms_I2_min",
    "rms_I3_min",
    "rms_U0_min",
    "rms_U1_min",
    "rms_U2_min",
    "rms_U3_min",
    "fft_U0_thdf",
    "fft_U1_thdf",
    "fft_U2_thdf",
    "fft_U3_thdf",
    "fft_I0_thdf",
    "fft_I1_thdf",
    "fft_I2_thdf",
    "fft_I3_thdf",
    "impedance_integral_Phase_1_definite_integral",
    "impedance_integral_Phase_2_definite_integral",
    "impedance_integral_Phase_3_definite_integral",
]


manual_medium_collection = [
    "sample_id",
    "timestamp",
    "I_initial_amplitude",
    "U_initial_amplitude",
    "rms_I0_max",
    "rms_I1_max",
    "rms_I2_max",
    "rms_I3_max",
    "rms_U0_max",
    "rms_U1_max",
    "rms_U2_max",
    "rms_U3_max",
    "rms_I0_min",
    "rms_I1_min",
    "rms_I2_min",
    "rms_I3_min",
    "rms_U0_min",
    "rms_U1_min",
    "rms_U2_min",
    "rms_U3_min",
    "rms_integral_I0_definite_integral",
    "rms_integral_I1_definite_integral",
    "rms_integral_I2_definite_integral",
    "rms_integral_I3_definite_integral",
    "rms_integral_U0_definite_integral",
    "rms_integral_U1_definite_integral",
    "rms_integral_U2_definite_integral",
    "rms_integral_U3_definite_integral",
    "apparent_power_integral_Phase_0_definite_integral",
    "apparent_power_integral_Phase_1_definite_integral",
    "apparent_power_integral_Phase_2_definite_integral",
    "apparent_power_integral_Phase_3_definite_integral",
    "impedance_integral_Phase_1_definite_integral",
    "impedance_integral_Phase_2_definite_integral",
    "impedance_integral_Phase_3_definite_integral",
    "impedance_Phase_1_max",
    "impedance_Phase_2_max",
    "impedance_Phase_3_max",
    "impedance_Phase_1_min",
    "impedance_Phase_2_min",
    "impedance_Phase_3_min",
    "resistance_integral_Phase_1_definite_integral",
    "resistance_integral_Phase_2_definite_integral",
    "resistance_integral_Phase_3_definite_integral",
    "reactance_integral_Phase_1_definite_integral",
    "reactance_integral_Phase_2_definite_integral",
    "reactance_integral_Phase_3_definite_integral",
    "fft_U0_thdf",
    "fft_U1_thdf",
    "fft_U2_thdf",
    "fft_U3_thdf",
    "fft_I0_thdf",
    "fft_I1_thdf",
    "fft_I2_thdf",
    "fft_I3_thdf",
    "spm_integral_I_definite_integral",
    "spm_integral_U_definite_integral",
]


manual_large_collection = [
    "sample_id",
    "timestamp",
    "I_initial_amplitude",
    "U_initial_amplitude",
    "rms_integral_I0_definite_integral",
    "rms_integral_I1_definite_integral",
    "rms_integral_I2_definite_integral",
    "rms_integral_I3_definite_integral",
    "rms_integral_U0_definite_integral",
    "rms_integral_U1_definite_integral",
    "rms_integral_U2_definite_integral",
    "rms_integral_U3_definite_integral",
    "phase_shift_I1_range",
    "phase_shift_I2_range",
    "phase_shift_I3_range",
    "phase_shift_U1_range",
    "phase_shift_U2_range",
    "phase_shift_U3_range",
    "relative_phase_I1_I2_std",
    "relative_phase_I1_I3_std",
    "relative_phase_I2_I3_std",
    "relative_phase_U1_U2_std",
    "relative_phase_U1_U3_std",
    "relative_phase_U2_U3_std",
    "rms_derivative_I0_max",
    "rms_derivative_I1_max",
    "rms_derivative_I2_max",
    "rms_derivative_I3_max",
    "rms_derivative_U0_max",
    "rms_derivative_U1_max",
    "rms_derivative_U2_max",
    "rms_derivative_U3_max",
    "rms_derivative_I0_min",
    "rms_derivative_I1_min",
    "rms_derivative_I2_min",
    "rms_derivative_I3_min",
    "rms_derivative_U0_min",
    "rms_derivative_U1_min",
    "rms_derivative_U2_min",
    "rms_derivative_U3_min",
    "apparent_power_integral_Phase_0_definite_integral",
    "apparent_power_integral_Phase_1_definite_integral",
    "apparent_power_integral_Phase_2_definite_integral",
    "apparent_power_integral_Phase_3_definite_integral",
    "apparent_power_Phase_0_max",
    "apparent_power_Phase_1_max",
    "apparent_power_Phase_2_max",
    "apparent_power_Phase_3_max",
    "apparent_power_Phase_0_min",
    "apparent_power_Phase_1_min",
    "apparent_power_Phase_2_min",
    "apparent_power_Phase_3_min",
    "impedance_integral_Phase_1_definite_integral",
    "impedance_integral_Phase_2_definite_integral",
    "impedance_integral_Phase_3_definite_integral",
    "impedance_Phase_1_max",
    "impedance_Phase_2_max",
    "impedance_Phase_3_max",
    "impedance_Phase_1_min",
    "impedance_Phase_2_min",
    "impedance_Phase_3_min",
    "resistance_integral_Phase_1_definite_integral",
    "resistance_integral_Phase_2_definite_integral",
    "resistance_integral_Phase_3_definite_integral",
    "reactance_integral_Phase_1_definite_integral",
    "reactance_integral_Phase_2_definite_integral",
    "reactance_integral_Phase_3_definite_integral",
    "measurements_transform_subtract_previous_cycle_I1_rmse",
    "measurements_transform_subtract_previous_cycle_I2_rmse",
    "measurements_transform_subtract_previous_cycle_I3_rmse",
    "measurements_transform_subtract_previous_cycle_U1_rmse",
    "measurements_transform_subtract_previous_cycle_U2_rmse",
    "measurements_transform_subtract_previous_cycle_U3_rmse",
    "fft_I1_harmonic_1",
    "fft_I2_harmonic_1",
    "fft_I3_harmonic_1",
    "fft_U1_harmonic_1",
    "fft_U2_harmonic_1",
    "fft_U3_harmonic_1",
    "fft_I1_harmonic_2",
    "fft_I2_harmonic_2",
    "fft_I3_harmonic_2",
    "fft_U1_harmonic_2",
    "fft_U2_harmonic_2",
    "fft_U3_harmonic_2",
    "fft_I1_harmonic_3",
    "fft_I2_harmonic_3",
    "fft_I3_harmonic_3",
    "fft_U1_harmonic_3",
    "fft_U2_harmonic_3",
    "fft_U3_harmonic_3",
    "fft_I1_harmonic_4",
    "fft_I2_harmonic_4",
    "fft_I3_harmonic_4",
    "fft_U1_harmonic_4",
    "fft_U2_harmonic_4",
    "fft_U3_harmonic_4",
    "fft_I1_harmonic_5",
    "fft_I2_harmonic_5",
    "fft_I3_harmonic_5",
    "fft_U1_harmonic_5",
    "fft_U2_harmonic_5",
    "fft_U3_harmonic_5",
    "fft_I1_harmonic_6",
    "fft_I2_harmonic_6",
    "fft_I3_harmonic_6",
    "fft_U1_harmonic_6",
    "fft_U2_harmonic_6",
    "fft_U3_harmonic_6",
    "fft_I1_harmonic_7",
    "fft_I2_harmonic_7",
    "fft_I3_harmonic_7",
    "fft_U1_harmonic_7",
    "fft_U2_harmonic_7",
    "fft_U3_harmonic_7",
    "fft_I1_harmonic_8",
    "fft_I2_harmonic_8",
    "fft_I3_harmonic_8",
    "fft_U1_harmonic_8",
    "fft_U2_harmonic_8",
    "fft_U3_harmonic_8",
    "fft_I1_harmonic_9",
    "fft_I2_harmonic_9",
    "fft_I3_harmonic_9",
    "fft_U1_harmonic_9",
    "fft_U2_harmonic_9",
    "fft_U3_harmonic_9",
    "fft_I1_harmonic_10",
    "fft_I2_harmonic_10",
    "fft_I3_harmonic_10",
    "fft_U1_harmonic_10",
    "fft_U2_harmonic_10",
    "fft_U3_harmonic_10",
    "spm_integral_I_definite_integral",
    "spm_integral_U_definite_integral",
]


df_scalars = df_scalars[manual_small_collection]
# df_scalars = df_scalars.iloc[:no_rows_to_keep, :5]
df_scalars.shape

(78016, 29)

### Merge scalars and class data

In [101]:
# merge the dataframes on timestamp
data_df = pd.merge(df_scalars, df_pqclass, on = 'timestamp', how = 'inner')

# add all features to a list, and place this list in a column called features
feature_cols = data_df.columns[4:-1]
data_df['features'] = data_df[feature_cols].apply(lambda row: row.tolist(), axis=1)


# remove feature columns after they are combined to list
data_df = data_df.drop(columns = feature_cols)
data_df = data_df.drop(data_df.columns[[0, 2, 3]], axis=1)

print('data_df shape: ', data_df.shape)
# data_df['is_interruption'] = (data_df['PQ class'] == 'interruption').astype(int)
data_df.head()

data_df shape:  (78026, 3)


Unnamed: 0,timestamp,PQ class,features
0,1601986000.0,interruption,"[120.9818644784, 106.6533229807, 0.0450764024,..."
1,1601986000.0,VD,"[25.4995054006, 66.254183138, 0.060216956, 0.7..."
2,1601986000.0,unbalance_u,"[0.736997181, 0.6437838372, 0.0259733392, 0.68..."
3,1601986000.0,unbalance_u,"[0.7440775213, 0.6484118803, 0.026043639, 0.68..."
4,1601988000.0,unbalance_u,"[1.4184988322, 1.5421083585, 0.0270025499, 0.6..."


### Function to generate training exampled with a specific lookback window

In [102]:
def generate_training_examples(data_df: pd.DataFrame, max_num_train_ex: int, look_back: int) -> pd.DataFrame:
    """
    Creates a pd.DataFrame containing training examples for the imputation proxy task.
    This task masks one event randomly in each event sequence and sets the target as class or timestamp of the masked event.
    For this proxy task, at least two events are needed in each sequence.
    Any event sequences with less than this will not be included in the output dataframe.
    
    Assumes data_df has columns 'timestamp' and 'events'.
    
    Args:
        data_df: pd.DataFrame containing input data
        num_train_ex: int specifying how many training examples to include in the output dataframe
        look_back: int specifying the look-back window in number of days each event sequence is
    
    
    Returns:
        A DataFrame where each column is an event sequence. The first row is id of the last date in each event sequence.
       
    """
    
   
        
    # convert all timestamps from unix to datetime
    data_df['timestamp'] = pd.to_datetime(data_df['timestamp'].apply(unix_to_datetime))
    
    first_date = data_df['timestamp'].iloc[0]
    last_date = data_df['timestamp'].iloc[-1]
    difference = last_date - first_date
    
    print(f'Input data contains {data_df.shape[0]} entries ranging from {first_date} to {last_date}')
    
    # get event sequences between each date in pred_times, add them as column in a new dataframe
    train_df = pd.DataFrame()
    k = 0
    columns_to_add = []
    
    # loop through all days in the input data
    for i in range(min(difference.days, max_num_train_ex)):
        last_date_i = last_date - timedelta(days=i)
        last_date_i = last_date_i.replace(hour=0, minute=0, second=0, microsecond=0) 
        first_date = last_date_i - timedelta(days=look_back)
        
        seq = get_event_sequence(data_df, first_date, last_date_i)
        
        if len(seq) < 2:  # need at least two events per sequence for this proxy task
            continue
        
        # sequence id
        column_id = 'Seq id: ' + str(k)
        k += 1
        
        seq_series = pd.Series(seq)
        seq_series.name = column_id
        if i==0:
            columns_to_add.append(seq_series)
        else:
            last_col = columns_to_add[-1]
            
            if not np.array_equal(last_col.values, seq_series.values):
                columns_to_add.append(seq_series)
    print(columns_to_add)
    train_df = pd.concat(columns_to_add, axis=1)
    train_df = train_df.T
    train_df.reset_index(drop=True, inplace=True)
    
    print(f'Created DataFrame with training examples with {train_df.shape[0]} event sequences')
    
    return train_df

# Lookback window is 7 days here
train_df = generate_training_examples(data_df, max_num_train_ex=100000, look_back=7)
train_df.head() #now each row is an event sequence and each column in an event (+ target as first col)

Input data contains 78026 entries ranging from 2020-10-06 12:11:01 to 2025-02-03 12:13:46
[0                                   2025-02-03 00:00:00
1     [2025-01-27 04:30:28, normal, [0.6988338375, 0...
2     [2025-01-27 07:31:08, normal, [0.6869380715, 0...
3     [2025-01-27 13:14:45, normal, [0.6998509211, 0...
4     [2025-01-27 13:17:24, normal, [0.7027293463, 0...
                            ...                        
70    [2025-02-02 18:52:27, normal, [0.6875818046, 0...
71    [2025-02-02 18:54:26, normal, [0.6905475264, 0...
72    [2025-02-02 18:57:11, normal, [0.7046805095, 0...
73    [2025-02-02 19:23:07, normal, [0.696225337, 0....
74    [2025-02-02 19:26:46, normal, [0.6858566366, 0...
Name: Seq id: 0, Length: 75, dtype: object, 0                                   2025-02-02 00:00:00
1     [2025-01-27 04:30:28, normal, [0.6988338375, 0...
2     [2025-01-27 07:31:08, normal, [0.6869380715, 0...
3     [2025-01-27 13:14:45, normal, [0.6998509211, 0...
4     [2025-01-27 13:17:2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2710,2711,2712,2713,2714,2715,2716,2717,2718,2719
0,2025-02-03,"[2025-01-27 04:30:28, normal, [0.6988338375, 0...","[2025-01-27 07:31:08, normal, [0.6869380715, 0...","[2025-01-27 13:14:45, normal, [0.6998509211, 0...","[2025-01-27 13:17:24, normal, [0.7027293463, 0...","[2025-01-27 14:30:41, normal, [0.6971136596, 0...","[2025-01-27 14:56:16, normal, [0.6822369089, 0...","[2025-01-27 16:24:26, normal, [0.6843556841, 0...","[2025-01-28 15:17:59, normal, [0.7181263366, 0...","[2025-01-29 15:16:43, normal, [0.7032370937, 0...",...,,,,,,,,,,
1,2025-02-02,"[2025-01-27 04:30:28, normal, [0.6988338375, 0...","[2025-01-27 07:31:08, normal, [0.6869380715, 0...","[2025-01-27 13:14:45, normal, [0.6998509211, 0...","[2025-01-27 13:17:24, normal, [0.7027293463, 0...","[2025-01-27 14:30:41, normal, [0.6971136596, 0...","[2025-01-27 14:56:16, normal, [0.6822369089, 0...","[2025-01-27 16:24:26, normal, [0.6843556841, 0...","[2025-01-28 15:17:59, normal, [0.7181263366, 0...","[2025-01-29 15:16:43, normal, [0.7032370937, 0...",...,,,,,,,,,,
2,2025-02-01,"[2025-01-27 04:30:28, normal, [0.6988338375, 0...","[2025-01-27 07:31:08, normal, [0.6869380715, 0...","[2025-01-27 13:14:45, normal, [0.6998509211, 0...","[2025-01-27 13:17:24, normal, [0.7027293463, 0...","[2025-01-27 14:30:41, normal, [0.6971136596, 0...","[2025-01-27 14:56:16, normal, [0.6822369089, 0...","[2025-01-27 16:24:26, normal, [0.6843556841, 0...","[2025-01-28 15:17:59, normal, [0.7181263366, 0...","[2025-01-29 15:16:43, normal, [0.7032370937, 0...",...,,,,,,,,,,
3,2025-01-31,"[2025-01-24 00:11:38, normal, [0.6938916792, 0...","[2025-01-24 01:03:06, normal, [0.6813441886, 0...","[2025-01-24 02:11:34, normal, [0.693210788, 0....","[2025-01-24 02:15:08, normal, [0.691046176, 0....","[2025-01-24 02:25:53, normal, [0.7035614267, 0...","[2025-01-24 02:28:53, normal, [0.6988692111, 0...","[2025-01-24 02:31:54, normal, [0.6887241022, 0...","[2025-01-24 02:33:40, normal, [0.6897274723, 0...","[2025-01-24 02:38:08, normal, [0.6955427001, 0...",...,,,,,,,,,,
4,2025-01-30,"[2025-01-23 00:18:17, normal, [0.6777778802, 0...","[2025-01-23 00:28:20, normal, [0.6846104796, 0...","[2025-01-23 00:42:54, normal, [0.6913555797, 0...","[2025-01-23 00:54:23, normal, [0.6935539427, 0...","[2025-01-23 01:07:45, normal, [0.6916797126, 0...","[2025-01-23 01:10:29, normal, [0.6843347085, 0...","[2025-01-23 01:32:26, normal, [0.6901243595, 0...","[2025-01-23 01:58:30, normal, [0.7049150265, 0...","[2025-01-23 01:59:23, normal, [0.6937994655, 0...",...,,,,,,,,,,


In [103]:
# Convert train_df from wide format to long format
train_df_long = wide_to_long_df(train_df)
train_df_long.head()

Unnamed: 0,sequence_id,event_idx,timestamp,event_label,feature_vector
0,0,0,2025-01-27 04:30:28,normal,"[0.6988338375, 0.7263958335, 0.0017976273, 0.7..."
1,0,1,2025-01-27 07:31:08,normal,"[0.6869380715, 0.7226359923, 0.0017540426, 0.7..."
2,0,2,2025-01-27 13:14:45,normal,"[0.6998509211, 0.7241181198, 0.0018230867, 0.7..."
3,0,3,2025-01-27 13:17:24,normal,"[0.7027293463, 0.7175021612, 0.0017495459, 0.7..."
4,0,4,2025-01-27 14:30:41,normal,"[0.6971136596, 0.7120790765, 0.0017735374, 0.7..."


In [104]:
# Expand the feature_vector list into separate columns
train_df_expanded = train_df_long['feature_vector'].apply(pd.Series) # byt ifall du inte vill ha long, ändra också feature_vector till features

# Rename the columns
train_df_expanded.columns = [f'feature_{i}' for i in range(train_df_expanded.shape[1])]

# Now, concatenate the expanded columns with the original DataFrame (without 'feature_vector')
train_df_final = pd.concat([train_df_long.drop(columns=['feature_vector']), train_df_expanded], axis=1) # ändra feature / feature_vector

train_df_final.head()

Unnamed: 0,sequence_id,event_idx,timestamp,event_label,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,...,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24
0,0,0,2025-01-27 04:30:28,normal,0.698834,0.726396,0.001798,0.708996,0.706273,0.708684,...,0.00604,0.006761,0.006419,0.311337,0.018351,0.015774,0.014797,0.001493,-0.001883,0.003464
1,0,1,2025-01-27 07:31:08,normal,0.686938,0.722636,0.001754,0.708891,0.706739,0.708441,...,0.007994,0.007495,0.00792,0.357,0.024826,0.022119,0.023466,0.002538,0.004522,0.002937
2,0,2,2025-01-27 13:14:45,normal,0.699851,0.724118,0.001823,0.708797,0.70595,0.707954,...,0.007637,0.008774,0.008371,0.327839,0.030375,0.035786,0.036372,-2.3e-05,0.001657,-0.002386
3,0,3,2025-01-27 13:17:24,normal,0.702729,0.717502,0.00175,0.70901,0.705329,0.707827,...,0.007437,0.008743,0.008619,0.326526,0.031908,0.036591,0.037561,-0.002049,-0.003815,-0.000891
4,0,4,2025-01-27 14:30:41,normal,0.697114,0.712079,0.001774,0.708862,0.705461,0.708092,...,0.007695,0.009059,0.009055,0.365932,0.037331,0.045981,0.04653,-0.000109,0.001299,-0.000575


In [203]:
# number of unique sequences in the eneryield data
num_subjects_eneryield = train_df_final['sequence_id'].nunique()

In [204]:
# split data into sepreate dataframes for each event_type
train_df_VD = train_df_final[train_df_final['event_label']=='VD']
train_df_interruption = train_df_final[train_df_final['event_label']=='interruption']
train_df_unbalance_u = train_df_final[train_df_final['event_label']=='unbalance_u']
train_df_unbalance_i = train_df_final[train_df_final['event_label']=='unbalance_i']
train_df_current_deviation = train_df_final[train_df_final['event_label']=='current_deviation']
train_df_harmonics_u = train_df_final[train_df_final['event_label']=='harmonics_u']
train_df_harmonics_i = train_df_final[train_df_final['event_label']=='harmonics_i']
train_df_transient = train_df_final[train_df_final['event_label']=='transient']
train_df_normal = train_df_final[train_df_final['event_label']=='normal']

In [205]:
# save dataframes as .csv files
save_dir='/home/filip-marcus/ESGPT_test/data/eneryield_ecom_mimic_pred_ecom_cosm/pretrain/'
train_df_VD.to_csv(save_dir+'eneryield_VD.csv', index=False)
train_df_interruption.to_csv(save_dir+'eneryield_interruption.csv', index=False)
train_df_unbalance_u.to_csv(save_dir+'eneryield_unbalance_u.csv', index=False)
train_df_unbalance_i.to_csv(save_dir+'eneryield_unbalance_i.csv', index=False)
train_df_current_deviation.to_csv(save_dir+'eneryield_current_deviation.csv', index=False)
train_df_harmonics_u.to_csv(save_dir+'eneryield_harmonics_u.csv', index=False)
train_df_harmonics_i.to_csv(save_dir+'eneryield_harmonics_i.csv', index=False)
train_df_transient.to_csv(save_dir+'eneryield_transient.csv', index=False)
train_df_normal.to_csv(save_dir+'eneryield_normal.csv', index=False)


### Creates a dataframe for subject if you are inly supposed to train on eneryield data

In [108]:
subjects_df = pd.DataFrame({'sequence_id': (train_df_final['sequence_id'].unique())})
subjects_df['sequence_id'] = subjects_df['sequence_id']+1
subjects_df['dummy_static'] = 1
subjects_df

Unnamed: 0,sequence_id,dummy_static
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
1415,1416,1
1416,1417,1
1417,1418,1
1418,1419,1


# Ecom Electronic

### Load data

In [242]:
# Load eCommerce data

ecom_df = pd.read_csv('/home/filip-marcus/data/kaggle_ecommers_electronic/events.csv')

ecom_df.shape

ecom_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-09-24 11:57:06 UTC,view,1996170,2144415922528452715,electronics.telephone,,31.9,1515915625519388267,LJuJVLEjPT
1,2020-09-24 11:57:26 UTC,view,139905,2144415926932472027,computers.components.cooler,zalman,17.16,1515915625519380411,tdicluNnRY
2,2020-09-24 11:57:27 UTC,view,215454,2144415927158964449,,,9.81,1515915625513238515,4TMArHtXQy
3,2020-09-24 11:57:33 UTC,view,635807,2144415923107266682,computers.peripherals.printer,pantum,113.81,1515915625519014356,aGFYrNgC08
4,2020-09-24 11:57:36 UTC,view,3658723,2144415921169498184,,cameronsino,15.87,1515915625510743344,aa4mmk0kwQ


In [243]:
# Sort events by user_id
ecom_df.sort_values(by="user_id")

# Create new user_ids ranging from 0 to len (df["user_id"])
unique_users = {old_id: new_id for new_id, old_id in enumerate(ecom_df["user_id"].unique(), start=1)}

# Replace user_id with new unique IDs
ecom_df["user_id"] = ecom_df["user_id"].map(unique_users)-1
print(ecom_df.head())

                event_time event_type  product_id          category_id  \
0  2020-09-24 11:57:06 UTC       view     1996170  2144415922528452715   
1  2020-09-24 11:57:26 UTC       view      139905  2144415926932472027   
2  2020-09-24 11:57:27 UTC       view      215454  2144415927158964449   
3  2020-09-24 11:57:33 UTC       view      635807  2144415923107266682   
4  2020-09-24 11:57:36 UTC       view     3658723  2144415921169498184   

                   category_code        brand   price  user_id user_session  
0          electronics.telephone          NaN   31.90        0   LJuJVLEjPT  
1    computers.components.cooler       zalman   17.16        1   tdicluNnRY  
2                            NaN          NaN    9.81        2   4TMArHtXQy  
3  computers.peripherals.printer       pantum  113.81        3   aGFYrNgC08  
4                            NaN  cameronsino   15.87        4   aa4mmk0kwQ  


In [244]:
# drop catecorigal features
ecom_df.drop(columns=['user_session'], inplace=True)
ecom_df.drop(columns=['product_id', 'category_id','brand','category_code'],inplace=True)
# ecom_df['brand'].fillna('other',inplace=True)
ecom_df.head()



Unnamed: 0,event_time,event_type,price,user_id
0,2020-09-24 11:57:06 UTC,view,31.9,0
1,2020-09-24 11:57:26 UTC,view,17.16,1
2,2020-09-24 11:57:27 UTC,view,9.81,2
3,2020-09-24 11:57:33 UTC,view,113.81,3
4,2020-09-24 11:57:36 UTC,view,15.87,4


In [245]:
ecom_df['user_id'] = ecom_df['user_id']+1
ecom_df.rename(columns={'user_id': 'sequence_id'}, inplace=True)

In [246]:
ecom_df['sequence_id'] = ecom_df['sequence_id']+ num_subjects_eneryield
# Remove "UTC" from event_time
ecom_df['event_time'] = pd.to_datetime(ecom_df['event_time'].str.replace(" UTC", "", regex=False))
ecom_df.head()

Unnamed: 0,event_time,event_type,price,sequence_id
0,2020-09-24 11:57:06,view,31.9,1421
1,2020-09-24 11:57:26,view,17.16,1422
2,2020-09-24 11:57:27,view,9.81,1423
3,2020-09-24 11:57:33,view,113.81,1424
4,2020-09-24 11:57:36,view,15.87,1425


In [247]:
# df = ecom_df
# df = df[[df.columns[-1]] + list(df.columns[:-1])]
unique_classes = ecom_df['event_type'].unique()
#print(unique_classes)
df = ecom_df

df['event_label'] = df['event_type']
df.drop(columns=['event_type'], inplace = True)
pd.set_option("display.max_rows",10)

df.head()

Unnamed: 0,event_time,price,sequence_id,event_label
0,2020-09-24 11:57:06,31.9,1421,view
1,2020-09-24 11:57:26,17.16,1422,view
2,2020-09-24 11:57:27,9.81,1423,view
3,2020-09-24 11:57:33,113.81,1424,view
4,2020-09-24 11:57:36,15.87,1425,view


In [248]:
# number of unique subjects in both eneryield and ecom electronic data
num_subjects_eneryield_ecom = num_subjects_eneryield + df['sequence_id'].nunique()
df.head()

Unnamed: 0,event_time,price,sequence_id,event_label
0,2020-09-24 11:57:06,31.9,1421,view
1,2020-09-24 11:57:26,17.16,1422,view
2,2020-09-24 11:57:27,9.81,1423,view
3,2020-09-24 11:57:33,113.81,1424,view
4,2020-09-24 11:57:36,15.87,1425,view


In [249]:
# split the dataframe into a separate dataframe for each event_label
dfs = {event: df[df['event_label'] == event] for event in unique_classes}
dfs['view'].head()

save_dir = '/home/filip-marcus/ESGPT_test/data/eneryield_ecom_mimic_pred_ecom_cosm/pretrain'
for event, df_event in dfs.items():
    df_event.to_csv(f"{save_dir}/df_ecom_{event}.csv",index=False)

### Creates a dataframe for subject if you are inly supposed to train on eneryield and ecom electronic data

In [117]:
subjects = pd.Series(range(num_subjects_eneryield_ecom))  # change df to ecom_df if you want full dataset
subject_df = pd.DataFrame()
subject_df['sequence_id']=subjects +1
subject_df['dummy_static'] = 1
subject_df.to_csv('/home/filip-marcus/ESGPT_test/data/eneryield_ecom/pretrain/subjects.csv')
subject_df.tail()

Unnamed: 0,sequence_id,dummy_static
408698,408699,1
408699,408700,1
408700,408701,1
408701,408702,1
408702,408703,1


# MIMIC-IV

### Load data

In [220]:
raw_data_dir='/home/filip-marcus/ESGPT_test/data/'
mimic_edstays_df = pd.read_csv(raw_data_dir + "/edstays" + "_mimic_demo.csv")
mimic_pyxis_df = pd.read_csv(raw_data_dir + "/pyxis" + "_mimic_demo.csv")
mimic_vitalsign_df = pd.read_csv(raw_data_dir + "/vitalsign" + "_mimic_demo.csv")
mimic_medrecon_df = pd.read_csv(raw_data_dir + "/medrecon" + "_mimic_demo.csv")

In [221]:
mimic_edstays_df['event_label'] = 'edstay'
mimic_medrecon_df['event_label'] = 'medrecon'
mimic_pyxis_df['event_label'] = 'pyxis'
mimic_vitalsign_df['event_label'] = 'vitalsign'


In [222]:
mimic_df = pd.concat([mimic_pyxis_df,mimic_edstays_df,mimic_medrecon_df,mimic_vitalsign_df],axis=0,ignore_index=True)
# mimic_df.drop(columns=['Unnamed: 0'],inplace=True)
unique_users = {old_id: new_id for new_id, old_id in enumerate(mimic_df["subject_id"].unique(), start=1)}
mimic_df["subject_id"] = mimic_df["subject_id"].map(unique_users)-1
mimic_df["subject_id"] = mimic_df["subject_id"] + num_subjects_eneryield_ecom +1
print(mimic_df.columns)
mimic_df.head()

Index(['Unnamed: 0', 'subject_id', 'stay_id', 'charttime', 'med_rn', 'name',
       'gsn_rn', 'gsn', 'event_label', 'hadm_id', 'intime', 'outtime',
       'gender', 'race', 'arrival_transport', 'disposition', 'ndc', 'etc_rn',
       'etccode', 'etcdescription', 'temperature', 'heartrate', 'resprate',
       'o2sat', 'sbp', 'dbp', 'rhythm', 'pain'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,subject_id,stay_id,charttime,med_rn,name,gsn_rn,gsn,event_label,hadm_id,...,etccode,etcdescription,temperature,heartrate,resprate,o2sat,sbp,dbp,rhythm,pain
0,0,408704,36185653,2175-04-05 06:39:00,4.0,Aspirin,1.0,4380.0,pyxis,,...,,,,,,,,,,
1,1,408705,37401157,2145-10-19 23:53:00,2.0,Aspirin,1.0,4380.0,pyxis,,...,,,,,,,,,,
2,2,408706,30279522,2149-09-17 11:51:00,2.0,Aspirin,1.0,4380.0,pyxis,,...,,,,,,,,,,
3,3,408705,30836087,2145-07-04 10:27:00,10.0,Aspirin,1.0,4380.0,pyxis,,...,,,,,,,,,,
4,4,408707,30701739,2143-03-22 21:48:00,2.0,Aspirin,1.0,4376.0,pyxis,,...,,,,,,,,,,


In [223]:
dfs = {event: mimic_df[mimic_df['event_label'] == event] for event in mimic_df['event_label'].unique()}

# Vital sign
vitalsign_df = dfs['vitalsign'].loc[:,mimic_vitalsign_df.columns]
vitalsign_df.drop(columns=['Unnamed: 0','stay_id','rhythm'],inplace=True)
vitalsign_df.rename(columns={'subject_id':'sequence_id'},inplace=True)
print(vitalsign_df.head())

# Medrecon
medrecon_df = dfs['medrecon'].loc[:,mimic_medrecon_df.columns]
medrecon_df.drop(columns=['Unnamed: 0','stay_id','name','etccode','etcdescription'],inplace=True)
medrecon_df.rename(columns={'subject_id':'sequence_id'},inplace=True)
print(medrecon_df.head())

# Pyxis
pyxis_df = dfs['pyxis'].loc[:,mimic_pyxis_df.columns]
pyxis_df.drop(columns=['Unnamed: 0','stay_id','name'],inplace=True)
pyxis_df.rename(columns={'subject_id':'sequence_id'},inplace=True)
print(pyxis_df.head())

# Edstays
edstays_df = dfs['edstay'].loc[:,mimic_edstays_df.columns]
edstays_df.drop(columns=['Unnamed: 0','stay_id','hadm_id','race','arrival_transport'],inplace=True)
edstays_df.rename(columns={'subject_id':'sequence_id'},inplace=True)
print(edstays_df.head())

mimic_dfs = {
            'vitalsign':vitalsign_df,
            'medrecon':medrecon_df,
            'pyxis':pyxis_df,
            'edstay':edstays_df }

      sequence_id            charttime  temperature  heartrate  resprate  \
4068       408715  2125-03-19 13:22:00          NaN      124.0      24.0   
4069       408715  2125-03-19 18:28:00         98.9      106.0      18.0   
4070       408715  2125-03-19 13:07:00          NaN      128.0      18.0   
4071       408715  2125-03-19 16:23:00         99.8      115.0      22.0   
4072       408715  2125-03-19 13:24:00          NaN      125.0       NaN   

      o2sat    sbp   dbp pain event_label  
4068  100.0   93.0  65.0  NaN   vitalsign  
4069  100.0  115.0  70.0    5   vitalsign  
4070  100.0  132.0  96.0  NaN   vitalsign  
4071   97.0  114.0  45.0    0   vitalsign  
4072    NaN    NaN   NaN  NaN   vitalsign  
      sequence_id            charttime     gsn           ndc  etc_rn  \
1304       408737  2171-11-07 20:38:00     0.0  0.000000e+00     1.0   
1305       408747  2132-05-17 21:17:00     0.0  0.000000e+00     1.0   
1306       408711  2160-07-15 19:43:00     0.0  0.000000e+00   

In [224]:
save_dir = '/home/filip-marcus/ESGPT_test/data/eneryield_ecom_mimic_pred_ecom_cosm/pretrain'
for event, df_event in mimic_dfs.items():
    print(event, df_event)
    df_event.to_csv(f"{save_dir}/df_mimic_{event}.csv",index=False)

vitalsign       sequence_id            charttime  temperature  heartrate  resprate  \
4068       408715  2125-03-19 13:22:00          NaN      124.0      24.0   
4069       408715  2125-03-19 18:28:00         98.9      106.0      18.0   
4070       408715  2125-03-19 13:07:00          NaN      128.0      18.0   
4071       408715  2125-03-19 16:23:00         99.8      115.0      22.0   
4072       408715  2125-03-19 13:24:00          NaN      125.0       NaN   
...           ...                  ...          ...        ...       ...   
5101       408711  2156-04-30 18:47:00          NaN      100.0      30.0   
5102       408711  2156-04-30 20:14:00          NaN       90.0      26.0   
5103       408711  2156-04-30 19:47:00          NaN       96.0      29.0   
5104       408711  2155-12-04 21:31:00         97.4       82.0      16.0   
5105       408711  2155-12-04 21:36:00         97.4       82.0      18.0   

      o2sat    sbp   dbp pain event_label  
4068  100.0   93.0  65.0  NaN   v

In [225]:
num_subjects_eneryield_ecom_mimic = num_subjects_eneryield_ecom + mimic_df['subject_id'].nunique()

# Predictive maintenance

In [226]:
raw_data_dir_predicitve_maintenance = "/home/filip-marcus/data/predictive_maintenance"
pred_maintenance_df = pd.read_csv(raw_data_dir_predicitve_maintenance + "/predictive_maintenance_dataset.csv")
print(pred_maintenance_df.shape)
pred_maintenance_df.head()

(124494, 12)


Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/2015,S1F01085,0,215630672,55,0,52,6,407438,0,0,7
1,1/1/2015,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/2015,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/2015,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/2015,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [227]:
# Move column "user_id" to the front

pred_maintenance_df = pred_maintenance_df[[pred_maintenance_df.columns[1]] + [pred_maintenance_df.columns[0]] + list(pred_maintenance_df.columns[2:])]
pred_maintenance_df.head()

Unnamed: 0,device,date,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,S1F01085,1/1/2015,0,215630672,55,0,52,6,407438,0,0,7
1,S1F0166B,1/1/2015,0,61370680,0,3,0,6,403174,0,0,0
2,S1F01E6Y,1/1/2015,0,173295968,0,0,0,12,237394,0,0,0
3,S1F01JE0,1/1/2015,0,79694024,0,0,0,6,410186,0,0,0
4,S1F01R2B,1/1/2015,0,135970480,0,0,0,15,313173,0,0,3


In [228]:

# Sort events by user_id
pred_maintenance_df.sort_values(by="device")

# Create new user_ids ranging from 0 to len (df["user_id"])
unique_users = {old_id: new_id for new_id, old_id in enumerate(pred_maintenance_df["device"].unique(), start=1)}

# Replace user_id with new unique IDs
pred_maintenance_df["device"] = pred_maintenance_df["device"].map(unique_users)-1
pred_maintenance_df.rename(columns={'device': 'sequence_id'}, inplace=True)
pred_maintenance_df['sequence_id'] = pred_maintenance_df['sequence_id'] + num_subjects_eneryield_ecom_mimic+1
# pred_maintenance_df.to_csv("/home/filip-marcus/ESGPT_test/data/eneryield_ecom_pred/pretrain/predicitve_maintenance_events_df.csv")
pred_maintenance_df.head()


Unnamed: 0,sequence_id,date,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,408768,1/1/2015,0,215630672,55,0,52,6,407438,0,0,7
1,408769,1/1/2015,0,61370680,0,3,0,6,403174,0,0,0
2,408770,1/1/2015,0,173295968,0,0,0,12,237394,0,0,0
3,408771,1/1/2015,0,79694024,0,0,0,6,410186,0,0,0
4,408772,1/1/2015,0,135970480,0,0,0,15,313173,0,0,3


In [229]:
pred_maintenance_df.rename(columns={'failure': 'event_label'}, inplace=True)

In [230]:
dfs = {event: pred_maintenance_df[pred_maintenance_df['event_label'] == event] for event in pred_maintenance_df['event_label'].unique()}
dfs[1].head()

Unnamed: 0,sequence_id,date,event_label,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
4885,409003,1/5/2015,1,48467332,64776,0,841,8,39267,56,56,1
6879,408831,1/7/2015,1,184069720,528,0,4,9,387871,32,32,3
8823,409416,1/9/2015,1,136429411,64784,0,406,30,224801,8,8,0
11957,409592,1/13/2015,1,188251248,2040,0,0,6,39345,32,32,1
12668,409583,1/14/2015,1,220461296,0,0,0,14,325125,0,0,0


In [231]:

save_dir = '/home/filip-marcus/ESGPT_test/data/eneryield_ecom_mimic_pred_ecom_cosm/pretrain'
for event, df_event in dfs.items():
    df_event.to_csv(f"{save_dir}/df_pred_{str(event)}.csv",index=False)

In [232]:
num_subjects_eneryield_ecom_mimic_pred = num_subjects_eneryield_ecom_mimic + pred_maintenance_df['sequence_id'].nunique()

# Subjects eneryield ecom mimic pred

In [131]:
subject = pd.Series(range(num_subjects_eneryield_ecom_mimic_pred))
subjects = pd.DataFrame()
subjects['sequence_id'] = subject + 1
subjects['dummy_static'] = 1
subjects.to_csv("/home/filip-marcus/ESGPT_test/data/eneryield_ecom_mimic_pred/pretrain/subjects.csv")
subjects.tail()

Unnamed: 0,sequence_id,dummy_static
409931,409932,1
409932,409933,1
409933,409934,1
409934,409935,1
409935,409936,1


# Ecom Cosmetic

In [233]:
raw_data_dir_ecom_cosmetics = '/home/filip-marcus/data/kaggle_ecommerce_cosmetics'
ecom_cosm_dec_19_df = pd.read_csv(raw_data_dir_ecom_cosmetics+"/2019-Dec.csv")
ecom_cosm_nov_19_df = pd.read_csv(raw_data_dir_ecom_cosmetics+"/2019-Nov.csv")
ecom_cosm_oct_19_df = pd.read_csv(raw_data_dir_ecom_cosmetics+"/2019-Oct.csv")

ecom_cosm_feb_20_df = pd.read_csv(raw_data_dir_ecom_cosmetics+"/2020-Feb.csv")
# Removing rows that has price of other datatype than float
ecom_cosm_feb_20_df['price_numeric'] = pd.to_numeric(ecom_cosm_feb_20_df['price'], errors='coerce')
ecom_cosm_feb_20_df = ecom_cosm_feb_20_df[~ecom_cosm_feb_20_df['price_numeric'].isnull()]
ecom_cosm_feb_20_df.drop(['price_numeric'],inplace=True, axis=1)

ecom_cosm_Jan_20_df = pd.read_csv(raw_data_dir_ecom_cosmetics+"/2020-Jan.csv")

  ecom_cosm_feb_20_df = pd.read_csv(raw_data_dir_ecom_cosmetics+"/2020-Feb.csv")


In [234]:
# Append the above dataframe into one
ecom_cosm_df = pd.concat([ecom_cosm_dec_19_df,ecom_cosm_oct_19_df,ecom_cosm_nov_19_df,ecom_cosm_feb_20_df,ecom_cosm_Jan_20_df], axis=0,ignore_index=True)

# Drop columns "user_sesion"
ecom_cosm_df.drop("user_session",inplace=True,axis=1)

# Move column "user_id" to the front
ecom_cosm_df = ecom_cosm_df[[ecom_cosm_df.columns[-1]] + list(ecom_cosm_df.columns[:-1])]

# Sort events by user_id
ecom_cosm_df.sort_values(by="user_id")

# Create new user_ids ranging from 0 to len (df["user_id"])
unique_users = {old_id: new_id for new_id, old_id in enumerate(ecom_cosm_df["user_id"].unique(), start=1)}

# Replace user_id with new unique IDs
ecom_cosm_df["user_id"] = ecom_cosm_df["user_id"].map(unique_users)

# Drop categorical features
ecom_cosm_df.drop(columns=['product_id','category_id','brand','category_code'],inplace=True)

# Rename column "event_type" to "event_label"
ecom_cosm_df.rename(columns={'event_type':'event_label'},inplace=True)

# Remove "UTC" from event_time
ecom_cosm_df['event_time'] = pd.to_datetime(ecom_cosm_df['event_time'].str.replace(" UTC", "", regex=False))

# Rename "user_id" to "sequence_id"
ecom_cosm_df.rename(columns={'user_id':'sequence_id'},inplace=True)

ecom_cosm_df.head()

Unnamed: 0,sequence_id,event_time,event_label,price
0,1,2019-12-01 00:00:00,remove_from_cart,6.27
1,2,2019-12-01 00:00:00,view,29.05
2,3,2019-12-01 00:00:02,cart,1.19
3,4,2019-12-01 00:00:05,view,0.79
4,5,2019-12-01 00:00:07,view,5.56


In [235]:
dfs_cosm = {event: ecom_cosm_df[ecom_cosm_df['event_label'] == event] for event in ecom_cosm_df['event_label'].unique()}
dfs_cosm['remove_from_cart'].head()
save_dir = '/home/filip-marcus/ESGPT_test/data/eneryield_ecom_mimic_pred_ecom_cosm/pretrain'
for event, df_event in dfs_cosm.items():
    df_event.to_csv(f"{save_dir}/df_ecom_cosm_{str(event)}.csv",index=False)

In [236]:
num_subjects_eneryield_ecom_mimic_pred_ecom_cosm = num_subjects_eneryield_ecom_mimic_pred + ecom_cosm_df['sequence_id'].nunique()
print(num_subjects_eneryield_ecom_mimic_pred_ecom_cosm)

2049294


In [237]:
subject = pd.Series(range(num_subjects_eneryield_ecom_mimic_pred_ecom_cosm))
subjects = pd.DataFrame()
subjects['sequence_id'] = subject
subjects['dummy_static'] = 1
subjects.to_csv(save_dir+'/subjects.csv',index=False)
print(subjects.shape)
subjects.tail()

(2049294, 2)


Unnamed: 0,sequence_id,dummy_static
2049289,2049289,1
2049290,2049290,1
2049291,2049291,1
2049292,2049292,1
2049293,2049293,1
