PART 1 - PROCESS CLEAN DATA (TRAINING) AND SAVE THE SCALER AS robust_scaler.pkl

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import joblib
import os

# Project root directory (must end with '/'). Other roots that have been used here:
#   r'C:/Users/gusta/Documents/KTH/TriOptima/trioptima/trioptima/'
#   '/Users/elliotlindestam/Documents/Skola/Indek icloud/trioptima/'
your_path = '/Users/elliotlindestam/Documents/Skola/Indek icloud/trioptima/'
folder_path = your_path + '6.Active Data/Train Model Data/'

# os.listdir() returns entries in arbitrary order, so sort them to make the
# choice of "first file" reproducible. '.DS_Store' is skipped (MAC issue).
files = sorted(f for f in os.listdir(folder_path) if f != '.DS_Store')
if not files:
    raise FileNotFoundError("No training data file found in " + folder_path)

# Strip the extension whatever its length, instead of assuming 4 trailing characters.
training_data_name = os.path.splitext(files[0])[0]

# Load the data
data = pd.read_csv(your_path + "3.Cash_Risk/" + training_data_name + '_Cash_Risk.csv')

# Step 1: Handle Missing Data (every missing value becomes 0)
data_filled = data.fillna(0)

# Step 2: Extract Information from DateTime Columns
# Alternative column set: ['effectiveDate', 'expirationDate', 'eventDateTime']
datetime_columns = ['effectiveDate', 'executionDateTime', 'expirationDate']

def extract_date_features(df, column):
    """Replace a date/time column with numeric calendar features.

    The column is parsed with ``pd.to_datetime(..., errors='coerce')``, so values
    that cannot be parsed become NaT. Seven columns named ``<column>_year``,
    ``_month``, ``_day``, ``_hour``, ``_minute``, ``_second`` and ``_weekday``
    are appended, and the original column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str
        Name of the column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``column`` and with the derived feature columns.
    """
    # Parse into a local Series so the caller's frame is left untouched.
    parsed = pd.to_datetime(df[column], errors='coerce')
    features = {
        column + '_year': parsed.dt.year,
        column + '_month': parsed.dt.month,
        column + '_day': parsed.dt.day,
        column + '_hour': parsed.dt.hour,
        column + '_minute': parsed.dt.minute,
        column + '_second': parsed.dt.second,
        column + '_weekday': parsed.dt.weekday,
    }
    # Drop the original datetime column, then append the features in the order above.
    return df.drop(column, axis=1).assign(**features)

# Snapshot the dtypes of the raw training file; Part 2 passes this mapping to match_columns().
original_dtypes = data.dtypes.to_dict()

# Swap every date/time column for its derived calendar features.
for date_col in datetime_columns:
    data_filled = extract_date_features(data_filled, date_col)

# Multiplying the frame by 1 converts boolean values to 1/0 ahead of one-hot encoding.
data_filled = data_filled * 1

def compare_item(row, leg1, leg2):
    """Return 1 when ``row[leg1]`` equals ``row[leg2]``, otherwise 0."""
    if row[leg1] == row[leg2]:
        return 1
    return 0

# Interaction indicators: 1 where both legs carry the same currency / notional amount, else 0.
# The element-wise comparison yields the same 1/0 values as applying compare_item() row by row,
# without a Python-level loop over rows. int64 is requested explicitly because only
# float64/int64 columns are selected for scaling further down.
cur = (data_filled['leg1NotionalCurrency'] == data_filled['leg2NotionalCurrency']).astype('int64')
nom = (data_filled['leg1NotionalAmount'] == data_filled['leg2NotionalAmount']).astype('int64')

# Step 3: Encode Categorical Variables
categorical_columns = data_filled.select_dtypes(include=['object', 'bool']).columns.tolist()
categorical_columns = [col for col in categorical_columns if col not in datetime_columns]
data_encoded = pd.get_dummies(data_filled, columns=categorical_columns, drop_first=True)

# Step 3.5: Identify and Drop Single-Value Columns
single_value_columns = data_encoded.columns[data_encoded.nunique() == 1].tolist()
print("Columns with a single unique value: ", single_value_columns)
data_encoded = data_encoded.drop(columns=single_value_columns)

# The interaction indicators are (re)assigned after the drop, so they are always present
# in the output, even when they are constant.
data_encoded['CurrencyIV'] = cur
data_encoded['NominalIV'] = nom

print(data_encoded.columns)

# Define numerical_columns once, after all column dropping and adding has occurred.
# NOTE(review): only float64/int64 columns are selected; numeric columns of any other
# dtype are left unscaled - confirm this is intended.
numerical_columns = data_encoded.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Check for NaN or infinite values in numerical columns before scaling
numeric_view = data_encoded[numerical_columns]
nan_inf_columns = numeric_view.columns[numeric_view.isna().any() | np.isinf(numeric_view).any()].tolist()
print("Columns with NaN or infinite values before scaling: ", nan_inf_columns)

# Step 4: Scale/Normalize Data
scaler = RobustScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

# Save the scaler for future use. Part 2 loads it from your_path + '0.Code/robust_scaler.pkl',
# so write it there explicitly instead of relative to the current working directory.
scaler_dir = your_path + '0.Code/'
os.makedirs(scaler_dir, exist_ok=True)
scaler_filename = scaler_dir + 'robust_scaler.pkl'
joblib.dump(scaler, scaler_filename)

# Preprocessed data is now stored in `data_encoded`; save it to a CSV file as well.
data_encoded.to_csv(your_path + '4.Scaled/' + training_data_name + '_Scaled.csv', index=False)

# Column layout of the training features; Part 2 aligns the test data to it.
train_encoded_columns = data_encoded.columns


  data = pd.read_csv(your_path + "3.Cash_Risk/" + training_data_name+'_Cash_Risk.csv')


Columns with a single unique value:  ['customBasketIndicator', 'packageIndicator', 'postPricedSwapIndicator', 'CurrencyIV', 'NominalIV', 'effectiveDate_hour', 'effectiveDate_minute', 'effectiveDate_second', 'executionDateTime_year', 'executionDateTime_month', 'executionDateTime_day', 'executionDateTime_hour', 'executionDateTime_minute', 'executionDateTime_second', 'executionDateTime_weekday', 'expirationDate_hour', 'expirationDate_minute', 'expirationDate_second']
Index(['leg1NotionalAmount', 'leg2NotionalAmount', 'leg1FixedRate',
       'leg1FixedRatePaymentFrequencyMultiplier',
       'leg2UnderlierTenorMultiplier', 'leg1NotionalAmountUSD', 'MtM_leg1',
       'MtM_leg2', 'total_delta', 'effectiveDate_year', 'effectiveDate_month',
       'effectiveDate_day', 'effectiveDate_weekday', 'expirationDate_year',
       'expirationDate_month', 'expirationDate_day', 'expirationDate_weekday',
       'cleared_Y', 'event_Allocation', 'event_Amendment',
       'event_BLENDING_REMNANT', 'event_Clea

PART 2 - PROCESS THE TEST DATA USING THE SCALER FITTED ON THE TRAINING DATA

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import joblib
import os

folder_path = your_path + '6.Active Data/Test Data/'

# os.listdir() returns entries in arbitrary order, so sort them to make the
# choice of "first file" reproducible. '.DS_Store' is skipped (MAC issue).
files = sorted(f for f in os.listdir(folder_path) if f != '.DS_Store')
if not files:
    raise FileNotFoundError("No test data file found in " + folder_path)

# Strip the extension whatever its length, instead of assuming 4 trailing characters.
test_data_name = os.path.splitext(files[0])[0]

# Load the data
t_data = pd.read_csv(your_path + '3.Cash_Risk/' + test_data_name + '_Cash_Risk.csv')

# Step 1: Handle Missing Data (every missing value becomes 0)
t_data_filled = t_data.fillna(0)

# Step 2: Extract Information from DateTime Columns
# NOTE(review): Part 1 also expands 'executionDateTime'; it is not listed here, so in this
# cell that column is not parsed as a date - confirm the difference is intended.
t_datetime_columns = ['effectiveDate', 'expirationDate']

def extract_date_features(df, column):
    """Expand one date/time column into numeric calendar feature columns.

    ``df[column]`` is parsed with ``pd.to_datetime(..., errors='coerce')``
    (unparseable values become NaT). The result has the original column removed
    and ``<column>_year``, ``_month``, ``_day``, ``_hour``, ``_minute``,
    ``_second`` and ``_weekday`` appended, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; left unmodified.
    column : str
        Name of the date/time column to expand.

    Returns
    -------
    pandas.DataFrame
        New frame with the derived features in place of ``column``.
    """
    # Work on a parsed copy of the column so the caller's frame is not mutated.
    timestamps = pd.to_datetime(df[column], errors='coerce')
    parts = ('year', 'month', 'day', 'hour', 'minute', 'second', 'weekday')
    derived = {column + '_' + part: getattr(timestamps.dt, part) for part in parts}
    # Drop the original datetime column and append the derived ones.
    return df.drop(column, axis=1).assign(**derived)

# Swap each listed date column for its derived calendar features.
for date_col in t_datetime_columns:
    t_data_filled = extract_date_features(t_data_filled, date_col)

# Multiplying the frame by 1 converts boolean values to 1/0 ahead of one-hot encoding.
t_data_filled = t_data_filled * 1

def compare_item(row, leg1, leg2):
    """Compare two entries of ``row``: 1 if they are equal, 0 if they differ."""
    first, second = row[leg1], row[leg2]
    return 1 if first == second else 0

# Interaction indicators: 1 where both legs carry the same currency / notional amount, else 0.
# The element-wise comparison yields the same 1/0 values as applying compare_item() row by row.
cur = (t_data_filled['leg1NotionalCurrency'] == t_data_filled['leg2NotionalCurrency']).astype('int64')
nom = (t_data_filled['leg1NotionalAmount'] == t_data_filled['leg2NotionalAmount']).astype('int64')

# Step 3: Encode Categorical Variables
t_categorical_columns = t_data_filled.select_dtypes(include=['object', 'bool']).columns.tolist()
t_categorical_columns = [col for col in t_categorical_columns if col not in t_datetime_columns]
# Keep every dummy level (drop_first=False). With drop_first=True the level that gets dropped
# depends on which categories occur in *this* file, which may differ from the level dropped
# during training. match_columns() below removes any column the training layout does not have,
# including the training baseline level, so the final encoding stays aligned with Part 1.
t_data_encoded = pd.get_dummies(t_data_filled, columns=t_categorical_columns, drop_first=False)

# Step 3.5: Identify Single-Value Columns (reported only)
t_single_value_columns = t_data_encoded.columns[t_data_encoded.nunique() == 1].tolist()
print("Columns with a single unique value: ", t_single_value_columns)
# These columns are deliberately NOT dropped here: a column that is constant in the test file
# but present in the training layout would be re-created by match_columns() filled with 0,
# discarding its real value. Columns absent from the training layout are removed by
# match_columns() anyway.

t_data_encoded['CurrencyIV'] = cur
t_data_encoded['NominalIV'] = nom

# Define numerical_columns here, after all column adding has occurred
t_numerical_columns = t_data_encoded.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Check for NaN or infinite values in numerical columns before scaling
t_numeric_view = t_data_encoded[t_numerical_columns]
t_nan_inf_columns = t_numeric_view.columns[t_numeric_view.isna().any() | np.isinf(t_numeric_view).any()].tolist()
print("Columns with NaN or infinite values before scaling: ", t_nan_inf_columns)

def match_columns(new_data, train_encoded_columns, original_dtypes):
    """Align ``new_data`` to the training feature layout.

    Columns listed in ``train_encoded_columns`` but absent from ``new_data`` are
    added and filled with 0, columns not listed are removed, and the result is
    ordered exactly like ``train_encoded_columns``. Afterwards every column that
    also appears in ``original_dtypes`` is cast to the dtype recorded there.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Encoded frame to align. It is not modified.
    train_encoded_columns : sequence of column labels
        Target column layout (for example a ``pandas.Index`` or a list).
    original_dtypes : dict
        Mapping of column name to dtype. Names that are not columns of the
        aligned frame are ignored.

    Returns
    -------
    pandas.DataFrame
        New frame whose columns equal ``train_encoded_columns``.
    """
    # reindex adds the missing columns (filled with 0), drops the extra ones and
    # fixes the column order in a single step, returning a new frame.
    aligned = new_data.reindex(columns=train_encoded_columns, fill_value=0)

    # Convert columns back to their original type
    for col, original_dtype in original_dtypes.items():
        if col not in aligned.columns:
            continue
        # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype().
        if isinstance(original_dtype, pd.CategoricalDtype):
            aligned[col] = pd.Categorical(aligned[col])
        else:
            aligned[col] = aligned[col].astype(original_dtype)

    return aligned


# Align the encoded test frame with the training feature layout.
t_data_matched = match_columns(t_data_encoded, train_encoded_columns, original_dtypes)

# Check if all columns in t_data_matched are equal to train_encoded_columns
if list(t_data_matched.columns) == list(train_encoded_columns):
    print("All columns in t_data_matched are equal to train_encoded_columns.")
else:
    print("Columns in t_data_matched and train_encoded_columns do not match.")

    # Check if the columns are the same but in a different order
    if set(t_data_matched.columns) == set(train_encoded_columns):
        print("The datasets have the same columns, but they are in a different order.")

        # Reorder the columns in t_data_matched to match train_encoded_columns
        t_data_matched = t_data_matched[train_encoded_columns]
        print("Columns in t_data_matched reordered to match train_encoded_columns.")
    else:
        # Identify and print the mismatched columns
        extra_cols = set(t_data_matched.columns) - set(train_encoded_columns)
        missing_cols = set(train_encoded_columns) - set(t_data_matched.columns)
        print("Extra columns in t_data_matched: ", extra_cols)
        print("Missing columns in t_data_matched: ", missing_cols)

# Load the saved scaler.
# NOTE: joblib.load unpickles the file; only load scaler files produced by this project.
scaler = joblib.load(your_path + '0.Code/robust_scaler.pkl')

# Scale exactly the columns the loaded scaler was fitted on. When the scaler records its
# feature names, use them so the column list always belongs to the file that was loaded;
# otherwise fall back to 'numerical_columns' from the training cell.
if hasattr(scaler, 'feature_names_in_'):
    scaled_columns = list(scaler.feature_names_in_)
else:
    scaled_columns = numerical_columns
t_data_matched[scaled_columns] = scaler.transform(t_data_matched[scaled_columns])

t_data_matched.to_csv(your_path + '4.Scaled/' + test_data_name + '_Scaled.csv', index=False)

Columns with a single unique value:  ['blockTradeIndicator', 'customBasketIndicator', 'packageIndicator', 'postPricedSwapIndicator', 'primeBrokerageTransactionIndicator', 'CurrencyIV', 'NominalIV', 'effectiveDate_hour', 'effectiveDate_minute', 'effectiveDate_second', 'expirationDate_hour', 'expirationDate_minute', 'expirationDate_second']
Columns with NaN or infinite values before scaling:  []
All columns in t_data_matched are equal to train_encoded_columns.


  if pd.api.types.is_categorical_dtype(original_dtype):
  if pd.api.types.is_categorical_dtype(original_dtype):
  if pd.api.types.is_categorical_dtype(original_dtype):
