THE DATA PROCESSING, CURRENTLY SET TO PROCESS TO FILES - ONE TRAINING FILE FOR THE AUTOENCODER MODEL AND ONE TRADE FILE WHERE OUTLIERS CAN BE TESTED

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def process_data(data, date_cols, output_path, train_feature_names=None):
    """
    Preprocesses the input data and writes the result to a CSV file.

    Parameters:
    data (DataFrame): Input data to be preprocessed.
    date_cols (list): List of date columns to be transformed.
    output_path (str): Path for the processed data CSV file.
    train_feature_names (list): Feature names from the training data.

    Returns:
    DataFrame: The preprocessed data.
    """

    # Handling DateTime Variables
    for col in date_cols:
        data[col] = pd.to_datetime(data[col], errors='coerce')  # Convert to datetime
        data[f'{col}Year'] = data[col].dt.year  # Extract year
        data[f'{col}Month'] = data[col].dt.month  # Extract month
        data[f'{col}Day'] = data[col].dt.day  # Extract day
        data = data.drop(columns=[col])  # Drop the original date column

    # Automatically identify numerical and categorical variables
    numerical_vars = [col for col in data.select_dtypes(include=[np.number]).columns.tolist() 
                      if not (col.endswith('Year') or col.endswith('Month') or col.endswith('Day'))]
    categorical_vars = data.select_dtypes(include=[object]).columns.tolist()

    # Print information for verification
    print(f"Numerical Columns: {numerical_vars}")
    print(f"Categorical Columns: {categorical_vars}")

    # Create transformers
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Initialize ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_vars),
            ('cat', categorical_transformer, categorical_vars)
        ]
    )

    # Fit and transform the data
    data_preprocessed = preprocessor.fit_transform(data)

    # Get feature names after one-hot encoding
    onehot_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_vars)

    # Combine the feature names
    feature_names = numerical_vars + onehot_columns.tolist()

    # If the function was provided training feature names, add any missing columns with zeros
    if train_feature_names is not None:
        missing_columns = set(train_feature_names) - set(feature_names)
        for missing_col in missing_columns:
            data_preprocessed = np.column_stack((data_preprocessed, np.zeros(data_preprocessed.shape[0])))
        feature_names = train_feature_names

    # Print and verify the lengths
    print(f"Data Shape after Preprocessing: {data_preprocessed.shape}")
    print(f"Number of Feature Names: {len(feature_names)}")

    data_preprocessed_dense = data_preprocessed.toarray() if hasattr(data_preprocessed, 'toarray') else data_preprocessed
    data_preprocessed_df = pd.DataFrame(data_preprocessed_dense, columns=feature_names)

    # Handle columns with all missing values
    for col in data_preprocessed_df.columns:
        if data_preprocessed_df[col].isnull().all():
            data_preprocessed_df[col].fillna(0, inplace=True)  # fill NaN with 0 for these particular columns

    # Save preprocessed data to CSV
    data_preprocessed_df.to_csv(output_path, index=False)
    return data_preprocessed_df

# SET THIS TO THE FOLDER WHERE THE FILES ARE LOCATED
your_path = r'C:\Users\gusta\Documents\KTH\TriOptima\trioptima'

# For training data
file_name1 = '\CLEAN'
file_path1 = your_path + file_name1 + '.csv'
date_cols = ['effectiveDate', 'eventDateTime', 'executionDateTime', 'expirationDate', 'sDRreceiptTimestamp']
data1 = pd.read_csv(file_path1)
output_path1 = your_path + file_name1 + '_Processed.csv'
processed_data_clean = process_data(data1, date_cols, output_path1)

# For trade data
file_name2 = '\TRADES'
file_path2 = your_path + file_name2 + '.csv'
data2 = pd.read_csv(file_path2)
output_path2 = your_path + file_name2 + '_Processed.csv'
processed_data_trade = process_data(data2, date_cols, output_path2, processed_data_clean.columns.tolist())

Numerical Columns: ['leg1NotionalAmount', 'leg2NotionalAmount', 'leg2ResetFrequencyMultiplier', 'leg1FixedRate', 'leg1FixedRatePaymentFrequencyMultiplier', 'leg2FloatingRatePaymentFrequencyMultiplier', 'leg2UnderlierTenorMultiplier']
Categorical Columns: ['action', 'assetClass', 'blockTradeIndicator', 'cleared', 'deliveryType', 'disseminationTimestamp', 'event', 'instrumentType', 'leg1FixedRateDayCount', 'leg1FloatingRateDayCount', 'leg1NotionalCurrency', 'leg1NotionalScheduleType', 'leg1SettlementCurrency', 'leg1UnderlyingAssetOrContractType', 'leg2FloatingRateDayCount', 'leg2NotionalCurrency', 'leg2NotionalScheduleType', 'leg2SettlementCurrency', 'nonStandardTermIndicator', 'platformID', 'primeBrokerageTransactionIndicator', 'productName', 'leg1FixedRatePaymentFrequencyPeriod', 'leg2FloatingRatePaymentFrequencyPeriod', 'leg2UnderlierCurrency', 'leg2UnderlierID', 'leg2UnderlierIDSource', 'leg2UnderlierTenorPeriod', 'leg2ResetFrequencyPeriod']
Data Shape after Preprocessing: (5804, 504

 'primeBrokerageTransactionIndicator']. At least one non-missing value is needed for imputation with strategy='median'.


NEW ATTEMPT

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from joblib import dump, load

def fit_transform_data(data, date_cols, output_path, preprocessor_path):
    """
    Fits transformers, preprocesses the input data, and writes the result to a CSV file.

    Parameters:
    data (DataFrame): Input data to be preprocessed.
    date_cols (list): List of date columns to be transformed.
    output_path (str): Path for the processed data CSV file.
    preprocessor_path (str): Path to save the fitted preprocessor.
    """

    # Handling DateTime Variables
    for col in date_cols:
        data[col] = pd.to_datetime(data[col], errors='coerce')
        data[f'{col}Year'] = data[col].dt.year
        data[f'{col}Month'] = data[col].dt.month
        data[f'{col}Day'] = data[col].dt.day
        data = data.drop(columns=[col])

    # Identify numerical and categorical variables
    numerical_vars = [col for col in data.select_dtypes(include=[np.number]).columns.tolist() if not col.endswith(('Year', 'Month', 'Day'))]
    categorical_vars = data.select_dtypes(include=[object]).columns.tolist()

    # Create transformers
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Initialize and fit ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_vars),
            ('cat', categorical_transformer, categorical_vars)
        ]
    )

    data_preprocessed = preprocessor.fit_transform(data)
    dump(preprocessor, preprocessor_path)

    # Get feature names after one-hot encoding
    onehot_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_vars)
    feature_names = numerical_vars + onehot_columns.tolist()

    data_preprocessed_dense = data_preprocessed.toarray() if hasattr(data_preprocessed, 'toarray') else data_preprocessed
    data_preprocessed_df = pd.DataFrame(data_preprocessed_dense, columns=feature_names)

    data_preprocessed_df.to_csv(output_path, index=False)
    return data_preprocessed_df

def transform_data(data, date_cols, output_path, preprocessor_path):
    """
    Uses a fitted preprocessor to transform the input data and writes the result to a CSV file.
    """
    for col in date_cols:
        data[col] = pd.to_datetime(data[col], errors='coerce')
        data[f'{col}Year'] = data[col].dt.year
        data[f'{col}Month'] = data[col].dt.month
        data[f'{col}Day'] = data[col].dt.day
        data = data.drop(columns=[col])

    preprocessor = load(preprocessor_path)
    data_preprocessed = preprocessor.transform(data)

    onehot_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(data.select_dtypes(include=[object]).columns.tolist())
    feature_names = data.select_dtypes(include=[np.number]).columns.tolist() + onehot_columns.tolist()

    data_preprocessed_dense = data_preprocessed.toarray() if hasattr(data_preprocessed, 'toarray') else data_preprocessed
    data_preprocessed_df = pd.DataFrame(data_preprocessed_dense, columns=feature_names)

    data_preprocessed_df.to_csv(output_path, index=False)
    return data_preprocessed_df

# Example Usage
your_path = r'C:\Users\gusta\Documents\KTH\TriOptima\trioptima'

# For training data
file_name1 = '\CLEAN'
file_path1 = your_path + file_name1 + '.csv'
date_cols = ['effectiveDate', 'eventDateTime', 'executionDateTime', 'expirationDate', 'sDRreceiptTimestamp']
data1 = pd.read_csv(file_path1)
output_path1 = your_path + file_name1 + '_Processed.csv'
preprocessor_path = your_path + '/fitted_preprocessor.joblib'

processed_data_clean = fit_transform_data(data1, date_cols, output_path1, preprocessor_path)

# For trade data
file_name2 = '\TRADES'
file_path2 = your_path + file_name2 + '.csv'
data2 = pd.read_csv(file_path2)
output_path2 = your_path + file_name2 + '_Processed.csv'

processed_data_trade = transform_data(data2, date_cols, output_path2, preprocessor_path)


ValueError: input_features should have length equal to number of features (29), got 25

In [7]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from joblib import dump, load

def process_data(data, date_cols, output_path, preprocessor_path=None):
    """
    ... (the rest of your documentation) ...
    """
    # Handling DateTime Variables
    for col in date_cols:
        data[col] = pd.to_datetime(data[col], errors='coerce')
        data[f'{col}Year'] = data[col].dt.year
        data[f'{col}Month'] = data[col].dt.month
        data[f'{col}Day'] = data[col].dt.day
        data = data.drop(columns=[col])

    # Automatically identify numerical and categorical variables
    numerical_vars = [col for col in data.select_dtypes(include=[np.number]).columns.tolist() 
                      if not (col.endswith('Year') or col.endswith('Month') or col.endswith('Day'))]
    categorical_vars = data.select_dtypes(include=[object]).columns.tolist()

    # Print information for verification
    print(f"Numerical Columns: {numerical_vars}")
    print(f"Categorical Columns: {categorical_vars}")

    # Create transformers
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Initialize ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_vars),
            ('cat', categorical_transformer, categorical_vars)
        ]
    )

    # Fit and transform the data
    if preprocessor_path is not None and os.path.exists(preprocessor_path):
        preprocessor = load(preprocessor_path)
        
        # Ensure the data has the same columns as the data the preprocessor was trained on
        expected_numeric_columns = set(preprocessor.named_transformers_['num'].named_steps['scaler'].mean_.index)
        expected_categorical_columns = set(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_vars))
        expected_columns = expected_numeric_columns.union(expected_categorical_columns)
        
        input_numeric_columns = set(data.select_dtypes(include=[np.number]).columns.tolist())
        input_categorical_columns = set(data.select_dtypes(include=[object]).columns.tolist())
        input_columns = input_numeric_columns.union(input_categorical_columns)

        missing_columns = expected_columns - input_columns

        if missing_columns:
            print(f"Warning: Missing columns detected:")
            print(f"Columns expected but not found in input data: {missing_columns}")
            for col in missing_columns:
                if col in expected_numeric_columns:
                    data[col] = 0  # or use np.nan or another default value
                else:
                    data[col] = 'missing'  # or another default value
        
        # Ensure columns are in the expected order
        data = data[expected_columns]
        data_preprocessed = preprocessor.transform(data)

    else:
        data_preprocessed = preprocessor.fit_transform(data)
        if preprocessor_path is not None:
            dump(preprocessor, preprocessor_path)

    onehot_columns = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_vars)
    feature_names = numerical_vars + onehot_columns.tolist()
    print(f"Data Shape after Preprocessing: {data_preprocessed.shape}")
    print(f"Number of Feature Names: {len(feature_names)}")
    data_preprocessed_dense = data_preprocessed.toarray() if hasattr(data_preprocessed, 'toarray') else data_preprocessed
    data_preprocessed_df = pd.DataFrame(data_preprocessed_dense, columns=feature_names)
    data_preprocessed_df.to_csv(output_path, index=False)
    return data_preprocessed_df

# Replace your_path with your actual path
your_path = r'C:\Users\gusta\Documents\KTH\TriOptima\trioptima'

# For training data
file_name1 = '\CLEAN'
file_path1 = your_path + file_name1 + '.csv'
date_cols = ['effectiveDate', 'eventDateTime', 'executionDateTime', 'expirationDate', 'sDRreceiptTimestamp']
data1 = pd.read_csv(file_path1)
output_path1 = your_path + file_name1 + '_Processed.csv'
preprocessor_path = your_path + '\preprocessor.joblib'
processed_data_clean = process_data(data1, date_cols, output_path1, preprocessor_path)

# For trade data
file_name2 = '\TRADES'
file_path2 = your_path + file_name2 + '.csv'
data2 = pd.read_csv(file_path2)
output_path2 = your_path + file_name2 + '_Processed.csv'
processed_data_trade = process_data(data2, date_cols, output_path2, preprocessor_path)


Numerical Columns: ['leg1NotionalAmount', 'leg2NotionalAmount', 'leg2ResetFrequencyMultiplier', 'leg1FixedRate', 'leg1FixedRatePaymentFrequencyMultiplier', 'leg2FloatingRatePaymentFrequencyMultiplier', 'leg2UnderlierTenorMultiplier']
Categorical Columns: ['action', 'assetClass', 'blockTradeIndicator', 'cleared', 'deliveryType', 'disseminationTimestamp', 'event', 'instrumentType', 'leg1FixedRateDayCount', 'leg1FloatingRateDayCount', 'leg1NotionalCurrency', 'leg1NotionalScheduleType', 'leg1SettlementCurrency', 'leg1UnderlyingAssetOrContractType', 'leg2FloatingRateDayCount', 'leg2NotionalCurrency', 'leg2NotionalScheduleType', 'leg2SettlementCurrency', 'nonStandardTermIndicator', 'platformID', 'primeBrokerageTransactionIndicator', 'productName', 'leg1FixedRatePaymentFrequencyPeriod', 'leg2FloatingRatePaymentFrequencyPeriod', 'leg2UnderlierCurrency', 'leg2UnderlierID', 'leg2UnderlierIDSource', 'leg2UnderlierTenorPeriod', 'leg2ResetFrequencyPeriod']


AttributeError: 'numpy.ndarray' object has no attribute 'index'