The preprocessor converts the raw data into a format suitable for the autoencoder.

In [2]:
# PREPROCESSOR 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the raw trade data from CSV.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists. Consider a configurable, relative data directory.
file_path = '/Users/elliotlindestam/Documents/Skola/Indek icloud/trioptima/RT.IRS_CLEAN.csv'
data = pd.read_csv(file_path)

# Split 'effectiveDate' into numeric year / month / day components.
# errors='coerce' turns unparseable dates into NaT, so the derived components
# become NaN for those rows and are later filled by the median imputer.
data['effectiveDate'] = pd.to_datetime(data['effectiveDate'], errors='coerce')
data['effectiveYear'] = data['effectiveDate'].dt.year
data['effectiveMonth'] = data['effectiveDate'].dt.month
data['effectiveDay'] = data['effectiveDate'].dt.day

# The raw datetime column is no longer needed once it has been decomposed.
data = data.drop(columns=['effectiveDate'])

# Columns fed to the model, grouped by how they are preprocessed.
numerical_vars = ['leg1FixedRate', 'leg1FixedRatePaymentFrequencyMultiplier', 'effectiveYear', 'effectiveMonth', 'effectiveDay']
categorical_vars = ['action', 'assetClass', 'leg2UnderlierTenorPeriod', 'leg2ResetFrequencyPeriod']

# Numerical columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are encoded as all zeros).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its pipeline. Columns not listed above are
# dropped (ColumnTransformer's default remainder behaviour).
# sparse_threshold=0 forces a dense ndarray result: the one-hot encoder emits
# sparse output, and with the default threshold the combined result may be a
# SciPy sparse matrix, which pd.DataFrame() cannot turn into a numeric frame.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_vars),
        ('cat', categorical_transformer, categorical_vars)
    ],
    sparse_threshold=0
)

# Fit the preprocessing on the data and apply it; keep the original row index
# so preprocessed rows can be traced back to the source rows.
data_preprocessed = pd.DataFrame(preprocessor.fit_transform(data), index=data.index)


ModuleNotFoundError: No module named 'sklearn'

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Read data from CSV file
file_path = 'your_file_path.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Assume the target variable for anomaly detection is named 'is_anomaly'
# and all other columns are features (drop raises KeyError if it is absent).
features = data.drop(columns=['is_anomaly'])

# Split BEFORE scaling so the validation rows do not influence the scaler's
# mean/variance (fitting on the full dataset leaks validation statistics).
features_train, features_val = train_test_split(features, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)
X_val = scaler.transform(features_val)

# Full dataset scaled with the training statistics, kept for any later use.
scaled_features = scaler.transform(features)


In [2]:
# Take the input width from the training matrix instead of hard-coding it, so
# the network always matches the data. Requires X_train (built in the
# data-preparation cell) to exist before this cell is run.
input_dim = X_train.shape[1]

# Encoder: compress the input down to a 10-dimensional bottleneck.
encoder = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(20, activation='relu'),
    layers.Dense(10, activation='relu')
])

# Decoder: expand the bottleneck back to the input width.
# The output layer is linear because the targets are StandardScaler output
# (unbounded, including negative values); a sigmoid output is confined to
# (0, 1) and could never reconstruct them.
decoder = models.Sequential([
    layers.Input(shape=(10,)),
    layers.Dense(20, activation='relu'),
    layers.Dense(input_dim, activation='linear')
])

# Chain encoder and decoder into the full autoencoder.
autoencoder = models.Sequential([encoder, decoder])

# Train to minimize the mean squared reconstruction error.
autoencoder.compile(optimizer='adam', loss='mean_squared_error')


Hello


In [3]:
# Train the autoencoder to reproduce its own input: the training matrix is
# both the input and the target, and the validation split is scored the same way.
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    validation_data=(X_val, X_val),
    batch_size=256,
    epochs=50,
)


Hello, I am alive


In [None]:
# Run the validation samples through the autoencoder.
reconstructions = autoencoder.predict(X_val)

# Per-sample reconstruction error: squared residuals averaged over the features.
residuals = X_val - reconstructions
mse_per_instance = np.square(residuals).mean(axis=1)

# Cut-off at the 95th percentile of the validation errors (example choice).
threshold = np.percentile(mse_per_instance, 95)

# Boolean mask: True where a sample's error exceeds the cut-off.
anomalies = np.greater(mse_per_instance, threshold)
