Defining the Isolation Forest module

In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

# Constants and Configurations
# Root of the project data tree; every other location is resolved beneath it.
your_path = r'/Users/elliotlindestam/Documents/Skola/Indek icloud/trioptima/'

# Input folders for the training and the test data, both located under the root.
train_folder_path, test_folder_path = (
    os.path.join(your_path, sub_dir)
    for sub_dir in (
        '6.Active Data/Train Model Data/',
        '6.Active Data/Test Data/',
    )
)


def load_preprocessed_data(folder_path):
    """Load the scaled CSV that corresponds to the data file in *folder_path*.

    The folder is only used to discover a file name: the alphabetically first
    entry (ignoring macOS ``.DS_Store``) is taken, its extension is dropped,
    and ``<your_path>/4.Scaled/<stem>_Scaled.csv`` is read.

    Args:
        folder_path: Directory containing the data file whose name selects
            the scaled CSV to load.

    Returns:
        A pandas DataFrame with every column cast to ``float32``.

    Raises:
        FileNotFoundError: If *folder_path* contains no usable file, or the
            matching scaled CSV does not exist.
    """
    # Sort so the selected file does not depend on the OS listing order.
    candidates = sorted(f for f in os.listdir(folder_path) if f != '.DS_Store')
    if not candidates:
        raise FileNotFoundError(f"No data file found in {folder_path!r}")

    # splitext copes with extensions of any length, unlike slicing off the
    # last four characters (which breaks on e.g. ".xlsx" or ".parquet").
    file_name = os.path.splitext(candidates[0])[0]

    # Load preprocessed data
    file_path = os.path.join(your_path, '4.Scaled', file_name + '_Scaled.csv')
    data = pd.read_csv(file_path, header=0)

    # Convert data to float32
    data = data.astype(np.float32)

    # Report inf / NaN counts so data problems are visible before modelling.
    print("Inf values in data: ", np.isinf(data).values.sum())
    print("NaN values in data: ", np.isnan(data).values.sum())

    return data

def _validate_frame(frame, label):
    """Raise ValueError if *frame* holds non-numeric or NaN entries."""
    if not frame.apply(np.isreal).all().all():
        raise ValueError(f"Non-numeric data found in {label} data")
    if frame.isnull().any().any():
        raise ValueError(f"NaN values found in {label} data")


def main(random_state=None):
    """Fit an Isolation Forest on the training data and score the test data.

    Args:
        random_state: Optional seed forwarded to ``IsolationForest`` so a run
            can be reproduced. The default ``None`` leaves the forest
            unseeded, as before.

    Returns:
        A tuple ``(clf, predictions, anomaly_scores)``: the fitted model, the
        per-row labels from ``clf.predict`` on the test data (-1 for outliers,
        1 for inliers), and the values of ``clf.decision_function`` on the
        same data.

    Raises:
        ValueError: If either dataset contains non-numeric or NaN values.
    """
    train_data_preprocessed = load_preprocessed_data(train_folder_path)
    test_data_preprocessed = load_preprocessed_data(test_folder_path)

    # Explicit raises instead of `assert`: assertions are removed under
    # `python -O`, which would silently disable these checks.
    _validate_frame(train_data_preprocessed, "train")
    _validate_frame(test_data_preprocessed, "test")

    # Initialize Isolation Forest
    clf = IsolationForest(
        n_estimators=100,
        contamination='auto',
        random_state=random_state,
    )

    # Fit the Isolation Forest model
    clf.fit(train_data_preprocessed)

    # The model returns -1 for outliers and 1 for inliers
    predictions = clf.predict(test_data_preprocessed)

    # Count the number of -1 values in the predictions array
    number_of_outliers = np.sum(predictions == -1)
    print("Number of outliers detected:", number_of_outliers)

    # Get anomaly scores
    anomaly_scores = clf.decision_function(test_data_preprocessed)

    # Return the Isolation Forest model, predictions, and anomaly scores
    return clf, predictions, anomaly_scores

# Run the pipeline only when this code is executed directly; the fitted model,
# the test-set predictions, and the anomaly scores returned by main() are kept
# in module-level names.
if __name__ == "__main__":
    model, preds, scores = main()



Inf values in data:  0
NaN values in data:  0
Inf values in data:  0
NaN values in data:  0
Number of outliers detected: 3
