Fuse

In [None]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Input folder (downloaded CSV files) and output folder (one fused CSV per
# category), both resolved relative to the current user's home directory.
raw_data_directory, output_directory = map(
    os.path.expanduser,
    ("~/Desktop/Trioptima/Raw data", "~/Desktop/Trioptima/Categorized Data"),
)

# Make sure the output folder is there before anything is written to it.
os.makedirs(output_directory, exist_ok=True)

# Categories to fuse in this run. Each name is joined onto the raw data
# directory to find that category's CSV files.
# Other names noted by the author: RT.CDS, RT.EQUITY, RT.IRS, RT.COMMODITY.
categories = ['RT.FX']

# Read a single CSV file and return it together with its category label
def process_file(file_path, category):
    """Load one CSV file as text and pair it with its category.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    category : str
        Category label; returned unchanged so the caller can match the
        result to the category it submitted.

    Returns
    -------
    tuple
        ``(category, DataFrame)``. Every column is read with ``dtype=str``
        and missing cells are returned as empty strings. If the file is
        empty or cannot be parsed, the error is printed and an empty
        DataFrame is returned instead.
    """
    try:
        df = pd.read_csv(file_path, dtype=str)
    except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # A zero-byte file raises EmptyDataError, which is not a ParserError;
        # both are reported and skipped so one bad file cannot abort the batch.
        print(f"Error processing {file_path}: {e}")
        return category, pd.DataFrame()

    # read_csv turns blank cells (and the literal text "nan") into real NaN
    # values, so a string replace never matches them; fillna does.
    return category, df.fillna('')

# Fuse all CSV files of each category into a single "<category>_Data.csv"
for category in categories:
    category_directory = os.path.join(raw_data_directory, category)

    # isdir (not exists): a regular file with the category's name would make
    # os.listdir below raise.
    if not os.path.isdir(category_directory):
        print(f"Category directory not found: {category_directory}")
        continue

    # Sorted so the row order of the fused file is the same on every run;
    # os.listdir returns entries in arbitrary order.
    csv_paths = [
        os.path.join(category_directory, filename)
        for filename in sorted(os.listdir(category_directory))
        if filename.lower().endswith('.csv')
    ]

    frames = []
    with ThreadPoolExecutor() as executor:
        # Read the files in parallel; results are collected in submission order.
        futures = [executor.submit(process_file, path, category) for path in csv_paths]

        for processed_files_count, future in enumerate(futures, start=1):
            # The category echoed back by process_file is ignored here so the
            # loop variable is not rebound in the middle of the iteration.
            _, df = future.result()
            frames.append(df)

            # Print the count of processed files after each file
            print(f"Processed {processed_files_count} files from {category} category.")

    # Concatenate once at the end: concatenating inside the loop re-copies
    # every accumulated row for each new file. pd.concat rejects an empty
    # list, so a folder without CSV files yields an empty frame.
    combined = pd.concat(frames, sort=False) if frames else pd.DataFrame()

    # Dictionary holding the fused frame of the current category (name kept
    # so later cells that look it up continue to work).
    category_dataframes = {category: combined}

    # Save the fused frame to a separate CSV file for the current category
    category_output_path = os.path.join(output_directory, f"{category}_Data.csv")
    combined.to_csv(category_output_path, index=False)
    print(f"{category} Data saved at: {category_output_path}")


Import

In [None]:
from ftplib import FTP
import os
import zipfile

# Remote source: FTP host name and the directory on it to walk
ftp_host, ftp_directory = "ftp.cmegroup.com", "/sdr"

# Local destination for downloaded .csv files and for the contents of
# downloaded .zip archives
local_directory = os.path.expanduser("~/Desktop/Trioptima/Raw data")

# Helper: stream one remote file to disk, discarding partial output on failure
def _retrieve_to_disk(ftp, remote_file_path, local_file_path):
    """Download ``remote_file_path`` into ``local_file_path`` in binary mode.

    If the transfer raises, the partially written local file is removed and
    the exception is re-raised, so a truncated file is never left behind.
    """
    try:
        with open(local_file_path, 'wb') as local_file:
            ftp.retrbinary(f"RETR {remote_file_path}", local_file.write)
    except Exception:
        if os.path.exists(local_file_path):
            os.remove(local_file_path)
        raise


# Function to download files from FTP server recursively
def download_files_from_ftp(ftp, ftp_directory, local_directory):
    """Download every .csv/.zip below ``ftp_directory`` into ``local_directory``.

    Parameters
    ----------
    ftp : object
        Connected FTP client; only ``cwd``, ``nlst`` and ``retrbinary`` are
        used.
    ftp_directory : str
        Remote directory to walk. Remote paths are built by appending entry
        names to this value, so it should be an absolute path.
    local_directory : str
        Existing local folder. All files land directly in it; the remote
        sub-directory structure is not recreated.

    Behaviour
    ---------
    * ``.csv`` entries are downloaded as they are.
    * ``.zip`` entries are downloaded, extracted into ``local_directory``
      and the archive is then deleted.
    * Entries whose name contains no ``.`` are treated as sub-directories and
      walked recursively (a naming heuristic, not a server-side check).
    * Anything else is ignored.

    Errors are printed, never raised. A failure on one entry does not stop
    the remaining entries from being processed.
    """
    try:
        ftp.cwd(ftp_directory)
        files = ftp.nlst()
    except Exception as e:
        print(f"Failed to download files: {e}")
        return

    # Avoid a doubled slash when the caller passes a directory ending in '/'.
    remote_base = ftp_directory.rstrip('/')

    for file_name in files:
        remote_file_path = remote_base + '/' + file_name
        local_file_path = os.path.join(local_directory, file_name)
        lowered_name = file_name.lower()
        try:
            if lowered_name.endswith('.csv'):
                _retrieve_to_disk(ftp, remote_file_path, local_file_path)
                print(f"Downloaded .csv file: {file_name}")
            elif lowered_name.endswith('.zip'):
                _retrieve_to_disk(ftp, remote_file_path, local_file_path)
                print(f"Downloaded .zip file: {file_name}")
                # Extract .zip file
                with zipfile.ZipFile(local_file_path, 'r') as zip_ref:
                    zip_ref.extractall(local_directory)
                print(f"Extracted .zip file: {file_name}")
                # Delete the original .zip file
                os.remove(local_file_path)
                print(f"Deleted .zip file: {file_name}")
            elif '.' not in file_name:
                download_files_from_ftp(ftp, remote_file_path, local_directory)
        except Exception as e:
            # Report and move on: a corrupt archive or a dropped transfer
            # must not skip the rest of the directory.
            print(f"Failed to process {remote_file_path}: {e}")

# The download target must exist before any file is opened inside it
os.makedirs(local_directory, exist_ok=True)

# Open an anonymous session and walk the remote tree. Any exception raised
# while connecting, logging in or downloading is reported, not propagated.
try:
    with FTP(ftp_host) as session:
        session.login()  # no credentials: anonymous login
        download_files_from_ftp(session, ftp_directory, local_directory)
except Exception as error:
    print(f"Failed to connect to FTP server: {error}")


Machine learning algo

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load data
# NOTE(review): 'your_data.csv' is a placeholder path, and StandardScaler
# needs numeric columns only — confirm the file meets that before running.
data = pd.read_csv('your_data.csv')

# Split first so the held-out rows never influence the scaling statistics.
# The row partition depends only on the number of rows and random_state, so
# the same rows end up in each set as when the split is done after scaling.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Normalize: learn mean/variance from the training rows only, then apply
# that same transform to the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(train_data)
x_test = scaler.transform(test_data)

# Whole data set under the train-fitted scaling; name kept so any later code
# that reads `scaled_data` continues to work.
scaled_data = scaler.transform(data)



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Seed TensorFlow's global random generator so that repeated runs are more
# comparable (weights are otherwise initialised differently every time).
tf.random.set_seed(42)

# Define model
input_dim = x_train.shape[1]  # Number of features
encoding_dim = 14  # Length of encoded representation. Adjust based on your data.
bottleneck_dim = encoding_dim // 2  # Width of the narrowest (middle) layers

# Layer widths: input_dim -> encoding_dim -> bottleneck_dim -> bottleneck_dim -> input_dim
input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="tanh")(input_layer)
encoder = Dense(bottleneck_dim, activation="relu")(encoder)
decoder = Dense(bottleneck_dim, activation='tanh')(encoder)
# Linear output: the model is trained to reproduce x_train, which is
# standardized earlier in this notebook and therefore contains negative
# values. A relu output can only emit values >= 0 and could never match them.
decoder = Dense(input_dim, activation='linear')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train model: input and target are the same array (reconstruction task);
# the last 20% of x_train is held out by Keras for validation.
autoencoder.fit(x_train, x_train, epochs=50, batch_size=32, shuffle=True, validation_split=0.2)


In [None]:
import numpy as np

# Reconstruct the test rows and measure, per row, the mean squared
# difference between each original and its reconstruction
reconstructed = autoencoder.predict(x_test)
squared_residuals = np.power(x_test - reconstructed, 2)
mse = squared_residuals.mean(axis=1)

# Anomaly cut-off: the 95th percentile of the per-row errors (tune as needed)
# NOTE(review): the cut-off is taken from the test errors themselves, so about
# 5% of test rows are flagged regardless of the data — confirm this is intended.
threshold = np.quantile(mse, 0.95)

# Rows whose reconstruction error exceeds the cut-off
is_outlier = mse > threshold
outliers = x_test[is_outlier]
