In [None]:
import sys 
sys.path.append('..')
import os
import numpy as np
import pandas as pd
from numpy import array, random, arange
from scipy import stats
from icecream import ic
from src.models.base_model import BaseModel
import config

In [None]:
def xicor(X, Y, ties=True):
    """Compute the Xi rank-correlation coefficient of ``Y`` as a function of ``X``.

    The observations are ordered by ``X`` (stable sort, so equal ``X`` values
    keep their input order), the ``Y`` values are replaced by their ranks
    (number of ``Y`` values less than or equal to each one), and the statistic
    is derived from the absolute differences between consecutive ranks.

    Parameters
    ----------
    X, Y : array-like of equal length (list, tuple, ndarray, pandas Series)
        Paired observations. They are converted to NumPy arrays and paired by
        position, so a pandas index is ignored. Missing values are not handled
        here; drop them before calling.
    ties : bool, default True
        If True, tied ranks of ``Y`` are broken at random (with a fixed seed,
        so the result is reproducible) and the tie-aware normalisation is
        used. If False, the closed-form normalisation ``(n**2 - 1) / 3`` is
        used, which assumes ``Y`` has no ties.

    Returns
    -------
    numpy.float64
        The coefficient, or NaN when it is undefined: fewer than two
        observations, or (with ``ties=True``) a constant ``Y``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not have the same length.

    Notes
    -----
    Tie breaking uses a private ``RandomState(42)``, which yields the same
    draws as seeding the global generator with 42 but leaves the caller's
    global NumPy random state untouched.

    NOTE(review): the ``ties=True`` denominator is built from the "<=" rank
    counts. The published definition of the coefficient appears to use the
    ">=" counts there; the two agree when ``Y`` has no ties but differ
    otherwise. Kept as-is to preserve existing results -- confirm the intent.
    """
    x_values = np.asarray(X)
    y_values = np.asarray(Y)
    n = len(x_values)
    if len(y_values) != n:
        raise ValueError(
            f"X and Y must have the same length (got {n} and {len(y_values)})"
        )
    if n < 2:
        # No consecutive pair exists, so the statistic is undefined.
        return np.float64(np.nan)

    # Positional, stable ordering of the observations by X.
    order = np.argsort(x_values, kind="stable")
    y_by_x = y_values[order]

    # l[i] = number of Y values <= y_by_x[i]  (the "max" rank of each value).
    l = np.searchsorted(np.sort(y_by_x), y_by_x, side="right")

    if not ties:
        return 1 - 3 * np.sum(np.abs(np.diff(l))) / (n**2 - 1)

    denominator = 2 * np.sum(l * (n - l))
    if denominator == 0:
        # Every rank equals n, i.e. Y is constant: the statistic is undefined.
        return np.float64(np.nan)

    rng = np.random.RandomState(42)
    r = l.copy()
    # Walk the positions in order so tie groups are resolved in order of first
    # appearance. A group of k tied values with max rank v is replaced by a
    # random permutation of v, v-1, ..., v-k+1; once resolved its members are
    # unique and are skipped on later visits.
    for j in range(n):
        tie_mask = r == r[j]
        tie_count = int(tie_mask.sum())
        if tie_count > 1:
            r[tie_mask] = rng.choice(
                r[tie_mask] - np.arange(tie_count), tie_count, replace=False
            )
    return 1 - n * np.sum(np.abs(np.diff(r))) / denominator

In [None]:
# Collect one record per (dataset, column pair, correlation measure).
correlation_results = []

data_dir = os.path.join('..', config.DATA_FOLDER)

# os.listdir returns entries in arbitrary order; sort them so the row order of
# the exported CSV is identical on every run and every machine.
for dataset in sorted(os.listdir(data_dir)):
    path = os.path.join(data_dir, dataset)
    name = os.path.basename(path)

    # Read data using BaseModel, then keep only complete rows. ignore_index
    # renumbers the remaining rows 0..n-1.
    model = BaseModel(path, name)
    model.read_data()
    y_data = model.y_data.dropna(ignore_index=True)

    # Calculate correlations for every unordered pair of y columns.
    n_columns = y_data.shape[1]
    for i in range(n_columns):
        x = y_data.iloc[:, i]
        for j in range(i + 1, n_columns):
            y = y_data.iloc[:, j]

            # Column names identifying the current pair.
            column_pair = (y_data.columns[i], y_data.columns[j])

            xi = xicor(x, y)
            # pearsonr / spearmanr return (statistic, p-value); only the
            # statistic is recorded.
            pearson_corr, _ = stats.pearsonr(x, y)
            spearman_corr, _ = stats.spearmanr(x, y)

            measures = (
                ('XiCor', xi),
                ('Pearson', pearson_corr),
                ('Spearman', spearman_corr),
            )
            for correlation_type, value in measures:
                correlation_results.append({
                    'Dataset': name,
                    'Correlation_Type': correlation_type,
                    'Column_Pair': column_pair,
                    'Value': value
                })

# Convert the results to a DataFrame (one row per record) and export it to the
# current working directory.
correlation_df = pd.DataFrame(correlation_results)
correlation_df.to_csv('correlation_results.csv', index=False)

In [None]:
# Show the collected correlation results as this cell's output.
correlation_df