In [None]:
import os
import pandas as pd

def sum(csv_file):
    """Return the grand total of the data cells of a CSV matrix.

    The file is read with its first row as the header and its first column
    as the row index. The first remaining data column is then dropped and
    every other cell is added up.

    NOTE(review): ``index_col=0`` already removes the first file column from
    the data, so ``iloc[:, 1:]`` discards one more column. Confirm that this
    second drop is intended.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file to read.

    Returns
    -------
    The total of the retained cells, or ``None`` when reading or summing
    fails (the error is printed rather than raised).
    """
    try:
        table = pd.read_csv(csv_file, index_col=0, header=0)
        # Skip the first data column, then total everything that is left.
        total = table.iloc[:, 1:].to_numpy().sum()
    except Exception as err:
        print(f"Error processing {csv_file}: {err}")
        return None
    return total

In [None]:
folder_path = '/Users/naufaamirani/Documents/Columbia/CBMFW4761/deeptfni_colab/Input_data'

# Report the total for every CSV file in the folder.
# sorted() makes the report order stable; os.listdir order is arbitrary.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        # Store the result under its own name: assigning it to `sum` would
        # replace the function and make the next iteration fail.
        total = sum(file_path)
        print(f"Sum of 1's in {file_name}: {total}")

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

def process_adj(path):
    """Load a tab-separated adjacency table whose first line holds the column names.

    The file is read without a header, the first row is promoted to the
    column labels, and the remaining rows are re-indexed from 0.

    The integer cast is all-or-nothing: if any column cannot be converted
    (for example a label column, if one is present), the error is ignored
    and the frame is returned with its values unchanged.

    Parameters
    ----------
    path : str
        Path of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        The data rows, labelled by the first row of the file.
    """
    raw = pd.read_csv(path, sep='\t', header=None, index_col=None)
    header_row = raw.iloc[0]
    # Everything below the header row, renumbered from 0.
    body = raw.iloc[1:].reset_index(drop=True)
    body.columns = header_row
    return body.astype(int, errors='ignore')

def confusion(y_true_df, y_pred_df):
    """Score a predicted adjacency table against the reference table.

    Both frames are flattened row by row and binarised: a cell counts as 1
    when its text form is exactly ``'1'`` (so both the string ``'1'`` and the
    integer ``1`` qualify) and as 0 otherwise. Any other cell, such as a
    label, is therefore treated as 0.

    When at least one true positive exists, the 2x2 confusion matrix is
    drawn as a heatmap; otherwise a message is printed and nothing is drawn.

    Parameters
    ----------
    y_true_df : pandas.DataFrame
        Reference (ground-truth) table.
    y_pred_df : pandas.DataFrame
        Predicted table; it must flatten to the same number of cells.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, true_positive_indices)``. The last
        item is a list of positions in the flattened arrays where both the
        reference and the prediction are 1; it is empty when there are none.
    """
    # Compare on the text form so the result does not depend on whether the
    # frames hold strings or integers.
    y_true_numeric = (y_true_df.values.flatten().astype(str) == '1').astype(int)
    y_pred_numeric = (y_pred_df.values.flatten().astype(str) == '1').astype(int)

    # Fixing the label set keeps the matrix 2x2 even if one class is absent,
    # so it always matches the heatmap tick labels below.
    cm = confusion_matrix(y_true_numeric, y_pred_numeric, labels=[0, 1])

    accuracy = accuracy_score(y_true_numeric, y_pred_numeric)
    precision = precision_score(y_true_numeric, y_pred_numeric, average='binary')
    recall = recall_score(y_true_numeric, y_pred_numeric, average='binary')

    # Flat positions where the true label and the predicted label are both 1.
    # All of them are returned. Comparing a flat position with the number of
    # columns would keep only the true positives of the first row.
    true_positive_indices = np.flatnonzero(
        (y_true_numeric == 1) & (y_pred_numeric == 1)
    ).tolist()

    if not true_positive_indices:
        print("No true positives found.")
        return accuracy, precision, recall, []

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()
    return accuracy, precision, recall, true_positive_indices

In [None]:
import os
import pandas as pd

pred_dir = "/Users/naufaamirani/Downloads/results/final adjacency matrix/binary"
true_dir = "/Users/naufaamirani/Downloads/results/initial adjacency matrix"

# Per-sample results, filled in parallel: one entry per prediction file.
names = []
accuracies = []
precisions = []
recalls = []
tp_indices = []  # length of the index list returned by confusion()

# sorted() makes the processing order stable; os.listdir order is arbitrary.
for file in sorted(os.listdir(pred_dir)):
    pred_path = os.path.join(pred_dir, file)
    # os.listdir also returns hidden entries (e.g. .DS_Store) and
    # sub-directories; neither is a prediction table.
    if file.startswith('.') or not os.path.isfile(pred_path):
        continue

    print(file+'\n')
    y_pred = process_adj(pred_path)
    # The sample id is the first four underscore-separated tokens of the file name.
    sample = "_".join(file.split("_")[0:4])
    print(sample+'\n')

    # COMBO & GENE use the same initial adjacency
    if sample.startswith("COMBO"):
        true_sample = sample.replace("COMBO", "GENE")
    # SOLO & RNASEQ use the same initial adjacency
    elif sample.startswith("RNASEQ"):
        true_sample = sample.replace("RNASEQ", "SOLO")
    else:
        true_sample = sample
    y_true_path = os.path.join(true_dir, true_sample + "_subset.txt")

    print(y_true_path)
    y_true = process_adj(y_true_path)

    # Get scores
    accuracy, precision, recall, tp_i = confusion(y_true, y_pred)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)

    names.append(sample)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    tp_indices.append(len(tp_i))

In [None]:
def split_data_by_third_token(names, accuracies):
    """Group names and their scores by one underscore-separated token of the name.

    The grouping key is ``name.split('_')[3]``, i.e. the token at index 3.
    Groups are returned in sorted key order; inside a group the input order
    is kept.

    Parameters
    ----------
    names : list of str
        Sample names; each must contain at least four underscore-separated
        tokens, otherwise ``IndexError`` is raised.
    accuracies : list
        One score per name, paired with ``names`` by position.

    Returns
    -------
    tuple of (list of list, list of list)
        ``(names_lists, accuracies_lists)``, one inner list per group, with
        the two outer lists aligned.
    """
    # Key of every name, computed up front so each key gets a bucket.
    keys = [name.split('_')[3] for name in names]
    buckets = {key: ([], []) for key in sorted(set(keys))}

    for name, key, score in zip(names, keys, accuracies):
        grouped_names, grouped_scores = buckets[key]
        grouped_names.append(name)
        grouped_scores.append(score)

    names_lists = [pair[0] for pair in buckets.values()]
    accuracies_lists = [pair[1] for pair in buckets.values()]
    return names_lists, accuracies_lists

In [None]:
# Group the sample names and their accuracies by the token at index 3 of each
# name, then print the number of groups and the first name of the first group.
names_lists, accuracies_lists = split_data_by_third_token(names, accuracies)
print(len(names_lists))
print(names_lists[0][0])

In [None]:
# Group the precision scores the same way, then print the number of groups and
# the first precision of the first group.
names_lists, precisions_lists = split_data_by_third_token(names, precisions)
print(len(precisions_lists))
print(precisions_lists[0][0])

In [None]:
# Group the recall scores by the token at index 3 of each sample name.
# The result is stored in `recalls_lists`, which the plotting cells read.
# Assigning it to `precisions_lists` would overwrite the precision groups and
# leave `recalls_lists` undefined.
names_lists, recalls_lists = split_data_by_third_token(names, recalls)
print(len(recalls_lists))
print(recalls_lists[0][0])

In [None]:
def plot(nams, accs, name):
    """Print and plot the mean score per (category, subcategory) pair.

    Each name is split on ``'_'``; token 0 is the category and token 1 the
    subcategory. Scores sharing a pair are averaged, multiplied by 100 and
    rounded to two decimals. The averages are printed as percentages and
    drawn as a grouped bar chart: one cluster per subcategory and one bar per
    category.

    Parameters
    ----------
    nams : list of str
        Sample names with at least two underscore-separated tokens.
    accs : list of float
        One score per name, paired with ``nams`` by position.
    name : str
        Label appended to the chart title.

    Returns
    -------
    None
    """
    # Collect the scores of every (category, subcategory) pair.
    # category = COMBO/GENE/RNASEQ/SOLO
    # subcategory = ALS/CTR
    category_subcategory_values = {}
    for filename, value in zip(nams, accs):
        tokens = filename.split('_')
        key = (tokens[0], tokens[1])
        category_subcategory_values.setdefault(key, []).append(value)

    if not category_subcategory_values:
        # Nothing to average; an empty legend (ncol=0) is not drawable.
        print("No data to plot.")
        return

    # Keep the averages numeric for plotting; format them only when printing.
    # Passing the "12.34%" strings to ax.bar would make matplotlib treat the
    # heights as categories.
    averages = {}
    for key, value_list in category_subcategory_values.items():
        avg = np.mean(value_list)*100
        rounded_avg = round(avg, 2)
        averages[key] = rounded_avg
        percentage_avg = "{:.2f}%".format(rounded_avg)
        print(str(key) + ": " + str(percentage_avg))

    categories = sorted(set(category for category, subcategory in averages.keys()))
    subcategories = sorted(set(subcategory for category, subcategory in averages.keys()))

    # One row of bar heights per category, ordered by subcategory.
    # A pair with no samples becomes NaN (no bar) instead of raising KeyError.
    data = [[averages.get((category, subcategory), np.nan) for subcategory in subcategories]
            for category in categories]
    num_subcategories = len(subcategories)

    bar_width = 0.1
    fig, ax = plt.subplots(figsize=(6, 6))
    for i, category in enumerate(categories):
        # Shift each category's bars sideways so they sit next to each other.
        x = np.arange(num_subcategories) + i * bar_width
        ax.bar(x, data[i], width=bar_width, label=category)
    ax.set_xlabel('Subcategory')
    ax.set_ylabel('Average Values')
    # NOTE(review): the title mentions false-positive counts, but the bars
    # show the averaged scores passed in `accs` — confirm the wording.
    ax.set_title('Count of False Positives by Model Types ('+name+')')
    # Centre the tick under each cluster of bars.
    ax.set_xticks(np.arange(num_subcategories) + (len(categories) - 1) * bar_width / 2)
    ax.set_xticklabels(subcategories)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), shadow=True, ncol=len(categories))
    plt.show()

In [None]:
# Plot the recall averages of the first group.
# Assumes group 0 is the "Astro" group (groups come back in sorted token
# order) — TODO confirm.
plot(nams=names_lists[0], accs=recalls_lists[0], name="Astro")

In [None]:
# Plot the recall averages of the second group.
# Assumes group 1 is the "Exc" group (groups come back in sorted token
# order) — TODO confirm.
plot(nams=names_lists[1], accs=recalls_lists[1], name="Exc")