In [36]:
import pandas as pd
import pingouin as pg
import time
from scipy import stats
from tqdm import tqdm
import numpy as np
from pathlib import Path
import snf
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
import networkx as nx
import itertools
from copy import deepcopy

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [53]:
def get_correlation_dataframe(df):
    col_1 = []
    col_2 = []
    col_r = []
    col_p = []
    
    for idx1, row1 in tqdm(df.iterrows(), total=df.shape[0]):
        for idx2, row2 in df.loc[idx1:, :].iterrows():
            r, p = stats.pearsonr(row1.values, row2.values)
            col_1.append(idx1)
            col_2.append(idx2)
            col_r.append(r)
            col_p.append(p)
            
    corr_df = pd.DataFrame.from_dict({
        "sample1": col_1,
        "sample2": col_2,
        "r": col_r,
        "p": col_p
    })
    return corr_df


def merge_correlation_dataframes(dfs):
    
    greatest_r = np.argmax(np.array([df.r for df in dfs]), axis=0)
    to_concat = [df.loc[greatest_r == i] for i, df in enumerate(dfs)]
    return pd.concat(to_concat).sort_index()


def build_edge_list(df, r_filter, p_filter):
    edges_df = df.loc[(df.r >= r_filter) & (df['p'] <= p_filter)]
    return edges_df.rename(columns={'sample1':'source', 'sample2':'target'})

def filter_relevant_connections(df, threshold):
    return df.loc[(df.weight >= threshold)]


def get_stage_class_from_patient(patient_idx, clin_df, agglutinate_stages=False):
    stage_str = clin_df.loc[patient_idx, "pathologic_stage"]
    
    if stage_str in ["stage i"+suffix for suffix in ['', 'a','b','c']]:
        return "stage1"
    elif stage_str in ["stage ii"+suffix for suffix in ['', 'a','b','c']]:
        if agglutinate_stages:
            return "stage23"
        else:
            return "stage2"
    elif stage_str in ["stage iii"+suffix for suffix in ['', 'a','b','c']]:
        if agglutinate_stages:
            return "stage23"
        else:
            return "stage3"
    elif stage_str in ["stage iv"+suffix for suffix in ['', 'a','b','c']]:
        return "stage4"
    else:
        return np.nan


def build_class_df(sample_idxs, agglutinate_stages=False):
    
    clin_df = pd.read_csv(f"{base}{cancer}_clin.txt", sep="\t", index_col=0).T.iloc[:, [6]]
    
    class_col = []
    for idx in sample_idxs:
        patient_idx = '-'.join(idx.split('.')[:-1]).lower()
        sample_type = int(idx.split('.')[-1])
        
        if sample_type <= 9:   # Tumor sample
            class_col.append(get_stage_class_from_patient(patient_idx, clin_df, agglutinate_stages))
        elif sample_type <= 19:   # Normal sample
            class_col.append('normal')
        elif sample_type <= 29:   # Control sample
            print(f"Warning! Found control sample {idx}, Skipping...")
            continue
        else:
            print(f"Warning! Found unexpected sample type: {idx}. Skipping...")
                  
    return pd.DataFrame.from_dict({
            "id": sample_idxs,
            "class": class_col
        }).set_index("id")


def get_consistency_index(corr_df, class_df):
    correct_connections = 0
    for index, row in corr_df.iterrows():
        src_class = class_df.loc[row[0], "class"]
        trg_class = class_df.loc[row[1], "class"]
        
        if src_class == trg_class:
            correct_connections += 1
    return correct_connections/len(corr_df)


def generate_csvs(edges_df, class_df, max_each_feature=100, multi_omics=True):
    
    Path(save_dir).mkdir(parents=True, exist_ok=True)
    
    edges_df.to_csv(save_dir+"edges.csv", index=False)
    class_df.to_csv(save_dir+"classes.csv", index=True)
    
    gene = pd.read_csv(f"{base}{cancer}_mRNA.csv", index_col=0).iloc[:max_each_feature, :]
    if multi_omics:
        mirna = pd.read_csv(f"{base}{cancer}_miRNA.csv", index_col=0).iloc[:max_each_feature, :]
        meth = pd.read_csv(f"{base}{cancer}_Methy.csv", index_col=0).iloc[:max_each_feature, :] 
        #cnv = pd.read_csv(f"{base}{cancer}_CNV.csv", index_col=0).iloc[:max_each_feature, :]
        #features_df = pd.concat([gene,mirna,meth,cnv]).T
        features_df = pd.concat([gene,mirna,meth]).T
    else:
        features_df = gene.T
    
    features_df.loc[class_df.index, :].to_csv(save_dir+"features.csv", index=True)
    return


def get_snf_network(dfs_values_matrices, class_df):
    affinity_networks = snf.make_affinity(dfs_values_matrices)
    fused_network = snf.snf(affinity_networks)
    return fused_network
    #np.fill_diagonal(fused_network, 1)
    #G = nx.from_pandas_adjacency(pd.DataFrame(data=fused_network, index=class_df.index.values, columns=class_df.index.values), create_using=nx.Graph())
    #return nx.to_pandas_edgelist(G)
    
def get_best_performance_sc(true, pred):
    classes = ['stage1','stage2','stage3','stage4']
    possible_orders = list(itertools.permutations(classes))
    pred=np.array(pred)
    
    best_acc = 0
    best_prec = 0
    best_recall = 0
    for order in possible_orders:
        replace_dict = {
            order[0]: 0,
            order[1]: 1,
            order[2]: 2,
            order[3]: 3
            }
        
        true_cp = deepcopy(true)
        for k, v in replace_dict.items(): 
            true_cp[true==k] = v

        true_cp=np.array(true_cp)
        acc = accuracy_score(list(true_cp), list(pred))
        precision, recall, _, _ = precision_recall_fscore_support(list(true_cp), list(pred))
        if acc > best_acc:
            best_acc = acc
            best_prec = precision
            best_recall = recall
            
    return best_acc, best_prec, best_recall

In [51]:
cancers = ["COAD", "KIRC", "LUAD"]
predictions = {}
true_class = {}

for cancer in cancers:
    print(cancer)
    base = f"C:/Users/colombelli/Desktop/TCC/experiments/{cancer}/"

    dfs_values = []

    df1 = pd.read_csv(f"{base}{cancer}_mRNA.csv", index_col=0).T
    class_df = build_class_df(list(df1.index), agglutinate_stages=False).dropna()
    dfs_values.append(df1.loc[class_df.index, :].values)

    df2 = pd.read_csv(f"{base}{cancer}_miRNA.csv", index_col=0).T
    dfs_values.append(df2.loc[class_df.index, :].values)

    df3 = pd.read_csv(f"{base}{cancer}_Methy.csv", index_col=0).T
    dfs_values.append(df3.loc[class_df.index, :].values)
    
    affinity_matrix = get_snf_network(dfs_values, class_df)
    
    labels = spectral_clustering(affinity_matrix, n_clusters=4)
    predictions[cancer] = labels
    true_class[cancer] = class_df['class'].values

COAD
KIRC
LUAD


In [55]:
for cancer in cancers:
    print(cancer)
    print(get_best_performance_sc(true_class[cancer], predictions[cancer]))

COAD
(0.2978723404255319, array([0.37777778, 0.234375  , 0.32631579, 0.12121212]), array([0.30357143, 0.34090909, 0.36470588, 0.09756098]))
KIRC
(0.3738019169329074, array([0.41558442, 0.56451613, 0.07692308, 0.14141414]), array([0.44444444, 0.46052632, 0.03225806, 0.24137931]))
LUAD
(0.2966292134831461, array([0.27906977, 0.56428571, 0.06930693, 0.18644068]), array([0.22018349, 0.32377049, 0.36842105, 0.30136986]))
