In [13]:
import pandas as pd
import time
from scipy import stats
from tqdm import tqdm
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
import networkx as nx

In [105]:
def get_correlation_dataframe(df):
    """Compute pairwise Pearson correlations between the rows of ``df``.

    Every unordered pair of rows is evaluated exactly once, including each
    row paired with itself (upper triangle plus diagonal, in row order).

    Rows are paired by position rather than by index label, so the result
    is well defined even when the index contains duplicate or unsorted
    labels (a label-based ``df.loc[idx1:, :]`` slice raises or repeats
    pairs in that situation).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample; the full row is passed to ``stats.pearsonr``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample1`` and ``sample2`` (index labels of the pair),
        ``r`` and ``p`` (the two values returned by ``stats.pearsonr``).
    """
    labels = list(df.index)
    # Convert once instead of materialising a Series per row in both loops.
    values = df.to_numpy()
    n_rows = len(labels)

    col_1 = []
    col_2 = []
    col_r = []
    col_p = []

    for i in tqdm(range(n_rows), total=n_rows):
        # Start at i so each pair (and the self-pair) is visited once.
        for j in range(i, n_rows):
            r, p = stats.pearsonr(values[i], values[j])
            col_1.append(labels[i])
            col_2.append(labels[j])
            col_r.append(r)
            col_p.append(p)

    corr_df = pd.DataFrame({
        "sample1": col_1,
        "sample2": col_2,
        "r": col_r,
        "p": col_p
    })
    return corr_df


def merge_correlation_dataframes(dfs):
    """Merge correlation frames, keeping per row the frame with the largest ``r``.

    The ``r`` columns of all frames are stacked and, for every row position,
    the frame holding the maximum is chosen (``np.argmax`` picks the first
    frame on ties). The selected rows are concatenated and sorted by index.
    """
    stacked_r = np.array([frame.r for frame in dfs])
    winner = np.argmax(stacked_r, axis=0)

    pieces = []
    for position, frame in enumerate(dfs):
        # Boolean mask: rows for which this frame holds the largest r.
        pieces.append(frame.loc[winner == position])

    merged = pd.concat(pieces)
    return merged.sort_index()


def build_edge_list(df, r_filter, p_filter):
    """Select rows with ``r >= r_filter`` and ``p <= p_filter`` as an edge list.

    The ``sample1``/``sample2`` columns are renamed to ``source``/``target``;
    all other columns are returned unchanged.
    """
    strong_enough = df.r.ge(r_filter)
    significant = df['p'].le(p_filter)
    selected = df.loc[strong_enough & significant]
    return selected.rename(columns={'sample1':'source', 'sample2':'target'})

def filter_relevant_connections(df, threshold):
    """Return the rows of ``df`` whose ``weight`` is at least ``threshold``."""
    heavy_enough = df.weight >= threshold
    return df.loc[heavy_enough]


def generate_csvs(edges_df, class_df, max_each_feature=100, multi_omics=True):
    """Write ``edges.csv``, ``classes.csv`` and ``features.csv`` under ``save_dir``.

    Reads the names ``save_dir``, ``base`` and ``cancer`` from the enclosing
    (notebook/module) scope; none of them is a parameter, so they must be
    defined before this function is called.

    Parameters
    ----------
    edges_df : pandas.DataFrame
        Written to ``edges.csv`` without its index.
    class_df : pandas.DataFrame
        Written to ``classes.csv`` with its index; its index also selects
        the rows written to ``features.csv``.
    max_each_feature : int
        Number of leading rows kept from each input CSV.
    multi_omics : bool
        When true, the mRNA, miRNA and methylation tables are stacked;
        otherwise only the mRNA table is used. (A CNV table is not loaded.)
    """
    Path(save_dir).mkdir(parents=True, exist_ok=True)

    edges_df.to_csv(save_dir+"edges.csv", index=False)
    class_df.to_csv(save_dir+"classes.csv", index=True)

    # Each table is truncated to its first `max_each_feature` rows.
    omics_tables = [
        pd.read_csv(f"{base}{cancer}_mRNA.csv", index_col=0).iloc[:max_each_feature, :]
    ]
    if multi_omics:
        omics_tables.append(
            pd.read_csv(f"{base}{cancer}_miRNA.csv", index_col=0).iloc[:max_each_feature, :]
        )
        omics_tables.append(
            pd.read_csv(f"{base}{cancer}_Methy.csv", index_col=0).iloc[:max_each_feature, :]
        )
        # Stack the tables row-wise, then transpose.
        features_df = pd.concat(omics_tables).T
    else:
        features_df = omics_tables[0].T

    # Keep only the rows labelled by class_df's index.
    selected = features_df.loc[class_df.index, :]
    selected.to_csv(save_dir+"features.csv", index=True)
    return



def get_correlation_vector(A, B, th):
    """Return the thresholded strict lower triangle of a column-wise Pearson matrix.

    The correlation matrix ``pcorr`` has shape ``(B.shape[1], A.shape[1])``
    with ``pcorr[k, j]`` the Pearson correlation between column ``k`` of
    ``B`` and column ``j`` of ``A``. Entries strictly below the main
    diagonal are flattened into a 1-D vector.

    Parameters
    ----------
    A, B : array-like, 2-D
        Must have the same number of rows (observations).
    th : float
        Entries whose absolute value is greater than ``th`` are set to 0.
        NOTE(review): this discards the *strong* correlations and keeps the
        weak ones -- confirm that this direction is intended.

    Returns
    -------
    numpy.ndarray
        1-D vector of the strict lower-triangle entries after thresholding.
        Columns with zero variance yield NaN entries (division by zero).
    """
    A = np.asarray(A)
    B = np.asarray(B)

    # Number of observations (rows), shared by A and B.
    N = B.shape[0]

    # Column-wise sums, reused in several terms below.
    sA = A.sum(0)
    sB = B.sum(0)

    # The four parts of the Pearson formula, computed one by one.
    p1 = N*np.einsum('ij,ik->kj',A,B)
    p2 = sA*sB[:,None]
    p3 = N*((B**2).sum(0)) - (sB**2)
    p4 = N*((A**2).sum(0)) - (sA**2)

    # Pearson correlation coefficients as a 2-D array (rows: B, columns: A).
    pcorr = ((p1 - p2)/np.sqrt(p4*p3[:,None]))

    # Take the triangle dimensions from pcorr itself; passing A's column
    # count as the row count is only correct when A and B are equally wide.
    n_rows, n_cols = pcorr.shape
    corr_vec = pcorr[np.tril_indices(n=n_rows, k=-1, m=n_cols)]
    corr_vec[np.absolute(corr_vec) > th] = 0
    return corr_vec


def get_classification_vector(sample, s1, s2, s3, s4, th):
    """Build one feature vector for ``sample`` from four reference groups.

    For each of ``s1`` .. ``s4`` (in that order) the sample is appended as an
    extra row, ``get_correlation_vector`` is evaluated on the stacked matrix
    against itself with threshold ``th``, and the four resulting vectors are
    concatenated.
    """
    segments = []
    for reference_rows in (s1, s2, s3, s4):
        stacked = np.vstack([reference_rows, sample])
        segments.append(get_correlation_vector(stacked, stacked, th))
    return np.concatenate(segments)
    
    
def build_dataset(features, classes, th, seed):
    """Split the labels and turn every train/test sample into a feature vector.

    ``classes`` is split (stratified, seeded with ``seed``) into a training
    part and a remainder (``test_size=0.6``); the remainder is split again in
    half into a "base" part and a test part. The base samples, grouped by
    their ``'class'`` value (``stage1`` .. ``stage4``), serve as reference
    groups for ``get_classification_vector``.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per sample, indexed like ``classes``.
    classes : pandas.DataFrame
        Must contain a ``'class'`` column.
    th : float
        Threshold forwarded to ``get_classification_vector``.
    seed : int
        ``random_state`` for both splits.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` are lists
        with one vector per sample and the ``y_*`` are the label frames.
    """
    y_train, y_base_test = train_test_split(
        classes, stratify=classes, test_size=0.6, random_state=seed, shuffle=True
    )
    y_base, y_test = train_test_split(
        y_base_test, stratify=y_base_test, test_size=0.5, random_state=seed, shuffle=True
    )

    def reference_rows(stage_name):
        # Feature rows of the base samples labelled with this stage.
        members = y_base.loc[y_base['class'] == stage_name].index
        return features.loc[members, :]

    s1 = reference_rows('stage1')
    s2 = reference_rows('stage2')
    s3 = reference_rows('stage3')
    s4 = reference_rows('stage4')

    def vectorise(labels):
        # One classification vector per sample, in the order of `labels`.
        subset = features.loc[labels.index, :]
        return [
            get_classification_vector(row.values, s1, s2, s3, s4, th)
            for _, row in tqdm(subset.iterrows(), total=subset.shape[0])
        ]

    x_train = vectorise(y_train)
    x_test = vectorise(y_test)

    return x_train, y_train, x_test, y_test

In [15]:
# Experiment configuration.
cancer = "KIRC"
threshold_r = 0.7
threshold_p = 0.05

# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this directory exists.
base = f"C:/Users/colombelli/Desktop/TCC/experiments_extra_40/{cancer}/"

# Load the mRNA table and transpose it, so the CSV's columns become the rows.
df = pd.read_csv(f"{base}{cancer}_mRNA.csv", index_col=0).T
# NOTE(review): build_class_df is not defined in the visible cells; it must
# be defined earlier in the kernel session for this cell to run.
df_classes = build_class_df(list(df.index), agglutinate_stages=False).dropna()
# Keep only the samples that still have a class after dropna(). The frame
# built above is `df_classes`; the previous `class_df` was never defined in
# this notebook and only worked through leftover kernel state.
df = df.loc[df_classes.index, :]

#corr_df = get_correlation_dataframe(df)

In [106]:
# Build the train/test feature vectors (correlation threshold 0.7,
# split seed 2643643).
x_train, y_train, x_test, y_test = build_dataset(
    features=df,
    classes=df_classes,
    th=0.7,
    seed=2643643,
)

100%|████████████████████████████████████████████████████████████████████████████████| 125/125 [01:17<00:00,  1.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 94/94 [01:02<00:00,  1.51it/s]


In [114]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported accuracy is reproducible; without a
# random_state every run of this cell yields a different score.
clf = RandomForestClassifier(random_state=2643643)
clf.fit(np.array(x_train), y_train['class'].values)
# Mean accuracy on the held-out test samples (displayed as the cell output).
clf.score(np.array(x_test), y_test['class'].values)

0.46808510638297873