In [34]:
import pandas as pd
import pingouin as pg
import time
from scipy import stats
from tqdm import tqdm
import numpy as np

# Root directory of the TCGA BRCA input files. File names are appended to it by
# plain string concatenation (e.g. base+"BRCA_mRNA.csv"), so the trailing "/"
# must be kept.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
base = "C:/Users/colombelli/Desktop/TCC/data/TCGA/BRCA/"

In [39]:
# Load the BRCA mRNA table, using the first CSV column as the row index.
# NOTE(review): columns appear to be samples (the frame is transposed before the
# per-sample correlation loop further down) — confirm against the file.
df = pd.read_csv(base+"BRCA_mRNA.csv", index_col=0)

stats : pandas.DataFrame
'X': Name(s) of first columns.

'Y': Name(s) of second columns.

'method': Correlation type.

'covar': List of specified covariate(s), only when covariates are passed.

'alternative': Tail of the test.

'n': Sample size (after removal of missing values).

'r': Correlation coefficients.

'CI95': 95% parametric confidence intervals.

'p-unc': Uncorrected p-values.

'p-corr': Corrected p-values.

'p-adjust': P-values correction method.

'BF10': Bayes Factor of the alternative hypothesis (only for Pearson correlation).

'power': Achieved power of the test (= 1 - type II error).

#### Pingouin didn't work: it took far too long to compute what I needed

In [None]:
# Time pingouin's pairwise correlation (Bonferroni-adjusted) on the first 10 rows only.
start = time.perf_counter()
corr_df = pg.pairwise_corr(df.head(10), padjust='bonf')
end = time.perf_counter()

elapsed_seconds = end - start
print("Time taken: ", elapsed_seconds)

In [None]:
# Time pingouin's pairwise correlation on the full table, without p-value adjustment.
start = time.perf_counter()
corr_df_none = pg.pairwise_corr(df, padjust='none')
end = time.perf_counter()

elapsed_seconds = end - start
print("Time taken: ", elapsed_seconds)

In [None]:
# Time pingouin's pairwise correlation (Bonferroni-adjusted) on the first 100 rows.
start = time.perf_counter()
corr_df_100 = pg.pairwise_corr(df.head(100), padjust='bonf')
end = time.perf_counter()

elapsed_seconds = end - start
print("Time taken: ", elapsed_seconds)

#### Computing the correlations by hand, building an edge-list-like DataFrame

In [19]:
# Transpose so that every row of `dt` is one item to correlate.
dt = df.T

# Parallel lists: one entry per (row, later-or-same row) pair.
col_1 = []
col_2 = []
col_r = []
col_p = []

# Pair rows by POSITION rather than with a label slice (`dt.loc[idx1:, :]`):
# a label slice raises on duplicated, non-monotonic labels and returns the
# wrong rows on duplicated sorted labels. Extracting the values once also
# avoids building a Series per row.
dt_labels = dt.index
dt_values = dt.to_numpy()
dt_n_rows = len(dt_labels)

for i in tqdm(range(dt_n_rows), total=dt_n_rows):
    # Start at i: the pair (i, i) is included, exactly as before.
    for j in range(i, dt_n_rows):
        r, p = stats.pearsonr(dt_values[i], dt_values[j])
        col_1.append(dt_labels[i])
        col_2.append(dt_labels[j])
        col_r.append(r)
        col_p.append(p)

100%|████████████████████████████████████████████████████████████████████████████████| 759/759 [00:22<00:00, 34.43it/s]


In [20]:
# Assemble the four parallel lists into one table with one row per pair.
corr_columns = {
    "sample1": col_1,
    "sample2": col_2,
    "r": col_r,
    "p": col_p,
}
corr_df = pd.DataFrame(corr_columns)

In [21]:
# Display the pairwise-correlation table built in the previous cell.
corr_df

Unnamed: 0,sample1,sample2,r,p
0,TCGA.3C.AAAU.01,TCGA.3C.AAAU.01,1.000000,0.000000e+00
1,TCGA.3C.AAAU.01,TCGA.3C.AALI.01,0.011027,6.221119e-01
2,TCGA.3C.AAAU.01,TCGA.3C.AALJ.01,0.161264,4.018140e-13
3,TCGA.3C.AAAU.01,TCGA.3C.AALK.01,-0.167896,4.107702e-14
4,TCGA.3C.AAAU.01,TCGA.5L.AAT0.01,-0.124918,2.082677e-08
...,...,...,...,...
288415,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,0.141578,2.021889e-10
288416,TCGA.XX.A89A.01,TCGA.Z7.A8R6.01,-0.225851,1.509051e-24
288417,TCGA.Z7.A8R5.01,TCGA.Z7.A8R5.01,1.000000,0.000000e+00
288418,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01,-0.284133,1.881507e-38


In [31]:
# Load the clinical table (tab-separated), transpose it, and keep only the
# pathologic stage. The column is selected by name instead of by the former
# magic position (`.iloc[:, [6]]`), so the cell no longer depends on the order
# of the attributes in the file.
clin = pd.read_csv(base+"BRCA_clin.txt", sep="\t", index_col=0).T[["pathologic_stage"]]

In [33]:
# Count how many rows carry each distinct pathologic-stage value.
clin.value_counts()

pathologic_stage
stage iia           358
stage iib           258
stage iiia          156
stage i              90
stage ia             86
stage iiic           65
stage iiib           27
stage iv             20
stage x              14
stage ib              7
stage ii              6
stage iii             2
dtype: int64

In [111]:
def get_correlation_dataframe(df):
    """Compute the Pearson correlation between every pair of rows of ``df``.

    Each row is paired with itself and with every row that comes after it
    (upper triangle including the diagonal), so ``n`` rows produce
    ``n * (n + 1) / 2`` result rows.

    Rows are paired by position, not by index label. The previous label slice
    (``df.loc[idx1:, :]``) raised on duplicated, non-monotonic labels and
    returned the wrong rows on duplicated sorted labels.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per item to correlate; the values of a row are passed as-is
        to ``scipy.stats.pearsonr``.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample1`` and ``sample2`` (index labels of the two rows),
        ``r`` and ``p`` (the two values returned by ``scipy.stats.pearsonr``).
    """
    labels = df.index
    # Extract the values once instead of materialising a Series per row.
    values = df.to_numpy()
    n_rows = len(labels)

    col_1 = []
    col_2 = []
    col_r = []
    col_p = []

    for i in tqdm(range(n_rows), total=n_rows):
        # Start at i: the self-pair (i, i) is part of the output.
        for j in range(i, n_rows):
            r, p = stats.pearsonr(values[i], values[j])
            col_1.append(labels[i])
            col_2.append(labels[j])
            col_r.append(r)
            col_p.append(p)

    return pd.DataFrame({
        "sample1": col_1,
        "sample2": col_2,
        "r": col_r,
        "p": col_p,
    })


def build_edge_list(df, r_filter, p_filter, include_self_loops=True):
    """Turn a pairwise-correlation table into an edge list.

    Keeps the rows whose correlation is at least ``r_filter`` and whose
    p-value is at most ``p_filter``, then renames ``sample1``/``sample2`` to
    ``source``/``target``. Only positive correlations can pass (the test is
    ``r >= r_filter``, not ``abs(r)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns ``sample1``, ``sample2``, ``r`` and ``p``.
    r_filter : float
        Minimum correlation coefficient (inclusive).
    p_filter : float
        Maximum p-value (inclusive).
    include_self_loops : bool, default True
        A row that pairs a sample with itself has r = 1 and p = 0, so it always
        passes both filters and becomes a self-loop edge. Pass ``False`` to
        drop rows where ``sample1 == sample2``. The default keeps the
        previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The selected rows with columns ``source``, ``target``, ``r``, ``p``.
        ``df`` itself is not modified.
    """
    # Bracket access for both columns: attribute access (`df.r`) is shadowed
    # whenever a column name collides with a DataFrame attribute.
    keep = (df["r"] >= r_filter) & (df["p"] <= p_filter)
    if not include_self_loops:
        keep &= df["sample1"] != df["sample2"]

    edges_df = df.loc[keep]
    return edges_df.rename(columns={'sample1': 'source', 'sample2': 'target'})


def get_stage_class_from_patient(patient_idx, clin_df):
    """Collapse a patient's pathologic stage into a coarse stage label.

    Reads ``clin_df.loc[patient_idx, "pathologic_stage"]`` and maps the main
    stage, with or without an ``a``/``b``/``c`` sub-stage suffix, to one of
    ``"stage1"``, ``"stage2"``, ``"stage3"`` or ``"stage4"``. Any other value
    (for example ``"stage x"``) yields ``np.nan``.
    """
    raw_stage = clin_df.loc[patient_idx, "pathologic_stage"]

    sub_stage_suffixes = ('', 'a', 'b', 'c')
    stage_groups = (
        ("stage i", "stage1"),
        ("stage ii", "stage2"),
        ("stage iii", "stage3"),
        ("stage iv", "stage4"),
    )
    for prefix, label in stage_groups:
        if raw_stage in [prefix + suffix for suffix in sub_stage_suffixes]:
            return label

    return np.nan


def build_class_df(sample_idxs, clin_df=None):
    """Build a table mapping each sample id to its class label.

    A sample id is split on ``'.'``: the last part is parsed as an integer
    sample-type code, and the remaining parts, joined with ``'-'`` and
    lower-cased, form the patient key looked up in ``clin_df``.

    * code <= 9: tumor sample; class comes from
      ``get_stage_class_from_patient`` (may be NaN).
    * code <= 19: normal sample; class is ``'normal'``.
    * code <= 29: control sample; a warning is printed and the sample is skipped.
    * anything else: a warning is printed and the sample is skipped.

    Skipped samples are left out of the result. Previously they were only left
    out of the class column, so the id and class columns had different lengths
    and building the DataFrame failed.

    Parameters
    ----------
    sample_idxs : iterable of str
        Sample identifiers such as ``"TCGA.3C.AAAU.01"``.
    clin_df : pandas.DataFrame, optional
        Clinical table with a ``"pathologic_stage"`` column, indexed by
        patient key. When omitted it is read from ``base+"BRCA_clin.txt"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id`` with a single ``class`` column, one row per kept
        sample, in input order.
    """
    if clin_df is None:
        # Select the stage column by name rather than by position 6.
        clin_df = pd.read_csv(base+"BRCA_clin.txt", sep="\t", index_col=0).T[["pathologic_stage"]]

    kept_ids = []
    class_col = []
    for idx in sample_idxs:
        *patient_parts, type_code = idx.split('.')
        patient_idx = '-'.join(patient_parts).lower()
        sample_type = int(type_code)

        if sample_type <= 9:   # Tumor sample
            sample_class = get_stage_class_from_patient(patient_idx, clin_df)
        elif sample_type <= 19:   # Normal sample
            sample_class = 'normal'
        elif sample_type <= 29:   # Control sample
            print(f"Warning! Found control sample {idx}, Skipping...")
            continue
        else:
            print(f"Warning! Found unexpected sample type: {idx}. Skipping...")
            continue

        # Ids and classes are appended together so the columns stay aligned.
        kept_ids.append(idx)
        class_col.append(sample_class)

    return pd.DataFrame({
            "id": kept_ids,
            "class": class_col
        }).set_index("id")


def get_consistency_index(corr_df, class_df):
    """Return the fraction of edges whose two endpoints share the same class.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Edge table; the FIRST column holds the source id and the SECOND the
        target id (column names are not inspected).
    class_df : pandas.DataFrame
        Indexed by sample id, with a ``"class"`` column.
        NOTE(review): assumes the index is unique; with duplicated ids the
        last entry wins.

    Returns
    -------
    float
        ``matching_edges / total_edges``; ``nan`` when ``corr_df`` has no
        rows (previously a ZeroDivisionError).

    Raises
    ------
    KeyError
        If an endpoint id is not present in ``class_df``.
    """
    n_edges = len(corr_df)
    if n_edges == 0:
        return float("nan")

    # One dict lookup per endpoint instead of a `.loc` call per row.
    class_of = class_df["class"].to_dict()

    # Explicit positional column access: `row[0]` on a string-labelled Series
    # is deprecated positional indexing.
    sources = corr_df.iloc[:, 0]
    targets = corr_df.iloc[:, 1]

    correct_connections = sum(
        1 for src, trg in zip(sources, targets) if class_of[src] == class_of[trg]
    )
    return correct_connections / n_edges


def generate_csvs(edges_df, class_df, max_each_feature=100):
    """Write the edge list, the class table and a combined feature table under ``base``.

    * ``edges.csv``: ``edges_df`` without its index.
    * ``classes.csv``: ``class_df`` with its index.
    * ``features.csv``: the first ``max_each_feature`` rows of each of the
      mRNA, miRNA, methylation and CNV tables, stacked, transposed, and
      restricted to the ids in ``class_df.index``.

    Returns ``None``.
    """
    edges_df.to_csv(base+"edges.csv", index=False)
    class_df.to_csv(base+"classes.csv", index=True)

    # Same four inputs, read in the same order as they are concatenated.
    omics_files = ("BRCA_mRNA.csv", "BRCA_miRNA.csv", "BRCA_Methy.csv", "BRCA_CNV.csv")
    omics_tables = [
        pd.read_csv(base+file_name, index_col=0).iloc[:max_each_feature, :]
        for file_name in omics_files
    ]

    features_df = pd.concat(omics_tables).T
    selected_features = features_df.loc[class_df.index, :]
    selected_features.to_csv(base+"features.csv", index=True)

#### Complete pipeline

In [73]:
base = "C:/Users/colombelli/Desktop/TCC/data/TCGA/BRCA/"

# 1. Load the mRNA table and transpose it so that rows are indexed by sample id.
mrna_by_sample = pd.read_csv(base+"BRCA_mRNA.csv", index_col=0).T

# 2. Label every sample and drop those whose class is missing.
class_df = build_class_df(mrna_by_sample.index.tolist()).dropna()

# 3. Keep only the labelled samples, then correlate every pair of them.
df = mrna_by_sample.loc[class_df.index, :]
corr_df = get_correlation_dataframe(df)

100%|████████████████████████████████████████████████████████████████████████████████| 750/750 [00:21<00:00, 34.42it/s]


In [91]:
# Share of edges (r >= 0.6, p <= 0.001) whose two endpoints have the same class.
filtered_edges = build_edge_list(corr_df, r_filter=0.6, p_filter=0.001)
get_consistency_index(filtered_edges, class_df)

0.8167730173199635

In [105]:
# Export the filtered edge list (r >= 0.6, p <= 0.001), the classes and the features.
edges_to_export = build_edge_list(corr_df, r_filter=0.6, p_filter=0.001)
generate_csvs(edges_to_export, class_df)

#### Calculate alpha weights for focal loss

In [109]:
# Count how many rows of class_df carry each class label.
class_df.value_counts()

class 
stage2    425
stage3    191
stage1    123
stage4     11
dtype: int64

In [110]:
# Inverse-frequency class weights: n_samples / (n_classes * class_count).
# Weights are printed in descending order of class count, i.e. in the same
# order as `value_counts()` lists the classes.
class_counts = class_df["class"].value_counts()
n_samples = len(class_df)
# Derived from the data instead of a hard-coded 4, so the weights stay
# correct if a class disappears or is added upstream.
n_classes = len(class_counts)
for count in class_counts:
    print(n_samples / (n_classes * count))

0.4411764705882353
0.981675392670157
1.524390243902439
17.045454545454547
