In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandarallel import pandarallel
# Start pandarallel with 4 worker processes and progress bars enabled.
# NOTE(review): shm_size_mb=4000 presumably sizes the shared-memory area in MB;
# confirm against the installed pandarallel version.
pandarallel.initialize(nb_workers=4, shm_size_mb=4000, progress_bar=True)

In [None]:
# Expression table: the first CSV column becomes the row index.
df = pd.read_csv("datasets/merged/mainTable.csv", index_col=0)
# Sample metadata, restricted to the rows whose label is a column of `df`.
df_files = (
    pd.read_csv("datasets/merged/files.dat", index_col=0)
    .loc[lambda meta: meta.index.isin(df.columns)]
)
# Show which primary sites are present among the retained rows.
df_files['primary_site'].unique()

In [None]:
# FPKM thresholds: saturate at 1e5, then zero out everything that is not above 0.1.
# `clip` is used for the upper bound because `df.where(df < 1e5, 1e5)` also
# replaces NaN with 1e5 (NaN < 1e5 evaluates to False), which would turn
# missing measurements into the largest possible expression value.
df = df.clip(upper=1e5)
# Values <= 0.1 become 0. NaN also fails the `>` comparison and becomes 0,
# so the table is still NaN-free after this cell, as it was before.
df = df.where(df > 1e-1, 0)

In [None]:
def get_site_columns(site='Colon', files=None):
    """Return the index labels of the rows whose 'primary_site' equals `site`.

    Parameters
    ----------
    site : str, default 'Colon'
        Value compared against the 'primary_site' column.
    files : pandas.DataFrame, optional
        Table with a 'primary_site' column. When omitted, the notebook-level
        `df_files` is used, which keeps existing calls working unchanged.

    Returns
    -------
    numpy.ndarray
        Index labels of the matching rows, in table order.
    """
    if files is None:
        # Fall back to the notebook global only when no table is supplied.
        files = df_files
    return files[files['primary_site'] == site].index.values

def get_not_site_columns(site='Colon', files=None):
    """Return the index labels of the rows whose 'primary_site' differs from `site`.

    Rows with a missing 'primary_site' compare as different and are therefore
    included in the result.

    Parameters
    ----------
    site : str, default 'Colon'
        Value compared against the 'primary_site' column.
    files : pandas.DataFrame, optional
        Table with a 'primary_site' column. When omitted, the notebook-level
        `df_files` is used, which keeps existing calls working unchanged.

    Returns
    -------
    numpy.ndarray
        Index labels of the non-matching rows, in table order.
    """
    if files is None:
        # Fall back to the notebook global only when no table is supplied.
        files = df_files
    return files[files['primary_site'] != site].index.values

In [None]:
# Primary site under study, plus the two (initially empty) accumulators
# that the per-gene scan appends to.
site = "Breast"
HDE_genes, sigmadiff = [], []

In [None]:
# Scan every gene, record how far the in-site mean lies from the out-of-site
# mean (in units of the out-of-site standard deviation), and plot/collect the
# genes whose shift exceeds half a standard deviation.

# Start from empty accumulators so re-running this cell does not append duplicates.
HDE_genes = []
sigmadiff = []

# Select the two column groups once: `df[...]` inside the loop would copy the
# whole sub-table again for every gene.
T_frame = df[get_site_columns(site)]
NT_frame = df[get_not_site_columns(site)]

for gene in df.index:
    T_data = T_frame.loc[gene, :].astype(float).values
    NT_data = NT_frame.loc[gene, :].astype(float).values
    T_mean = T_data.mean()
    NT_mean = NT_data.mean()
    NT_sigma = NT_data.std()
    mean_shift = np.abs(NT_mean - T_mean)
    # NT_sigma is 0 when the gene is constant outside `site`; the ratio is then
    # inf/nan exactly as before, only the RuntimeWarning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        sigmadiff.append([gene, mean_shift / NT_sigma])
    if mean_shift > 0.5 * NT_sigma:
        # Bins are only needed for plotted genes; they span 0.1 up to the
        # gene's largest value over all columns of `df`.
        bins = np.linspace(1e-1, df.loc[gene, :].max(), 100)
        fig = plt.figure()
        plt.title(gene)
        nT, _, _ = plt.hist(T_data, density=True, histtype='step', bins=bins)
        nNT, _, _ = plt.hist(NT_data, density=True, histtype='step', bins=bins)
        # Height of the tallest bin of either histogram, used for the marker lines.
        plot_max = np.where(nT > nNT, nT, nNT).max()
        # Dashed lines: the two means. Red lines: out-of-site mean +/- one sigma.
        plt.vlines([T_mean, NT_mean], 0, plot_max, linestyles='dashed', colors=['b', 'orange'])
        plt.vlines([NT_mean - NT_sigma, NT_mean + NT_sigma], 0, plot_max, colors='red')
        plt.tick_params(labelsize=18)
        plt.xticks(rotation=90)
        plt.show()
        # Release the figure; otherwise one open figure per selected gene piles up.
        plt.close(fig)
        HDE_genes.append(gene)

In [None]:
# Per-gene mean and variance outside (`means`, `variances`) and inside
# (`means_T`, `variances_T`) the selected `site`.
# `site` is passed explicitly: called without an argument, the helpers fall
# back to their 'Colon' default and ignore the site chosen for this run.
# The column labels are resolved once and reduced with pandas' row-wise
# mean/var (NaN skipped, ddof=1) instead of being looked up again for every
# row inside a lambda.
not_site_columns = get_not_site_columns(site)
site_columns = get_site_columns(site)
means = df[not_site_columns].mean(axis=1)
variances = df[not_site_columns].var(axis=1)
means_T = df[site_columns].mean(axis=1)
variances_T = df[site_columns].var(axis=1)

In [None]:
# Per-gene summary table, ranked from the largest to the smallest shift
# between the two means (measured in units of `std`).
df_info = (
    pd.DataFrame(index=df.index)
    .assign(**{
        'mean': means,
        'std': np.sqrt(variances),
        'mean_T': means_T,
        'std_T': np.sqrt(variances_T),
        '(mean-mean_T)/std': (means - means_T).abs() / np.sqrt(variances),
        # Squared coefficient of variation: variance / mean / mean.
        'cv2_T': variances_T / means_T / means_T,
        'cv2': variances / means / means,
    })
    .sort_values(by='(mean-mean_T)/std', ascending=False)
)

In [None]:
# Persist the summary table (row labels and header are written by default).
df_info.to_csv("df_info.csv")
# To reuse a previously saved table instead of recomputing it:
#   df_info = pd.read_csv("df_info.csv", index_col=0)
df_info

In [None]:
# Replace HDE_genes with every gene label in df_info, in df_info's row order.
# NOTE(review): this discards whatever list HDE_genes held before; confirm that
# exporting all genes (rather than a selected subset) is intended.
HDE_genes = df_info.index.values

In [None]:
# Expression distribution of CYP4Z1 inside vs. outside the selected site.
# `site` is passed explicitly: called without an argument, the helpers fall
# back to their 'Colon' default and ignore the site chosen for this run.
fig, ax = plt.subplots()
ax.set_title(f"CYP4Z1: {site} vs. other sites")
# Both histograms share the same 50 bin edges on [0, 1000].
cyp4z1_bins = np.linspace(0, 1000, 50)
df.loc['CYP4Z1', get_site_columns(site)].hist(density=True, ax=ax, bins=cyp4z1_bins, histtype='step')
df.loc['CYP4Z1', get_not_site_columns(site)].hist(density=True, ax=ax, bins=cyp4z1_bins, histtype='step')

In [None]:
# cv2_T against mean_T on log-log axes, with the 1/mean curve for reference.
cv2_ax = plt.gca()
cv2_ax.scatter(df_info['mean_T'], df_info['cv2_T'])
cv2_ax.plot(df_info['mean_T'], 1./df_info['mean_T'])
cv2_ax.set_xscale('log')
cv2_ax.set_yscale('log')
cv2_ax.set_ylim(1e-3, 1e6)
cv2_ax.set_xlim(1e-2, 1e5)

In [None]:
# Write the gene labels to HDE_<site>.txt, one label per line.
with open(f"HDE_{site}.txt", 'w') as out:
    out.writelines("%s\n" % g for g in HDE_genes)

In [None]:
# Download a tab-separated table from the genenames.org custom-download endpoint
# and index it by its first column. The query requests the columns `gd_app_sym`
# and `gd_pub_ensembl_id` (presumably gene symbol and Ensembl gene ID; confirm
# against the returned header). This cell needs network access.
df_conversion = pd.read_csv("https://www.genenames.org/cgi-bin/download/custom?col=gd_app_sym&col=gd_pub_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit", sep="\t", index_col=0)

In [None]:
# Translate each gene label to its 'Ensembl gene ID' (None when the label is
# not in the conversion table), drop the missing ones and save the rest
# without row labels.
pd.Series(
    data=[
        df_conversion.at[gene, 'Ensembl gene ID'] if gene in df_conversion.index else None
        for gene in HDE_genes
    ]
).dropna().to_csv(f"HDE_{site}.csv", index=False)

In [None]:
import os
import shutil

# Move the three result files into the per-cancer folder. The folder is the
# lower-cased site name, except for Colon, which is stored under "crectal".
destination = site.lower()
if site == "Colon":
    destination = "crectal"
destination_dir = os.path.join("datasets", "cancers", destination)
# The shell `mv` failed when the folder did not exist yet; create it on demand.
os.makedirs(destination_dir, exist_ok=True)
for result_file in (f"HDE_{site}.txt", f"HDE_{site}.csv", "df_info.csv"):
    # No shell is involved, so a site name containing spaces can no longer
    # split the command line, and a failed move raises instead of returning
    # an exit status nobody checks. Giving the full target path lets an
    # existing file be replaced, as `mv` does on POSIX systems.
    shutil.move(result_file, os.path.join(destination_dir, result_file))