In [1]:
import pandas as pd
import os
import numpy as np
from scipy.io import mmread
import numpy as np
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [2]:
# TF -> target-gene pairs from the hTFtarget database export.
# The "tissue" column is discarded right after loading.
tf_target = (
    pd.read_csv("TF-Target-information.txt", sep="\t")
    .drop(columns=["tissue"])
)
tf_target

Unnamed: 0,TF,target
0,AEBP2,TMEM53
1,AEBP2,C1orf228
2,AEBP2,FBXO31
3,AEBP2,ADAMTSL5
4,AEBP2,CTB-25B13.9
...,...,...
1342124,CCDC101,CLN3
1342125,CCDC101,RP11-666O2.1
1342126,CCDC101,RP11-666O2.2
1342127,CCDC101,RP11-666O2.4


In [3]:
# Motif-model annotations from the hocomo database (TF_list.txt).
# "Transcription factor" is renamed to "TF" so it shares a column name with
# tf_target; only the model id, TF symbol and TF family columns are kept.
hocomo = (
    pd.read_csv("TF_list.txt", sep="\t", encoding='unicode_escape')
    .rename(columns={'Transcription factor': 'TF'})
    .loc[:, ["Model", "TF", "TF family"]]
)
hocomo

Unnamed: 0,Model,TF,TF family
0,AHR_HUMAN.H10MO.B,AHR,PAS domain factors{1.2.5}
1,AIRE_HUMAN.H10MO.C,AIRE,AIRE{5.3.1}
2,ALX1_HUMAN.H10MO.B,ALX1,Paired-related HD factors{3.1.3}
3,ALX3_HUMAN.H10MO.D,ALX3,Paired-related HD factors{3.1.3}
4,ALX4_HUMAN.H10MO.D,ALX4,Paired-related HD factors{3.1.3}
...,...,...,...
635,ZN713_HUMAN.H10MO.D,ZNF713,More than 3 adjacent zinc finger factors{2.3.3}
636,ZN740_HUMAN.H10MO.D,ZNF740,Other factors with up to three adjacent zinc f...
637,ZN784_HUMAN.H10MO.D,ZNF784,Factors with multiple dispersed zinc fingers{2...
638,ZSC16_HUMAN.H10MO.D,ZSCAN16,More than 3 adjacent zinc finger factors{2.3.3}


In [16]:
#get all genes with counts from patient cohort
# Folder with the per-patient snRNA files. The original location stays the
# default; set the C9ALS_RNA_DIR environment variable to run elsewhere.
rna_folder_path = os.environ.get(
    "C9ALS_RNA_DIR",
    "/Users/elizabethchang/Library/CloudStorage/GoogleDrive-ec3055@columbia.edu/My Drive/final projects CBMFW4761 BINFG4002/data/c9als data/GSE219281_RAW/raw snRNA motor cortex",
)

patients = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6']

# patient -> [matrix path, features path, barcodes path]; a slot stays None
# until the matching file is found in the folder.
rna_files = {patient: [None, None, None] for patient in patients}

# Position of each recognised file type inside the per-patient list above.
file_slots = {"matrix.mtx": 0, "features.tsv": 1, "barcodes.tsv": 2}

for file in os.listdir(rna_folder_path):
    # File names are underscore-separated: the 4th token is the patient id and
    # the last token is the file type.
    split_string = file.split("_")
    if len(split_string) <= 3:
        continue
    file_type = split_string[-1]
    patient = split_string[3]
    # Ignore unrecognised file types and files of patients outside `patients`
    # (the latter used to abort the loop with a KeyError).
    if file_type in file_slots and patient in rna_files:
        rna_files[patient][file_slots[file_type]] = os.path.join(rna_folder_path, file)

# One frame per patient with columns 'index' (gene name) and 0 (total count),
# restricted to genes whose total count is non-zero.
expressed_genes = []

for patient in tqdm(rna_files, desc='Processing RNA Files'):
    matrix_file, features_file, _ = rna_files[patient]
    if matrix_file is None or features_file is None:
        # Fail with a clear message instead of trying to open a placeholder name.
        raise FileNotFoundError(
            f"Missing matrix.mtx or features.tsv for patient {patient} in {rna_folder_path}"
        )

    features = pd.read_csv(features_file, sep="\t", header=None)
    # Second column of the features file is used as the gene label.
    gene_names = features.iloc[:, 1].tolist()

    # Matrix rows line up with the rows of the features file, so a row sum is
    # the total count of that gene over all columns. Summing on the sparse
    # matrix avoids materialising one DataFrame column per gene.
    sc_data = mmread(matrix_file)
    gene_totals = np.asarray(csr_matrix(sc_data).sum(axis=1)).ravel()

    # Raises if the number of gene labels does not match the number of rows.
    genes_counts = pd.Series(gene_totals, index=gene_names)
    expressed_genes.append(pd.DataFrame(genes_counts[genes_counts != 0]).reset_index())

Processing RNA Files: 100%|██████████| 12/12 [07:49<00:00, 39.14s/it]


In [69]:
# Combine the per-patient frames (columns 'index' = gene name, 0 = count) into
# one table, total the counts per gene across patients, and order genes from
# most to least counted. Stacking with pd.concat replaces a Python-level
# row-by-row loop over every gene of every patient.
genes_df = (
    pd.concat(expressed_genes, ignore_index=True)
    .rename(columns={'index': 'gene', 0: 'count'})
    .groupby("gene")["count"]
    .sum()
    .reset_index()
    .sort_values(by="count", ascending=False)
)

# Series of gene names, ordered by descending total count.
genes = genes_df["gene"]

In [70]:
# Build the (gene, TF, Model, TF family) table with two inner joins:
#   1. genes x tf_target on "gene": keeps TF/target pairs whose target gene is
#      in our expressed-genes list;
#   2. result x hocomo on "TF": keeps pairs whose TF has an entry in the hocomo table.
# NOTE(review): the joins filter on the *target gene* being expressed; the TF
# itself is not checked against the expressed-genes list -- confirm this is intended.
tf_target = tf_target.rename(columns={'target': 'gene'})
df = pd.merge(genes, tf_target, on="gene").merge(hocomo, on="TF")
df

Unnamed: 0,gene,TF,Model,TF family
0,MALAT1,AR,ANDR_HUMAN.H10MO.A,Steroid hormone receptors (NR3){2.1.1}
1,MAP1B,AR,ANDR_HUMAN.H10MO.A,Steroid hormone receptors (NR3){2.1.1}
2,IL1RAPL1,AR,ANDR_HUMAN.H10MO.A,Steroid hormone receptors (NR3){2.1.1}
3,ANK2,AR,ANDR_HUMAN.H10MO.A,Steroid hormone receptors (NR3){2.1.1}
4,RTN4,AR,ANDR_HUMAN.H10MO.A,Steroid hormone receptors (NR3){2.1.1}
...,...,...,...,...
534058,RPL17-C18orf32,MSX1,MSX1_HUMAN.H10MO.D,NK-related factors{3.1.2}
534059,RPL17,MSX1,MSX1_HUMAN.H10MO.D,NK-related factors{3.1.2}
534060,VAMP8,IRF5,IRF5_HUMAN.H10MO.D,Interferon-regulatory factors{3.5.3}
534061,REEP4,DLX4,DLX4_HUMAN.H10MO.D,NK-related factors{3.1.2}


In [71]:
# Split the files in human_HOCOMO by whether their model name (the file name up
# to ".meme") occurs in df["Model"]. Matching names go to meme_files; files
# without a match are only counted here, nothing is deleted.
model_list = list(df["Model"].unique())

available_models = [
    file_name.split(".meme")[0] for file_name in os.listdir("human_HOCOMO")
]
meme_files = [name for name in available_models if name in model_list]

keep_count = len(meme_files)
remove_count = len(available_models) - keep_count

In [72]:
# Remove exact duplicate rows, keep only rows whose Model is listed in
# meme_files, then collect the distinct genes and TFs that remain.
df = df.drop_duplicates()
df = df.loc[df["Model"].isin(meme_files)]
genes, TFs = df['gene'].unique(), df['TF'].unique()

# Report how many distinct models / genes / TFs survived the filtering.
for label, distinct_values in (
    ("models:", df["Model"].unique()),
    ("genes: ", genes),
    ("TFs: ", TFs),
):
    print(label, len(distinct_values))

models: 228
genes:  17298
TFs:  209


In [73]:
# Binary gene x TF matrix: 1 where `df` contains the (gene, TF) pair, 0 otherwise.
# Rows follow the order of `genes`, columns the order of `TFs`.
# The matrix is filled with a single vectorised assignment instead of a
# per-row `.at` write for every row of `df`.
gene_pos = pd.Index(genes).get_indexer(df['gene'])
tf_pos = pd.Index(TFs).get_indexer(df['TF'])

# get_indexer returns -1 for labels it cannot find; writing at -1 would
# silently mark the last row/column, so refuse to continue in that case.
if (gene_pos < 0).any() or (tf_pos < 0).any():
    raise ValueError("df contains a gene or TF that is missing from `genes` / `TFs`")

indicator = np.zeros((len(genes), len(TFs)), dtype=np.int64)
indicator[gene_pos, tf_pos] = 1

# Move the gene labels from the index into a leading 'gene' column.
tf_target_matrix = (
    pd.DataFrame(indicator, index=genes, columns=TFs)
    .rename_axis('gene')
    .reset_index()
)
tf_target_matrix.to_csv("TF_target_binary_matrix.csv", index=False)
tf_target_matrix

Unnamed: 0,gene,ARNT,ASCL2,ATF2,ATF3,BACH1,BCL6,BRCA1,CEBPA,CEBPB,...,GLIS2,SP3,PITX1,SOX9,NR4A1,TFE3,KLF15,MSX1,IRF5,DLX4
0,MALAT1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,CALM1,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,CHN1,1,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,QKI,1,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,KAZN,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17293,MYRFL,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17294,ROPN1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17295,VGLL1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17296,AC009227.2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
