1. Load all the volcano significance (`*_sig.csv`) files and the transcription factor (TF) file.
2. Next, filter rows based on the transcription factor genes.
3. Based on the filtering, create the final set of Ensembl gene IDs.
4. Filter the countData files based on the final set of Ensembl gene IDs (the per-implant count files are filtered by column name).

In [1]:
# Load libraries

import os
import pandas as pd

In [2]:
# Load the rat transcription-factor table and keep its Ensembl gene IDs

tf_path = '../Rattus_norvegicus_TF.txt'
df_tffile = pd.read_csv(tf_path, sep='\t')

# Plain Python list of the IDs in the 'Ensembl' column, in file order
tf_list = df_tffile['Ensembl'].tolist()

df_tffile.head()

Unnamed: 0,Species,Symbol,Ensembl,Family,Protein,Entrez_ID
0,Rattus_norvegicus,Prdm2,ENSRNOG00000033522,zf-C2H2,ENSRNOP00000046011.3;,313678.0
1,Rattus_norvegicus,,ENSRNOG00000067812,zf-C2H2,ENSRNOP00000086758.1;ENSRNOP00000089484.1;,102551257.0
2,Rattus_norvegicus,Isl2,ENSRNOG00000015336,Homeobox,ENSRNOP00000021074.3;,57233.0
3,Rattus_norvegicus,Fos,ENSRNOG00000008015,TF_bZIP,ENSRNOP00000010712.2;,314322.0
4,Rattus_norvegicus,Vax1,ENSRNOG00000008824,Homeobox,ENSRNOP00000011743.1;,64571.0


In [3]:
# Rows of the full count table whose gene ID is a known transcription factor

counts_csv = '../../data/A_FullClean_CountData.csv'
df_all_counts = pd.read_csv(counts_csv)

# A set gives constant-time membership checks for the ID lookup
tf_id_set = set(tf_list)
mask = df_all_counts['ID'].isin(tf_id_set)

df_all_counts.loc[mask]

Unnamed: 0,ID,TCPS.1,TCPS.2,TCPS.3,OM.1,OM.2,OM.3,BMP2.1,BMP2.2,BMP2.3,PT.1,PT.2,PT.3,mSLA.1,mSLA.2,mSLA.3,Titan.1,Titan.2,Titan.3
38,ENSRNOG00000025587,2,5,7,1,5,6,5,2,1,7,10,0,5,6,6,4,6,4
47,ENSRNOG00000011015,1978,2118,2258,1720,1437,1764,1955,1655,1769,1413,1640,983,2345,2148,2777,1486,2691,2514
97,ENSRNOG00000055858,0,3,0,0,0,0,0,6,6,5,4,4,11,5,14,11,11,15
109,ENSRNOG00000011720,388,387,421,285,287,321,358,291,374,303,406,219,281,256,339,199,307,289
115,ENSRNOG00000026462,29,27,21,32,16,35,34,23,33,58,51,40,97,135,109,168,160,139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22805,ENSRNOG00000008706,526,499,566,498,428,477,578,487,567,483,553,333,830,788,821,739,961,928
22864,ENSRNOG00000001183,1,0,0,0,0,1,0,0,1,0,3,0,0,0,1,1,2,1
22945,ENSRNOG00000037428,102,113,93,108,109,139,95,51,114,102,99,75,100,95,126,67,98,51
22958,ENSRNOG00000053042,1,0,0,1,2,0,0,0,0,0,0,0,1,0,0,0,0,0


In [4]:
# Disabled draft of a DEG pre-filter, kept for reference only: every line in
# this cell is commented out, so nothing is defined or executed here.
# As written it would keep rows with padj <= 0.01 whose 'ENSEMBL ID' is in
# tf_list, print the remaining row count, and return those IDs as a list.
# Note the absolute log2FoldChange >= 0.5 cut-off was itself disabled.

# def tf_filter(df):
#     df = df[df['padj'] <= 0.01]
#     df = df[df['ENSEMBL ID'].isin(tf_list)]
#     # df = df[abs(df['log2FoldChange']) >= 0.5]
#     print(f'No of rows: {len(df)}')

#     return list(df['ENSEMBL ID'])

In [5]:
# Build the mSLA input table:
#   1. merge every differential-expression table whose file name contains 'mSLA',
#   2. take the union of the merged 'ENSEMBL ID' values with the transcription
#      factors that are present in the full count table,
#   3. keep only the columns of the mSLA count data named after those IDs.

path = '../Volcano_sig'
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the merged table reproducible from run to run.
deg_files = sorted(f for f in os.listdir(path) if f.endswith('_sig.csv'))

# Only the mSLA list is filled in this cell; the other (empty) lists are kept
# so that any cell relying on these names still finds them.
df_list_OM = []
df_list_PT = []
df_list_TCPS = []
df_list_Titan = []
df_list_mSLA = [f for f in deg_files if 'mSLA' in f]

# Fail early with a readable message instead of a KeyError on an empty frame.
if not df_list_mSLA:
    raise FileNotFoundError(
        f"No '*_sig.csv' file containing 'mSLA' was found in {path!r}"
    )

# Read each table once and concatenate in a single call; ignore_index=True
# renumbers the rows 0..n-1, replacing the separate in-place index reset.
merged_df = pd.concat(
    [pd.read_csv(os.path.join(path, f)) for f in df_list_mSLA],
    ignore_index=True,
)

set_mSLA = set(merged_df['ENSEMBL ID'])

# Transcription factors that appear in the full count table
tf_all = df_tffile[df_tffile['Ensembl'].isin(df_all_counts['ID'])]

final_mSLA = set_mSLA.union(tf_all['Ensembl'])

count_mSLA = pd.read_csv('../../data/countdata_mSLA.csv')

# Columns are selected by name, so any column not named after a retained gene
# ID is dropped from the output.
count_mSLA_sig = count_mSLA.loc[:, count_mSLA.columns.isin(final_mSLA)]
count_mSLA_sig.to_csv('../../data/input_mSLA.csv', index=False)

merged_df[merged_df['ENSEMBL ID'].isin(final_mSLA)]

Unnamed: 0,ENSEMBL ID,Gene names,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,ENSRNOG00000000156,Megf6,16.873940,2.061009,0.687385,2.998335,2.714593e-03,1.197919e-02
1,ENSRNOG00000001714,Atp13a4,21.935202,2.907737,0.764030,3.805788,1.413530e-04,8.995760e-04
2,ENSRNOG00000002331,Aldh3a1,11.912676,3.961011,0.822332,4.816804,1.460000e-06,1.380000e-05
3,ENSRNOG00000004498,Scin,123.821554,2.988190,0.520728,5.738489,9.550000e-09,1.290000e-07
4,ENSRNOG00000005465,Kcnmb1,25.799898,2.006448,0.459218,4.369269,1.250000e-05,9.980000e-05
...,...,...,...,...,...,...,...,...
938,ENSRNOG00000058111,Itga2,61.845056,2.346467,0.302069,7.767992,7.970000e-15,1.710000e-13
939,ENSRNOG00000058263,unkown,4.061244,2.558709,0.972388,2.631367,8.504203e-03,2.776525e-02
940,ENSRNOG00000058337,Plcb2,5.143754,2.161508,0.866256,2.495230,1.258755e-02,3.846663e-02
941,ENSRNOG00000059897,Gjb5,7.487092,2.789474,0.701889,3.974241,7.060000e-05,4.217220e-04


In [6]:
# Collect the differential-expression result files ('*_sig.csv') to merge

path = '../Volcano_sig'
# os.listdir returns entries in arbitrary order; sort so that the order in
# which files are merged (and hence the merged row order) is reproducible.
deg_files = sorted(f for f in os.listdir(path) if f.endswith('_sig.csv'))

In [7]:
def tf_preprocess(implant):
    """Write the merged DEG table and the filtered count table for one implant.

    Uses the notebook-level names ``deg_files``, ``path``, ``df_tffile`` and
    ``df_all_counts``.

    Steps:
      1. Merge every file in ``deg_files`` whose name contains ``implant`` and
         save the result to ``../../data/{implant}_merged.csv``.
      2. Take the union of the merged 'ENSEMBL ID' values with the
         transcription factors present in ``df_all_counts['ID']``.
      3. Read ``../../data/countdata_{implant}.csv``, keep only the columns
         named after those IDs, and save them tab-separated to
         ``../../data/input_{implant}.tsv``.

    Parameters
    ----------
    implant : str
        Substring identifying the condition in the DEG file names and in the
        count-data file name (e.g. 'mSLA').

    Raises
    ------
    FileNotFoundError
        If no file in ``deg_files`` contains ``implant`` in its name.
    """
    implant_files = [f for f in deg_files if implant in f]
    # Fail before writing anything, rather than saving an empty merged file
    # and then hitting a KeyError on the missing 'ENSEMBL ID' column.
    if not implant_files:
        raise FileNotFoundError(
            f"No '*_sig.csv' file containing {implant!r} was found in {path!r}"
        )

    # Read each table once and concatenate in a single call; ignore_index=True
    # renumbers the rows 0..n-1.
    merged_df = pd.concat(
        [pd.read_csv(os.path.join(path, f)) for f in implant_files],
        ignore_index=True,
    )
    merged_df.to_csv(f'../../data/{implant}_merged.csv', index=False)

    set_implant = set(merged_df['ENSEMBL ID'])
    # Every transcription factor found in the full count table is retained,
    # not only those that also occur in this implant's DEG tables.
    tf_all = df_tffile[df_tffile['Ensembl'].isin(df_all_counts['ID'])]
    final_implant = set_implant.union(tf_all['Ensembl'])

    count_implant = pd.read_csv(f'../../data/countdata_{implant}.csv')
    # Columns are selected by name, so any column not named after a retained
    # gene ID is dropped from the output.
    count_implant_sig = count_implant.loc[:, count_implant.columns.isin(final_implant)]
    count_implant_sig.to_csv(f'../../data/input_{implant}.tsv', sep='\t', index=False)

In [8]:
# Conditions to process; each name must match the DEG and count-data file names
implants = [
    'mSLA',
    'OM',
    'PT',
    'TCPS',
    'Titan',
]

# Write the merged DEG table and the filtered count table for every condition
for implant in implants:
    tf_preprocess(implant)

In [9]:
# Sanity check: read each written input table back and report (rows, columns)
for implant in implants:
    input_file = f'../../data/input_{implant}.tsv'
    df = pd.read_csv(input_file, sep='\t')
    print(df.shape)

(3, 2099)
(3, 2195)
(3, 2242)
(3, 2231)
(3, 2422)
