## Decompouse an annotation file obtained with the DAVID Tool

Made by: Cynthia Soto<br>
Date: March 2th, 2022<br>
Last md: March 16th, 2022<br>

**Assumptions:**<br>
Takes a (cluster annotation) tvs file get with the **DAVID Tool**, which is parsed to separate the clusters into individual files & eliminate redundant genes. 

**Code goal:** <br>
1. Input a clustering annotation file. g.e: David_clustering_Run0.txt with 9 clusters <br>
2. Output 4 different files:
    1. An individual file for each cluster in the run. g.e: run*_clustx.txt
    2. A file with the unique genes by each cluster in the run. g.e: run*_clust_unique_genes_x.txt
    3. A file with the Enrichment Score by each cluster in the run. g.e: run*_Annotation_clusters_ETest.csv
    4. A summary for the run: g.e: run*_meta-data_clusters.txt <br>

**Important NOTE:**<br>
1. The annotation files were obtained with the with *DAVID Tool v. 6.8 (from January 2022)*. Be sure to use the same format. 
2. A header need to be added in the first line of the file. Just copy & paste the label of the clusters as header. g.e: <br>
*Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR*

In [2]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

run = 'run0'

In [3]:
# Set the base directory
try:
    # Change the current working Directory      
    os.chdir("/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT")
    print("Directory changed:", os.getcwd() )
except OSError:
    print("Can't change the Current Working Directory")        
#print(os.listdir(os.getcwd())  

Directory changed: /home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT


In [23]:
## check a bit of the files to be processed ... are they ok?
# s_path = str(os.getcwd())+'/*.txt'
# s_path
# all_files = glob.glob(s_path)
# for file in all_files:
#     print(file)

'/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/*.txt'

In [4]:
# clusters_file = os.path.basename(file)
clusters_file = 'David_clustering_Run0.txt'
clusters_file

'David_clustering_Run0.txt'

In [5]:
# Remember to add a header line by hand in the input file to capture the column names  
df_clusters = pd.read_csv(clusters_file, header=0, index_col=0, sep='\t')  #(clusters_file, header=1, index_col=0, sep='\t')
df_clusters.reset_index(inplace=True)

In [6]:
df_clusters.head(3)

Unnamed: 0,Category,Term,Count,%,PValue,Genes,List Total,Pop Hits,Pop Total,Fold Enrichment,Bonferroni,Benjamini,FDR
0,Annotation Cluster 1,Enrichment Score: 1.5527098383390239,,,,,,,,,,,
1,Category,Term,Count,%,PValue,Genes,List Total,Pop Hits,Pop Total,Fold Enrichment,Bonferroni,Benjamini,FDR
2,INTERPRO,IPR015943:WD40/YVTN repeat-like-containing domain,7,5.263157894736842,0.0029394806898688287,"AT2G05170, AT3G13340, AT5G23430, AT5G51980, AT...",124,262,22920,4.938438808175326,0.5307215741640121,0.7237339433165627,0.7237339433165627


In [7]:
df_clust_idx = df_clusters['Category'].str.contains('Annotation Cluster', regex=False)

In [8]:
#type(df_clust_idx)
df_clust_idx.head(10)
df_idx = df_clusters[df_clust_idx].index
df_idx

Int64Index([0, 8, 17, 24, 31, 38, 43, 48, 53, 60, 66, 80, 86], dtype='int64')

In [9]:
run

'run0'

In [10]:
df_clusters.iloc[df_idx][['Category','Term']]
df_clusters.iloc[df_idx][['Category','Term']].to_csv(run +"_Annotation_clusters_ETest.csv", index=False, header=True)

In [11]:
df_clusters.iloc[df_idx][['Category','Term']]

Unnamed: 0,Category,Term
0,Annotation Cluster 1,Enrichment Score: 1.5527098383390239
8,Annotation Cluster 2,Enrichment Score: 1.3511256564676855
17,Annotation Cluster 3,Enrichment Score: 1.3489668435343156
24,Annotation Cluster 4,Enrichment Score: 1.111257745805441
31,Annotation Cluster 5,Enrichment Score: 1.017328213339477
38,Annotation Cluster 6,Enrichment Score: 0.8795378259776299
43,Annotation Cluster 7,Enrichment Score: 0.7764482129298738
48,Annotation Cluster 8,Enrichment Score: 0.6208365559775062
53,Annotation Cluster 9,Enrichment Score: 0.5971627919281557
60,Annotation Cluster 10,Enrichment Score: 0.5284468488168873


In [12]:
lst_idx = df_idx.tolist()
lst_idx
#df_idx[1]

[0, 8, 17, 24, 31, 38, 43, 48, 53, 60, 66, 80, 86]

In [13]:
# declare flags to control de idx (s)
last_element_df = df_clusters.index.values[-1]
last_cluster = lst_idx[-1]
print(last_element_df)
print(last_cluster)

94
86


In [14]:
# parse the df_clusters to separate the clusters as:
#   @clustx: cluster data
#   @clust_unique_genes_0: list of unique genes in the cluster

idx2 = 1
last_element = df_clusters['Category'].iloc[-1].index
print(last_element)
iclust_num = 1

lst_meta_data = []
print('Clust_Name','Original_Number_Of_Genes ','Unique_Genes') 
for item in lst_idx: 
    if item == last_cluster:
        #print(item)
        tmp_idx2 = last_element_df + 1
        #print(tmp_idx2)
    else:
        #print(item)
        tmp_idx2 = lst_idx[idx2]  # - 1
        #print(tmp_idx2)
        idx2 = idx2 + 1
    df_clust =  df_clusters.iloc[item:tmp_idx2] 
    #print(df_clust)
    df_clust.to_csv(run + '_clust' + str(iclust_num), index=False, header=True)
    
    # get the unique genes of this cluster
    lst_genes_tmp = df_clust['Genes'].to_list()
    lst_genes = [item for item in lst_genes_tmp if not (pd.isnull(item) == True)] 
    lst_genes.pop(0)
    #print(lst_genes)
    #print(len(lst_genes))
    lst_genes2 = []
    for element in lst_genes:
        tmp_genes2 = element.split(',')
        for gen in tmp_genes2:
            lst_genes2.append(gen)
    #print(lst_genes2)
    #print(len(lst_genes2))
    uniqueGenes=set(lst_genes2)
    #print(uniqueGenes)
    name_clust = run + '_unique_genes_clust' + str(iclust_num)
    # save the unique genes of this cluster to a txt file for further analysis
    textfile = open(name_clust, "w")
    for element in uniqueGenes:
        textfile.write(element + ",")
    textfile.close()
   
    # summary
    lst_meta_data.append(run + '_clust_' + str(iclust_num) + ' ' + str(len(lst_genes2)) + ' ' + str(len(uniqueGenes)))
    print(run + '_clust_' + str(iclust_num) + ' ' + str(len(lst_genes2)) + ' ' + str(len(uniqueGenes))) 
    # walk to the next cluster
    iclust_num = iclust_num + 1    

<built-in method index of str object at 0x7fc6112fdcb0>
Clust_Name Original_Number_Of_Genes  Unique_Genes
run0_clust_1 28 9
run0_clust_2 22 4
run0_clust_3 34 12
run0_clust_4 23 11
run0_clust_5 17 5
run0_clust_6 13 7
run0_clust_7 11 6
run0_clust_8 14 6
run0_clust_9 23 8
run0_clust_10 15 5
run0_clust_11 85 17
run0_clust_12 28 14
run0_clust_13 75 27


In [15]:
lst_meta_data

['run0_clust_1 28 9',
 'run0_clust_2 22 4',
 'run0_clust_3 34 12',
 'run0_clust_4 23 11',
 'run0_clust_5 17 5',
 'run0_clust_6 13 7',
 'run0_clust_7 11 6',
 'run0_clust_8 14 6',
 'run0_clust_9 23 8',
 'run0_clust_10 15 5',
 'run0_clust_11 85 17',
 'run0_clust_12 28 14',
 'run0_clust_13 75 27']

In [16]:
df_meta = pd.DataFrame(lst_meta_data)
df_meta.to_csv(run + '_meta-data_clusters', sep = '\t', index=None, header = None)