## Decompouse the output file (clusterization) produced with DAVID Tool

**Goal: get individual clusters to analize with VisANT Tool.** <br>
Decompouse the clusterization file (output) gotten with DAVID Tool into cluster files (one by cluster)<br><br>
### RUN 3: Custom
**Input:** David_clustering_Run1c.txt<br>
Classification MID; kappa Similarity MID; Similarity Overlap Term 4; Multi-linkage Sim 0.75	(HIGH) <br>
Clusters found: 6<br>
**Output:** 1) run*_clustx.txt; 2) run*_clust_unique_genes_*.txt; 3) run*_Annotation_clusters_ETest.csv; 4) run*_meta-data_clusters.txt <br>
Made by: Cynthia Soto<br>
Date: March 2th, 2022<br>
Last md: March 9th, 2022<br>

**Assumptions:**<br>
You have a text file make it with DAVID Tool to extract the information (gene names). 

**Important NOTE:**<br>
To correctly parse the DAVID clustering file generated & downloaded with DAVID Tool v. 6.8 (from January 2022), you need to add in the first line of the file a column header; g.e: (note that it is the same used to label the clusters, just copy and paste)
**Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR**

In [16]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

run = 'run3'

In [2]:
# Set the base directory
try:
    # Change the current working Directory      
    os.chdir("/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT")
    print("Directory changed:", os.getcwd() )
except OSError:
    print("Can't change the Current Working Directory")        
#print(os.listdir(os.getcwd())  

Directory changed: /home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT


In [3]:
## check a bit of the files to be processed ... are they ok?
s_path = str(os.getcwd())+'/*.txt'
s_path

'/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/*.txt'

In [4]:
all_files = glob.glob(s_path)
for file in all_files:
    print(file)

/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/David_clustering_Run3b.txt
/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/David_clustering_Run1c.txt
/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/David_clustering_Run1b.txt
/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/David_clustering_Run1.txt
/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/David_clustering_Run2.txt
/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/David_clustering_Run3c.txt
/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisANT/David_clustering_Run2c.txt
/home/cyntsc/Proyectos/tesis-code/meta-xtrome-analysis/results-data/MG_DarkMagenta_WGCNA_DAVID_VisA

In [7]:
# clusters_file = os.path.basename(file)
clusters_file = 'David_clustering_Run1c.txt'
clusters_file

'David_clustering_Run1c.txt'

In [8]:
# Remember to add a header line by hand in the input file to capture the column names  
df_clusters = pd.read_csv(clusters_file, header=0, index_col=0, sep='\t')  #(clusters_file, header=1, index_col=0, sep='\t')
df_clusters.reset_index(inplace=True)

In [9]:
df_clusters.head(3)

Unnamed: 0,Category,Term,Count,%,PValue,Genes,List Total,Pop Hits,Pop Total,Fold Enrichment,Bonferroni,Benjamini,FDR
0,Annotation Cluster 1,Enrichment Score: 1.4202060313688265,,,,,,,,,,,
1,Category,Term,Count,%,PValue,Genes,List Total,Pop Hits,Pop Total,Fold Enrichment,Bonferroni,Benjamini,FDR
2,GOTERM_MF_DIRECT,"GO:0016757~transferase activity, transferring ...",9,6.7669172932330826,0.01137034666861072,"AT5G45660, AT1G34270, AT1G27440, AT4G38040, AT...",111,531,19198,2.931439914490762,0.8358223750807536,1.0,1.0


In [10]:
df_clust_idx = df_clusters['Category'].str.contains('Annotation Cluster', regex=False)

In [11]:
#type(df_clust_idx)
df_clust_idx.head(10)
df_idx = df_clusters[df_clust_idx].index
df_idx

Int64Index([0, 6, 11, 16, 24, 33], dtype='int64')

In [12]:
df_clusters.iloc[df_idx][['Category','Term']]
df_clusters.iloc[df_idx][['Category','Term']].to_csv(run +"_Annotation_clusters_ETest.csv", index=False, header=True)

In [13]:
#df_clusters.iloc[df_idx][['Category','Term']]

Unnamed: 0,Category,Term
0,Annotation Cluster 1,Enrichment Score: 1.4202060313688265
6,Annotation Cluster 2,Enrichment Score: 1.1132414865173261
11,Annotation Cluster 3,Enrichment Score: 0.6207840865062924
16,Annotation Cluster 4,Enrichment Score: 0.49484110262213965
24,Annotation Cluster 5,Enrichment Score: 0.31824321995051924
33,Annotation Cluster 6,Enrichment Score: 0.09393414412628075


In [14]:
lst_idx = df_idx.tolist()
lst_idx
#df_idx[1]

[0, 6, 11, 16, 24, 33]

In [15]:
# declare flags to control de idx (s)
last_element_df = df_clusters.index.values[-1]
last_cluster = lst_idx[-1]
print(last_element_df)
print(last_cluster)

37
33


In [17]:
run

'run3'

In [19]:
# parse the df_clusters to separate the clusters as:
#   @clustx: cluster data
#   @clust_unique_genes_1: list of unique genes in the cluster

idx2 = 1
last_element = df_clusters['Category'].iloc[-1].index
print(last_element)
iclust_num = 1

lst_meta_data = []
print('Clust_Name','Original_Number_Of_Genes ','Unique_Genes') 
for item in lst_idx: 
    if item == last_cluster:
        #print(item)
        tmp_idx2 = last_element_df + 1
        #print(tmp_idx2)
    else:
        #print(item)
        tmp_idx2 = lst_idx[idx2]  # - 1
        #print(tmp_idx2)
        idx2 = idx2 + 1
    df_clust =  df_clusters.iloc[item:tmp_idx2] 
    #print(df_clust)
    #df_clust.to_csv(run + '_clust' + str(iclust_num), index=False, header=True)
    
    # get the unique genes of this cluster
    lst_genes_tmp = df_clust['Genes'].to_list()
    lst_genes = [item for item in lst_genes_tmp if not (pd.isnull(item) == True)] 
    lst_genes.pop(0)
    #print(lst_genes)
    #print(len(lst_genes))
    lst_genes2 = []
    for element in lst_genes:
        tmp_genes2 = element.split(',')
        for gen in tmp_genes2:
            lst_genes2.append(gen)
    #print(lst_genes2)
    #print(len(lst_genes2))
    uniqueGenes=set(lst_genes2)
    #print(uniqueGenes)
    name_clust = run + '_unique_genes_clust' + str(iclust_num)
    # save the unique genes of this cluster to a txt file for further analysis
    textfile = open(name_clust, "w")
    for element in uniqueGenes:
        textfile.write(element + ",")
    textfile.close()
   
    # summary
    lst_meta_data.append(run + '_clust_' + str(iclust_num) + ' ' + str(len(lst_genes2)) + ' ' + str(len(uniqueGenes)))
    #print(run + '_clust_' + str(iclust_num) + ' ' + str(len(lst_genes2)) + ' ' + str(len(uniqueGenes))) 
    # walk to the next cluster
    iclust_num = iclust_num + 1    

<built-in method index of str object at 0x7f9eade93300>
Clust_Name Original_Number_Of_Genes  Unique_Genes
run3_clust_1 29 12
run3_clust_2 17 6
run3_clust_3 17 8
run3_clust_4 52 15
run3_clust_5 75 27
run3_clust_6 22 12


In [None]:
lst_meta_data

In [24]:
df_meta = pd.DataFrame(lst_meta_data)
df_meta.to_csv(run + '_meta-data_clusters', sep = '\t', index=None, header = None)