# Code for preparing control data set of Genes related with the plant's defense mechanisms

1.- Data where consulted and retrieved from Uniprot database based on:
    a) 11 Gene classes previousy selected see list below)
             Gene_Class (e.g: BAK1, FLS2, etc
    b) Plants resource database
    c) Reviewed: swiss-prot database
    
    E.g query: BAK1+plats+reviewed; cdpks plants AND reviewed:yes
                 
2.- Structure of the data downloaded: (In this order as tab-separated file)
    Uniprot: Entry_name+Gene_name(primary)+Protein_names+Length
    
Summary: 
11 datasets where recovery on March 2020

In [84]:
import pandas as pd
import os

In [93]:
pwd

'/home/cynthia/xtrome-main/notebooks'

## Inserting 'protein class' meta-data into each of the files recovered
### Personalized classification


In [94]:
# Preparing dictionary of classes
dict_uniprot = {'2': 'BIK1', '3': 'CC','4': 'CDPKS', '5': 'FLS2', '6': 'MAPKS', '7': 'NB-LRR', '8': 'PRR', '9': 'RKS','10': 'RLKS', '11': 'TIR'}

# Class 1: 'BAK1' is not in the dict_uniprot due it is created when initialized the Uniprot_defense_related.csv file 
print('Protein predifined meta-data class')

for keys,values in dict_uniprot.items():
    print(keys+':'+values)
    #print(values)

Protein predifined meta-data class
2:BIK1
3:CC
4:CDPKS
5:FLS2
6:MAPKS
7:NB-LRR
8:PRR
9:RKS
10:RLKS
11:TIR


# Creating csv file Uniprot_defense_related.csv
### Meta-data of genes related with defense recovered from Uniprot DB
Fist class defined: BAK 1

In [116]:
df_gene_meta_data = pd.read_csv(
  '/home/cynthia/xtrome-main/Uniprot_defense_data/bak1_pd.tab',
  sep = ' '
)
print("Uniprot DB recovery on: March-2020")
print("Meta-Data related to plant-defense genes was recovery")
print("Classes consulted:")
print("1.   BAK1:")

print('====================================')
print("Example of the class BAK1:")
#df_gene_meta_data.head(5)

#Fields-Heders correspond as follow:
#Entry: Unique an stable ID
#Entry_name: UniprotKB Entry (Mnemonic)
#Gene_name: gene name primary
#Lng: gene length
#Protein_name: protein name 

Uniprot DB recovery on: March-2020
Meta-Data related to plant-defense genes was recovery
Classes consulted:
1.   BAK1:
Example of the class BAK1:


In [117]:
# inserting column with static value in data frame for identify the class of the protein 

#df_gene_meta_data.insert(1,'Uniprot_Class', int, allowed_duplicates= False)
df_gene_meta_data.insert(0,'Uniprot_Class', 1)

In [104]:
#df_gene_meta_data.head(5)
#df_gene_meta_data.tail(5)

In [118]:
# create and save the dataframe into defense_meta_data csv file
df_gene_meta_data.to_csv('~/xtrome-main/Uniprot_defense_data/Defense_genes_metadata.csv', index = False) 

### Append all records associated to each class pre-defined in the dict_uniprot

In [119]:
# Get the whole list of files in the directory (parameter sys.2) 
# base_dir: /home/cynthia/xtrome-main
s_dir = '/home/cynthia/xtrome-main/Uniprot_defense_data/'     
arr_fasta_f = os.listdir(s_dir)          # Create an array 
print('Directory name: Uniprot_defense_data\ in path', s_dir)
print('\nNumber of files to process:', len(arr_fasta_f))
print('\nFile names:', arr_fasta_f)

Directory name: Uniprot_defense_data\ in path /home/cynthia/xtrome-main/Uniprot_defense_data/

Number of files to process: 13

File names: ['fls2_pd.tab', 'mapks_pd.tab', 'tir_pd.tab', 'rlks_pd.tab', 'cc_pd.tab', 'cdpks_pd.tab', 'bak1_pd.tab', 'rks_pd.tab', 'prr_pd.tab', 'bik1_pd.tab', 'nb-lrr_pd.tab', 'raw_uniprot_files', 'Defense_genes_metadata.csv']


In [122]:
# Parse the csv files in the directory to compose the Defense_genes_data file (Classes selected)

for fasta_f in (arr_fasta_f):
  
    i_name_pos = fasta_f.rfind('_')
    s_name_class = (fasta_f[0:i_name_pos])
    s_name_class = s_name_class.upper()
    
    if s_name_class == 'BAK1': continue
    # load file downloaded ftom UniprotDB in csv format 
    try:
        df_gene_meta_data = pd.read_csv(s_dir+fasta_f, sep = ' ', engine='python')
    except ValueError:
        print(fasta_f + ' is a directory. The file will not be processed.')    
   
    try:
        # parse and gets the 'key' class by 'value' in dict_uniprot
        i_class = list(dict_uniprot.keys())[list(dict_uniprot.values()).index(s_name_class)]
        # rename to columns to make them more intuitive
        df_gene_meta_data.rename(columns = {'Entry.1':'Entry_name', "name": "Gene_name", "Gene": "Lng", "names_(primary_)": "Protein_name"}, inplace = True) 
         # insert the new meta-data column (see dictionary above)
        df_gene_meta_data.insert(0,'Uniprot_Class', i_class)
        # Append the results to the 'Defense_genes_data.csv' file   
        df_gene_meta_data.to_csv(s_dir + 'Defense_genes_metadata.csv', mode='a', index=False, sep=',', header=False)    
        print(str(len(df_gene_meta_data))+' rcds appended to Defense_genes_metadata.csv. Uniprot class_name: '+ s_name_class +' class# '+ str(i_class) + ' File_name:' + fasta_f)
    except ValueError:
        print(fasta_f + ' is not in dict_uniprot dictonary. The file will not be processed.')   
        break
    
# Total: 217 Defense_genes_metadata.csv

36 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: FLS2 class# 5 File_name:fls2_pd.tab
19 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: MAPKS class# 6 File_name:mapks_pd.tab
18 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: TIR class# 11 File_name:tir_pd.tab
1 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: RLKS class# 10 File_name:rlks_pd.tab
70 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: CC class# 3 File_name:cc_pd.tab
5 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: CDPKS class# 4 File_name:cdpks_pd.tab
10 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: RKS class# 9 File_name:rks_pd.tab
12 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: PRR class# 8 File_name:prr_pd.tab
5 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: BIK1 class# 2 File_name:bik1_pd.tab
16 rcds appended to Defense_genes_metadata.csv. Uniprot class_name: NB-

IsADirectoryError: [Errno 21] Is a directory: '/home/cynthia/xtrome-main/Uniprot_defense_data/raw_uniprot_files'

## A bit on the exploration of the datasets

In [44]:
# load Uniprot raw data in csv format
df_gene_meta_data = pd.read_csv(
  '/home/cynthia/xtrome-main/Uniprot_defense_related/Defense_genes_metadata.csv',
  sep = ','
)
df_gene_meta_data.head() 
df_gene_meta_data.tail()

## Essential classification based on several criterias


In [65]:
# Overview of data

# print('Data dimensions:', df_gene_meta_data.shape, '\n')
# print(df_gene_meta_data.dtypes)
print("Uniprot DB recovery on: March-2020")
print("Criteria: filtered data by reviewed")
print('====================================\n')
print('Número de genes (primary_associated) en las clases BAK1, BIK1, CC, CDPKs, FLS2, MAPKs, NB-LRR, PRR, RKs, RLKs, TIR:', df_gene_meta_data['Uniprot_Class'].count(),'\n')
# print(len(df_gene_meta_data))


# Based in Uniprot Class pre-defined
df_grp = df_gene_meta_data.groupby(['Uniprot_Class'])
i = str(len(df_grp))
print('Agrupado por meta-clase:'+ i)

# Based in the 'Entry' Class field pre-defined
df_grp = df_gene_meta_data.groupby(['Entry'])
i = str(len(df_grp))
print('Agrupado por identificador unico en bd (estable):'+ i)

# Based in the 'Protein_name' Class field pre-defined
df_grp = df_gene_meta_data.groupby(['Protein_name'])
i = str(len(df_grp))
print('Agrupado por Protein_name:'+ i)

# Based in the 'Gene_name' Class field pre-defined
df_grp = df_gene_meta_data.groupby(['Gene_name'])
i = str(len(df_grp))
print('Agrupado por Gene_name:'+ i)

Uniprot DB recovery on: March-2020
Criteria: filtered data by reviewed

Número de genes (primary_associated) en las clases BAK1, BIK1, CC, CDPKs, FLS2, MAPKs, NB-LRR, PRR, RKs, RLKs, TIR: 216 

Agrupado por meta-clase:11
Agrupado por identificador unico en bd (estable):189
Agrupado por Protein_name:110
Agrupado por Gene_name:179


In [88]:
# Getting a protein description associated to the primary_gene name 

df_grp_sum = df_gene_meta_data.groupby(['Uniprot_Class']).sum()

print('How to interpret this result:\n')
print('Basically you are showing a group by gene_name dataset. The # in the Uniprot_class column implies that exist n equal gene_names with different proteins definitions.\n')
print('Total members gruped by Gene_name class:', df_grp_sum.count())
#df_grp_sum.head(5)


How to interpret this result:

Basically you are showing a group by gene_name dataset. The # in the Uniprot_class column implies that exist n equal gene_names with different proteins definitions.

Total members gruped by Gene_name class: Entry           11
Entry_name      11
Gene_name       11
Lng             11
Protein_name    11
dtype: int64


In [87]:
# For details about the grouped category
# Ejm: based on the 'Uniprot_Class' Class field pre-defined
df_grp = df_gene_meta_data.groupby(['Uniprot_Class'])
i = str(len(df_grp))
print('Agrupado por:'+ df_grp  + i)

for Uniprot_Class=='2' in df_grp:
    #print(Uniprot_Class)
    print(Gene_name)    

SyntaxError: invalid syntax (<ipython-input-87-2d914ce507e6>, line 7)

In [None]:
# Getting a protein description associated to the primary_gene name 

df_grp2 = df_gene_meta_data.groupby(['Gene_name','Protein_name']).sum()

print('How to interpret this result:\n')
print('Basically you are showing a group by gene_name & protein_name dataset. The # in the Uniprot_class column implies that exist n equal gene_names with the same protein definition.\n')
print('Total members after gruping class by gene_name & protein_name:', df_grp2.count())
df_grp3.head(10)