# Protein coding gene length distributions

## Import python modules

In [1]:
import pandas as pd
import numpy as np
import re

import sys
sys.path.append('../../../lib/')
import constants as c

## The protein coding gene length distributions for the different species

### Retrieving the statistical descriptions of the gene length distributions

#### Statistical description 

In [2]:
# Load the table with the statistics on the length distributions
# (the path comes from the project's `constants` module).
stat_file = c.STAT_P_FILE
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", stat_file, "\n")

# retrieve the data and keep only the columns used below
stat_df = pd.read_csv(stat_file, low_memory=False, sep="\t")
stat_df = stat_df[["division_both_dbs", "species", "count", "mean", "var", "Lineage", "proteome_id"]]  # proteome_id as unique id

# visualize data (set the condition to 0 to silence this overview)
if 1:
    pd.set_option('display.max_columns', None)
    display(stat_df.head(2))
    print(stat_df.shape)
    print(stat_df["division_both_dbs"].value_counts())
    print(stat_df.index)
    # DataFrame.info() prints its own report and returns None; wrapping it
    # in print() would emit a spurious trailing "None" line.
    stat_df.info()

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /home/emuro/git/github/EM_geneLength_nature/working_on/analysis/groups_of_organisms/../../../lib/../main_tables/stat_proteins.tsv 



Unnamed: 0,division_both_dbs,species,count,mean,var,Lineage,proteome_id
0,archaea,Halorubrum saccharovorum,2334.0,250.641817,28808.608041,Archaea; Euryarchaeota; Stenosarchaea group; H...,UP000053331
1,archaea,Pyrodictium occultum,1602.0,285.092385,34816.348736,Archaea; Crenarchaeota; Thermoprotei; Desulfur...,UP000053352


(9913, 7)
bacteria         7997
fungi             772
archaea           330
vertebrates       248
invertebrates     226
plants            184
protists          156
Name: division_both_dbs, dtype: int64
RangeIndex(start=0, stop=9913, step=1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9913 entries, 0 to 9912
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   division_both_dbs  9913 non-null   object 
 1   species            9913 non-null   object 
 2   count              9913 non-null   float64
 3   mean               9913 non-null   float64
 4   var                9913 non-null   float64
 5   Lineage            9900 non-null   object 
 6   proteome_id        9913 non-null   object 
dtypes: float64(3), object(4)
memory usage: 542.2+ KB
None


## Get a column with the groups of organisms

In [3]:
# Groups of organisms to study, given as taxon names.
# The order of the entries is kept exactly as originally listed: code that
# assigns groups on a first-match basis depends on it (e.g. 'Primates' is
# listed ahead of 'Mammalia').
g_orgs_l = [
    'Archaea', 'Bacteria',
    'Ascomycota', 'Basidiomycota', 'Microsporidia', 'Mucoromycota',
    'Primates', 'Mammalia', 'Aves', 'Actinopterygii',
    'Arthropoda', 'Nematoda', 'Lophotrochozoa',
    'Viridiplantae',
]

In [4]:
# Label every row of stat_df with its group of organisms in a new 'g_orgs' column.
# Assignment is first-match-wins: a row is labelled only while its 'g_orgs' is
# still missing, so the order of the rules matters (e.g. 'Primates' is tried
# before 'Mammalia'). Rows that match no rule keep g_orgs = None.
stat_df['g_orgs'] = None  # start from a clean column so re-running the cell is safe

# "protists" is looked up in the division column; every other group is looked
# up as a taxon name inside the Lineage string.
label_rules = [('division_both_dbs', 'protists')]
label_rules += [('Lineage', taxon) for taxon in g_orgs_l]
for col, g in label_rules:
    # regex=False: the group names are literal text, not regular expressions
    # (str.contains interprets the pattern as a regex by default).
    mask = stat_df[col].str.contains(g, case=True, regex=False, na=False)
    stat_df.loc[stat_df['g_orgs'].isna() & mask, 'g_orgs'] = g

# visualize data (set the condition to 0 to silence this overview)
if 1:
    pd.set_option('display.max_columns', None)
    display(stat_df.head(2))
    print(stat_df.columns)
    print(stat_df.shape)
    # dropna=False so the rows left without a group are counted too
    print(stat_df["g_orgs"].value_counts(dropna=False))

Unnamed: 0,division_both_dbs,species,count,mean,var,Lineage,proteome_id,g_orgs
0,archaea,Halorubrum saccharovorum,2334.0,250.641817,28808.608041,Archaea; Euryarchaeota; Stenosarchaea group; H...,UP000053331,Archaea
1,archaea,Pyrodictium occultum,1602.0,285.092385,34816.348736,Archaea; Crenarchaeota; Thermoprotei; Desulfur...,UP000053352,Archaea


Index(['division_both_dbs', 'species', 'count', 'mean', 'var', 'Lineage',
       'proteome_id', 'g_orgs'],
      dtype='object')
(9913, 8)
Bacteria          7986
Ascomycota         502
Archaea            330
Viridiplantae      178
Basidiomycota      178
protists           156
Arthropoda         115
None                93
Mammalia            75
Actinopterygii      65
Aves                60
Nematoda            57
Lophotrochozoa      38
Mucoromycota        30
Microsporidia       26
Primates            24
Name: g_orgs, dtype: int64


## Obtain the average value for each group

In [5]:
# Per-group summary of the 'mean' column: for each g_orgs group, the average
# of the 'mean' values and the number of non-null entries, sorted by
# increasing average. Rows without a group (g_orgs missing) are left out by
# groupby.
mean_summary = stat_df.groupby('g_orgs')['mean'].describe()
mean_summary = mean_summary[['mean', 'count']]
mean_summary = mean_summary.sort_values('mean', ascending=True)
print(mean_summary)

                      mean   count
g_orgs                            
Archaea         273.916034   330.0
Microsporidia   310.827261    26.0
Bacteria        319.333116  7986.0
Nematoda        346.616261    57.0
Viridiplantae   390.588279   178.0
Mucoromycota    397.025739    30.0
Lophotrochozoa  427.159576    38.0
Basidiomycota   444.358624   178.0
Arthropoda      449.669682   115.0
Aves            450.480340    60.0
Ascomycota      474.125173   502.0
protists        518.474686   156.0
Primates        525.864757    24.0
Actinopterygii  527.731067    65.0
Mammalia        542.905725    75.0


In [6]:
#count
#print(stat_df['count'].groupby(stat_df['g_orgs']).describe()[['mean', 'count']].sort_values('mean', ascending=True))