# Distribution of the lengths of the proteins

## [Log-normal distribution](https://en.wikipedia.org/wiki/Log-normal_distribution)
It is a continuous probability distribution of a random variable whose logarithm is normally distributed. For human-readability the decimal logarithm (log10) is used.
  
**The PDF (Probability Density Function) of a normal distribution is**  
$ f(x) = \frac{1}{ \sigma \sqrt{2 \pi}} e^- \frac{1}{2} {(\frac{(x-\mu)}{\sigma})}^{2} $

## Import python modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import os 
import pandas as pd
import math

## The protein length distributions for the different species

### Retrieving the statistical descriptions of the length's distributions

In [2]:
# statistics on length distribution for different species     
stat_file = "../main_tables/stat_proteins.tsv"
print("The statistical descriptions of the protein distributions for the different species is in:\n", stat_file, "\n")

# retrieve data and diminish the number of columns
stat_df = pd.read_csv(stat_file, low_memory=False, sep="\t")
stat_df = stat_df[["superregnum", "species", "proteome_id", "tax_id", "uniprot_fasta_file", "count", "mean", "var"] + ["log10_mean", "log10_var"]]

# visualize data
pd.set_option('display.max_columns', None)
if 1:
    display(stat_df.head(2))
    print(stat_df.shape)
    print(stat_df["superregnum"].value_counts())

The statistical descriptions of the protein distributions for the different species is in:
 ../main_tables/stat_proteins.tsv 



Unnamed: 0,superregnum,species,proteome_id,tax_id,uniprot_fasta_file,count,mean,var,log10_mean,log10_var
0,bacteria,Kurthia sp. 3B1D,UP000288623,1562256,/ftp.uniprot.org/pub/databases/uniprot/current...,3621.0,285.936205,36171.895653,2.372531,0.075378
1,bacteria,Methyloprofundus sedimenti,UP000191980,1420851,/ftp.uniprot.org/pub/databases/uniprot/current...,3608.0,310.654933,52309.4387,2.393412,0.087226


(9915, 10)
bacteria     7997
eukaryota    1588
archaea       330
Name: superregnum, dtype: int64


In [3]:
# ENSEMBL tax_id     
ensembl_taxId_file = "../main_tables/extra_tables/species_Ensembl.tsv" 

# retrieve data
taxid_df = pd.read_csv(ensembl_taxId_file, sep="\t")
taxid_df = taxid_df[["#name", "species", "taxonomy_id"]]
taxid_df.rename(columns = {"#name" : 'name', "taxonomy_id" : 'tax_id'}, inplace = True)

# visualize data
pd.set_option('display.max_columns', None)
if 1:
    display(taxid_df.head(2))
    print(taxid_df.shape)

Unnamed: 0,name,species,tax_id
0,Spiny chromis,acanthochromis_polyacanthus,80966
1,Panda,ailuropoda_melanoleuca,9646


(33021, 3)


In [4]:
# Use pandas.merge() on multiple columns
stat_df = pd.merge(stat_df, taxid_df, on=['species'])
stat_df = stat_df[stat_df['tax_id'].notna()]
if 0:
    display(stat_df.head(2))
    print(stat_df.shape)

KeyError: 'tax_id'

## Python functions

In [None]:
# species' stats for length distribution
def get_df_for_taxid(df, taxid):
     return df.loc[df["tax_id"]==taxid].copy()

In [None]:
#### Function to plot the log-normal distribution
# blue: Observation
# red: Theoretical/Simulation
##########################################
def plot_log_norm (log10_len, mu, sigma, N, title, subtitle, xlabel, ylabel): # N is number of 
    print("µ:", mu, "\tσ:", sigma)
    my_normal_dist = np.random.normal(mu, sigma, N)
    count, bins, *rest = plt.hist(log10_len, 60, density=True, alpha=0.8, edgecolor='black') # plot 60 bins

    # plt.hist(my_normal_dist, 30, density=True, alpha=0.25, color="r")
    # print(count); print(bins); print(*rest)
    
    plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *
             np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
             linewidth=2, color='r', alpha=0.5)
     
    # mean
    plt.vlines(x = mu, ymin = 0, ymax = 0.2, colors = 'white', label = 'mean')
    plt.xlim([1,7]); plt.ylim([0,2.0])
    plt.suptitle(title, style='italic'); plt.title(subtitle + " (" + str(int(N)) +")")
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.show()

In [None]:
def does_it_fits_normal(data_observed): 
    # Kurtosis and Skewness
    # =====================
    print('Kurtosis:', stats.kurtosis(data_observed))
    print('Skewness:', stats.skew(data_observed))
    # D'Agostino's K-squared test
    # ===========================
    k2, p_value = stats.normaltest(data_observed)
    print(f"D'Agostino's K-squared test(statistics={k2}, p-value={p_value})")
    #
    # Mathematicaly it ends up beeing a theoretical log-normal when.
    #    - Kurtosis o Skewness: $\leq -1$ o $\geq 1$ 
    #    - D'Agostino's K-squared test: p-value $ \leq 0.05 $, becuase it rejects the null hypothesis 

In [None]:
# The next function performs the whole analysis for a tax_id
def species_distribution__retrieve_plot_and_fit(stat_df, tax_id):
    species_df = get_df_for_taxid(stat_df, tax_id)
    species_df.drop_duplicates(subset=['tax_id'], inplace=True, keep='last')
    display(species_df)
    
    lengths_f = our_mnt_dir + "results/geneLength/" + species_df["uniprot_fasta_file"].item() 
    lengths_f = lengths_f.replace(".fasta.gz", ".length.tsv") # I get the 
    print(lengths_f)
    
    # retrieve data
    lens_df = pd.read_csv(lengths_f, sep="\t")
    lens_df = lens_df[["length"]]
    # visualize data
    pd.set_option('display.max_columns', None)
    if 0:
        display(lens_df.head(2))
        print(lens_df.shape)
    
    # plot the distributions: observed and theoretical
    lens_df["log10_length"] = lens_df["length"].apply(np.log10)
    title = species_df["species"].item().capitalize(); subtitle = "Proteins";  xlabel = "$log_{10}(length)$"; ylabel = "Probability density function"
    plot_log_norm (lens_df["log10_length"], species_df["log10_mean"].item(), math.sqrt(species_df["log10_var"].item()), int(species_df["count"].item()), 
                   title, subtitle, xlabel, ylabel)  
    # fit to normal
    does_it_fits_normal(lens_df["log10_length"])

## Distributions for different species:

### _Homo sapiens_ (tax_id = 9606)

#### Retrieve its statistical description

In [5]:
tax_id = 9606
human_df = get_df_for_taxid(stat_df, tax_id)
display(human_df)

NameError: name 'get_df_for_taxid' is not defined

#### Retrieve its genes

In [None]:
# system dependencies
system = list(os.uname())[0]
if system == 'Linux':
    our_mnt_dir = "/media/emuro/Nubya/" # my disks: Nubya, Wes
elif system == 'Darwin':
    our_mnt_dir = "/Volumes/Wes/" # my disks: Wes, Nubya 

prots_f = our_mnt_dir + "results/geneLength/" + human_df["uniprot_fasta_file"].item() 
prots_f = prots_f.replace(".fasta.gz", ".length.tsv") # I get the 
print("All the proteins with their lengths of the species are in:\n", prots_f, "\n")

# retrieve data
prots_df = pd.read_csv(prots_f, sep="\t")
prots_df = prots_df[["length"]]  


# visualize data
pd.set_option('display.max_columns', None)
if 0:
    display(prots_df.head(2))
    print(prots_df.shape)

#### Plot the $log_{10}(length)$ distribution and the fit to a theoretical log10-normal

In [None]:
prots_df["log10_length"] = prots_df["length"].apply(np.log10)
title = human_df["species"].item().capitalize(); subtitle = "Proteins";  xlabel = "$log_{10}(length)$"; ylabel = "Probability density function"
plot_log_norm (prots_df["log10_length"], human_df["log10_mean"].item(), math.sqrt(human_df["log10_var"].item()), int(human_df["count"].item()), 
               title, subtitle, xlabel, ylabel)  

#### Does the distribution fits the normal distribution?

In [None]:
does_it_fits_normal(prots_df["log10_length"]) 

### _Danio rerio_ (tax_id = 7955)

In [None]:
tax_id = 7955
species_distribution__retrieve_plot_and_fit(stat_df, 7955)

### _Drosophila melanogaster_ (tax_id = 7227)

In [None]:
tax_id = 7227
species_distribution__retrieve_plot_and_fit(stat_df, tax_id)

### _Arabidopsis thaliana_ (tax_id = 3702)

In [None]:
tax_id = 3702
species_distribution__retrieve_plot_and_fit(stat_df, tax_id)

### _Saccharomyces cerevisae_ (tax_id = 4932)

In [None]:
#tax_id = 4932 #uniprot: 559292 (not in ensembl) 
#species_distribution__retrieve_plot_and_fit(stat_df, tax_id)

### _Escherichia coli_ (tax_id = 83333)

In [None]:
tax_id = 83334 #562 uniprot: 83333 (k12), 83334(0157:h7 disease), 1408285 (otra, this is not in ensembl)
species_distribution__retrieve_plot_and_fit(stat_df, tax_id)

## Fig. 1

### _Danio rerio_ (tax_id = 7955)

In [None]:
tax_id = 7955
species_distribution__retrieve_plot_and_fit(stat_df, 7955)

## Fig S1

### _Acanthochromis polyacanthus_ (tax_id = 80966)

In [None]:
tax_id = 80966
species_distribution__retrieve_plot_and_fit(stat_df, 80966)

### _Apteryx owenii_ (tax_id = 8824)

In [None]:
#tax_id = 8824
#species_distribution__retrieve_plot_and_fit(stat_df, 8824)

### _Equus asinus asinus_ (tax_id = 83772)

In [None]:
#tax_id = 83772
#species_distribution__retrieve_plot_and_fit(stat_df, 83772)

### _Vitis vinifera_ (tax\_id = 29760)

In [None]:
tax_id = 29760
species_distribution__retrieve_plot_and_fit(stat_df, 29760)

### _Acinetobacter baumannii_ (tax\_id = 1310800)

In [None]:
#tax_id = 1310800
#species_distribution__retrieve_plot_and_fit(stat_df, 1310800)

### _Colletotrichum gloeosporioides_ (tax\_id = 1213859)

In [None]:
tax_id = 1213859
species_distribution__retrieve_plot_and_fit(stat_df, 1213859)

## Fig S2

### _Cavia porcellus_ (tax\_id = 10141)

In [None]:
tax_id = 10141
species_distribution__retrieve_plot_and_fit(stat_df, 10141)

## Fig S7

### _Shewanella loihica_ (tax\_id = 323850)

In [None]:
tax_id = 323850
species_distribution__retrieve_plot_and_fit(stat_df, 323850)

### _Orchesella cincta_ (tax\_id = 48709)

In [None]:
tax_id = 48709
species_distribution__retrieve_plot_and_fit(stat_df, 48709)