# Add a new column (of division) to the statistical descriptions
of the protein-coding gene length distributions for the different species. The new column (division_both_dbs) is based on the Ensembl division (division_7) and on the UniProt-provided taxonomy (division_4_byLineage).

## Import Python modules

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys

#
# Initialise the data paths used by the rest of the notebook.
# Paths are kept as plain strings ending in "/" because file names are
# appended to them by string concatenation.
stat_base_path = "/media/emuro/Nubya/"
stat_file_path = stat_base_path + "results/geneLength/outputInputFiles/analysis/some_statistics/stat_description/"
#
# Relative to the current working directory
gitHubProject_base_path = "../../"
#main_tables_path       = gitHubProject_base_path + "main_tables/"
#extra_tables_path      = main_tables_path + "extra_tables/"
working_on_tables_path  = gitHubProject_base_path + "working_on_tables/"
if not os.path.isdir(working_on_tables_path):
    print("The path (working_on_tables_path) is not defined! Check.")
    print(working_on_tables_path, os.path.isdir(working_on_tables_path))
    # A missing output directory is a failure: exit with a non-zero status
    # so that callers/automation do not mistake it for a successful run.
    sys.exit(1)
#
# This is the file that is going to be examined
stat_file_name = "all__stat_description.ensembl_v3.0__withLineage.tsv"

## Retrieving the statistical descriptions
**of the gene length distributions for the different genomes**

In [2]:
# Location of the table with the per-species length-distribution statistics
stat_file = stat_file_path + stat_file_name
print("The statistical descriptions of the protein coding gene distributions for the different species is in:\n", stat_file, "\n")

# Load the tab-separated table; every column is kept
# (an optional column subset is left disabled below)
stat_df = pd.read_csv(stat_file, sep="\t", low_memory=False)
#stat_df = stat_df[["division_7", "division_8", "species", "taxonomy_id", "Lineage", "ensembl_assembly_accession", "group", "subgroup", "assembly_status"]] 

if True:  # quick look at the data frame; set to False to silence
    table_shape = stat_df.shape
    print(table_shape)
    # Row counts per category (NaN included), one blank line between tables
    for position, column in enumerate(("division_7", "division_4_byLineage")):
        if position:
            print()
        print(stat_df.value_counts(column, dropna=False))
    # Let pandas show every column when rendering the preview
    pd.set_option("display.max_columns", table_shape[1])
    display(stat_df.head(2))

The statistical descriptions of the protein coding gene distributions for the different species is in:
 /media/emuro/Nubya/results/geneLength/outputInputFiles/analysis/some_statistics/stat_description/all__stat_description.ensembl_v3.0__withLineage.tsv 

(33630, 42)
division_7
bacteria       31943
fungi           1014
protists         237
vertebrates      224
metazoa          115
plants            96
viruses            1
Name: count, dtype: int64

division_4_byLineage
Bacteria     30555
Eukaryota     1678
Archaea       1228
NaN            168
Viruses          1
Name: count, dtype: int64


Unnamed: 0,index_original,division_7,species,assembly,trunk_genes_path,genes_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,taxonomy_id,genebuild,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,division_8,bool_Rank,sameTax_order,division_both_dbs
0,7068,bacteria,methanobacterium_bryantii_gca_002287175,ASM228717v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_bryantii...,3168,840.40404,649.879873,422343.848699,131,416.0,686.0,1079.0,7910,2.827684,0.287565,0.082694,2.117271,2.619093,2.836324,3.033021,3.898176,6.510982,0.662143,0.438433,4.875197,6.030685,6.530878,6.98379,8.975883,2161,2017.0,Methanobacterium bryantii,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea
1,26156,bacteria,methanobacterium_formicicum_gca_000762265,ASM76226v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_formicic...,2352,862.427296,589.270407,347239.612747,107,458.0,728.0,1115.0,6779,2.849723,0.276255,0.076317,2.029384,2.660865,2.862131,3.047275,3.831166,6.56173,0.6361,0.404623,4.672829,6.126869,6.590301,7.01661,8.821585,2162,2014.0,Methanobacterium formicicum,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea


## Add the column

In [3]:
# Build "division_both_dbs": take the Ensembl division (division_7) and
# relabel as 'archaea' every row whose lineage-based division is 'Archaea',
# plus one species that is relabelled explicitly by name.
archaea_by_lineage = stat_df['division_4_byLineage'] == 'Archaea'
archaea_by_name = stat_df['species'] == 'haloferax_sp_bab2207_gca_000328285'
stat_df["division_both_dbs"] = np.where(archaea_by_lineage | archaea_by_name,
                                        'archaea', stat_df["division_7"])

if True:  # quick look at the data frame; set to False to silence
    table_shape = stat_df.shape
    print(table_shape)
    # Row counts per category (NaN included), one blank line between tables
    preview_columns = ("division_7", "division_4_byLineage", "division_both_dbs")
    for position, column in enumerate(preview_columns):
        if position:
            print()
        print(stat_df.value_counts(column, dropna=False))
    # Let pandas show every column when rendering the preview
    pd.set_option("display.max_columns", table_shape[1])
    display(stat_df.head(2))

(33630, 42)
division_7
bacteria       31943
fungi           1014
protists         237
vertebrates      224
metazoa          115
plants            96
viruses            1
Name: count, dtype: int64

division_4_byLineage
Bacteria     30555
Eukaryota     1678
Archaea       1228
NaN            168
Viruses          1
Name: count, dtype: int64

division_both_dbs
bacteria       30714
archaea         1229
fungi           1014
protists         237
vertebrates      224
metazoa          115
plants            96
viruses            1
Name: count, dtype: int64


Unnamed: 0,index_original,division_7,species,assembly,trunk_genes_path,genes_file,count,mean,std,var,min,25perc,50perc,75perc,max,log10_mean,log10_std,log10_var,log10_min,log10_25perc,log10_50perc,log10_75perc,log10_max,log_mean,log_std,log_var,log_min,log_25perc,log_50perc,log_75perc,log_max,taxonomy_id,genebuild,Scientific_name,Rank,Lineage,division_4_byLineage,Virus_host,division_8,bool_Rank,sameTax_order,division_both_dbs
0,7068,bacteria,methanobacterium_bryantii_gca_002287175,ASM228717v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_bryantii...,3168,840.40404,649.879873,422343.848699,131,416.0,686.0,1079.0,7910,2.827684,0.287565,0.082694,2.117271,2.619093,2.836324,3.033021,3.898176,6.510982,0.662143,0.438433,4.875197,6.030685,6.530878,6.98379,8.975883,2161,2017.0,Methanobacterium bryantii,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea
1,26156,bacteria,methanobacterium_formicicum_gca_000762265,ASM76226v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.methanobacterium_formicic...,2352,862.427296,589.270407,347239.612747,107,458.0,728.0,1115.0,6779,2.849723,0.276255,0.076317,2.029384,2.660865,2.862131,3.047275,3.831166,6.56173,0.6361,0.404623,4.672829,6.126869,6.590301,7.01661,8.821585,2162,2014.0,Methanobacterium formicicum,Species,Archaea; Euryarchaeota; Methanomada group; Met...,Archaea,,archaea,1,1,archaea


### Save the file
**if desired** (be careful not to overwrite any file!)

In [4]:
# Report the output directory and whether it exists
output_dir_exists = os.path.isdir(working_on_tables_path)
print(working_on_tables_path, output_dir_exists)

# The table would be written under its original file name
out_file = working_on_tables_path + stat_file_name
print("saving in: ", out_file)

if False:  # switch to True to actually write the file (it overwrites!)
    stat_df.to_csv(out_file, sep="\t", header=True, index=False)
    print("...file saved. Done!")

../../working_on_tables/ True
saving in:  ../../working_on_tables/all__stat_description.ensembl_v3.0__withLineage.tsv
