# Genes' bias per chr

Repito lo que hice hace años (cuaderno II. 17 Nov 2016), para ver si hay sesgo en el strand de los genes en los chrs.

In [1]:
import numpy as np
from scipy import stats
import os 
import pandas as pd
from plotnine import *

## Observations: Protein length distribution in different species

### Retrieve the statistical description and tax_id of species

##### Statistical description data file

In [2]:
# system
system = list(os.uname())[0]
if system == 'Linux':
    base_dir = "/media/emuro/Wes/"
elif system == 'Darwin':
    base_dir = "/Volumes/Wes/"
    
# statistics on length distribution for different species     
stat_file = base_dir + "results/geneLength/outputInputFiles/" + "analysis/some_statistics/stat_description/" 
stat_file += "taxid_merged/stat_description.taxid_merged.ensembl_and_ref_proteome.tsv"
print(stat_file)

# retrieve data
stat_df = pd.read_csv(stat_file, low_memory=False, sep="\t")
print(stat_df.shape)

/Volumes/Wes/results/geneLength/outputInputFiles/analysis/some_statistics/stat_description/taxid_merged/stat_description.taxid_merged.ensembl_and_ref_proteome.tsv
(6521, 66)


##### Filter some columns and species

In [3]:
stat_df.rename(columns={'merged_division_superregnum': 'division_8'},
               inplace=True)
# visualize data
pd.set_option('display.max_columns', None)
if 1:
    display(stat_df.head(2))
    print(stat_df.shape)

Unnamed: 0,genes_species,genes_assembly,genes_trunk_genes_path,genes_genes_file,genes_count,genes_mean,genes_std,genes_var,genes_min,genes_25perc,genes_50perc,genes_75perc,genes_max,genes_log10_mean,genes_log10_std,genes_log10_var,genes_log10_min,genes_log10_25perc,genes_log10_50perc,genes_log10_75perc,genes_log10_max,genes_log_mean,genes_log_std,genes_log_var,genes_log_min,genes_log_25perc,genes_log_50perc,genes_log_75perc,genes_log_max,genes_division,tax_id,prots_species,prots_proteome_id,prots_superregnum,prots_num_prot_cod_genes,prots_uniprot_fasta_file,prots_count,prots_mean,prots_std,prots_var,prots_min,prots_25perc,prots_50perc,prots_75perc,prots_max,prots_log10_mean,prots_log10_std,prots_log10_var,prots_log10_min,prots_log10_25perc,prots_log10_50perc,prots_log10_75perc,prots_log10_max,prots_log_mean,prots_log_std,prots_log_var,prots_log_min,prots_log_25perc,prots_log_50perc,prots_log_75perc,prots_log_max,division_8,ratio_mean_prots_genes,diff_prots_genes,abs_diff,ratio_prots_genes
0,archangium_gephyra_gca_001027285,ASM102728v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.archangium_gephyra_gca_00...,10119.0,1103.871035,1127.066255,1270278.0,113.0,524.0,887.0,1388.0,36653.0,2.919254,0.333869,0.111468,2.053078,2.719331,2.947924,3.142389,4.56411,6.721831,0.768762,0.590995,4.727388,6.261492,6.787845,7.235619,10.509251,bacteria,48,Archangium gephyra,UP000035579,bacteria,10110,/ftp.uniprot.org/pub/databases/uniprot/current...,10110.0,367.499703,375.772244,141204.779058,37.0,174.0,295.0,462.0,12217.0,2.441094,0.334836,0.112115,1.568202,2.240549,2.469822,2.664642,4.086965,5.620827,0.770987,0.594422,3.610918,5.159055,5.686975,6.135565,9.410584,bacteria,0.998757,-9.0,9.0,0.999111
1,chondromyces_crocatus_gca_001189295,ASM118929v1,ftp.ensemblgenomes.org/pub/bacteria/release-49...,protein_coding.genes.chondromyces_crocatus_gca...,8339.0,1211.51445,1051.408605,1105460.0,89.0,629.0,977.0,1460.0,24476.0,2.978963,0.30025,0.09015,1.94939,2.798651,2.989895,3.164353,4.38874,6.859315,0.691352,0.477968,4.488636,6.444131,6.884487,7.286192,10.105448,bacteria,52,Chondromyces crocatus,UP000067626,bacteria,8327,/ftp.uniprot.org/pub/databases/uniprot/current...,8327.0,403.466555,350.57956,122906.027917,29.0,209.0,326.0,486.0,8158.0,2.501095,0.300984,0.090591,1.462398,2.320146,2.513218,2.686636,3.911584,5.758984,0.693041,0.480306,3.367296,5.342334,5.786897,6.186209,9.006754,bacteria,0.99908,-12.0,12.0,0.998561


(6521, 66)


# Python functions

In [4]:
# species' stats for length distribution
def get_df_for_taxid(df, taxid):
     return df.loc[df["tax_id"]==taxid].copy()

In [5]:
# The next function performs the whole analysis
def species_gene_distribution__retrieve_plot_and_fit(stat_taxid_df, tax_id):
    species_df = get_df_for_taxid(stat_taxid_df, tax_id)
    species_df.drop_duplicates(subset=['tax_id'], inplace=True, keep='last')
    display(species_df)
    
    genes_f = base_dir + "results/geneLength/" + species_df["genes_trunk_genes_path"].item() + species_df["genes_genes_file"].item()
    print(genes_f)
    # retrieve data
    genes_df = pd.read_csv(genes_f, sep="\t")
    genes_df["TSS"] = np.where(genes_df['strand']!= '+', genes_df['start'], genes_df['end'])
    
    # visualize data
    pd.set_option('display.max_columns', None)
    if 1:
        display(genes_df.head(20))
        print(genes_df.shape)

    return(genes_df)

# Calculate

In [6]:
human_tax_id = 9606
    
genes_df = species_gene_distribution__retrieve_plot_and_fit(stat_df, human_tax_id)

Unnamed: 0,genes_species,genes_assembly,genes_trunk_genes_path,genes_genes_file,genes_count,genes_mean,genes_std,genes_var,genes_min,genes_25perc,genes_50perc,genes_75perc,genes_max,genes_log10_mean,genes_log10_std,genes_log10_var,genes_log10_min,genes_log10_25perc,genes_log10_50perc,genes_log10_75perc,genes_log10_max,genes_log_mean,genes_log_std,genes_log_var,genes_log_min,genes_log_25perc,genes_log_50perc,genes_log_75perc,genes_log_max,genes_division,tax_id,prots_species,prots_proteome_id,prots_superregnum,prots_num_prot_cod_genes,prots_uniprot_fasta_file,prots_count,prots_mean,prots_std,prots_var,prots_min,prots_25perc,prots_50perc,prots_75perc,prots_max,prots_log10_mean,prots_log10_std,prots_log10_var,prots_log10_min,prots_log10_25perc,prots_log10_50perc,prots_log10_75perc,prots_log10_max,prots_log_mean,prots_log_std,prots_log_var,prots_log_min,prots_log_25perc,prots_log_50perc,prots_log_75perc,prots_log_max,division_8,ratio_mean_prots_genes,diff_prots_genes,abs_diff,ratio_prots_genes
213,homo_sapiens,GRCh38,ftp.ensembl.org/pub/release-98/genes/homo_sapi...,protein_coding.genes.homo_sapiens.nan.ensembl....,19976.0,68287.089808,132423.575448,17536000000.0,116.0,9606.75,27221.0,70804.0,2473538.0,4.40396,0.644353,0.415191,2.064458,3.982576,4.434904,4.850058,6.393319,10.140494,1.483677,2.201298,4.75359,9.170221,10.211744,11.167671,14.72116,vertebrates,9606,Homo sapiens (Human),UP000005640,eukaryota,20614,/ftp.uniprot.org/pub/databases/uniprot/current...,20614.0,552.845105,595.745883,354913.157154,2.0,245.0,410.0,666.0,34350.0,2.601146,0.353891,0.125239,0.30103,2.389166,2.612784,2.823474,4.535927,5.989361,0.814863,0.664002,0.693147,5.501258,6.016157,6.50129,10.444357,vertebrates,0.024288,638.0,638.0,1.031938


/Volumes/Wes/results/geneLength/ftp.ensembl.org/pub/release-98/genes/homo_sapiens/protein_coding.genes.homo_sapiens.nan.ensembl.98.tsv


Unnamed: 0.1,Unnamed: 0,contig,strand,start,end,biotype,gene_id,gene_name,length,diffLength,TSS
0,0,X,+,49529869,49529985,protein_coding,ENSG00000236737,GAGE12B,116,18,49529985
1,1,1,+,186401046,186401180,protein_coding,ENSG00000262180,OCLM,134,18,186401180
2,2,2,+,131035092,131035262,protein_coding,ENSG00000284479,SMIM39,170,6,131035262
3,3,7,-,135927274,135927450,protein_coding,ENSG00000267697,LUZP6,176,3,135927274
4,4,X,+,135309480,135309659,protein_coding,ENSG00000283644,ETDC,179,3,135309659
5,5,8,+,32647202,32647390,protein_coding,ENSG00000286131,AC083977.1,188,3,32647390
6,6,1,+,229305135,229305326,protein_coding,ENSG00000213029,SPHAR,191,0,229305326
7,7,20,-,10420546,10420737,protein_coding,ENSG00000285723,AL034430.2,191,0,10420546
8,8,21,+,46326288,46326491,protein_coding,ENSG00000286224,AP000471.1,203,3,46326491
9,9,MT,+,8366,8572,protein_coding,ENSG00000228253,MT-ATP8,206,3,8572


(19976, 11)


## Chromosomes  
Centromeres  
https://www.biostars.org/p/2349/

curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz" | gunzip -c | grep acen > hg38__acen_cytoBand.txt

quick try: EM manually parses to EM_hg38__acen_cytoBand.txt

In [7]:
centromere_file = hg38_centromere_file = "./EM_hg38__acen_cytoBand.txt"
# retrieve data
cent_df = pd.read_csv(centromere_file, sep="\t", names=["contig", "cen_start", "cen_end", "arm", "acen"])

cent_df["contig"] = cent_df["contig"].str.replace("chr", "")
contig_l = cent_df["contig"].to_list()

cent_df = cent_df[["contig", "cen_start", "cen_end"]]
print(cent_df.shape)
display(cent_df.head(2))

(24, 3)


Unnamed: 0,contig,cen_start,cen_end
0,1,121700000,125100000
1,10,38000000,41600000


In [8]:
genes_df.drop(genes_df.index[~genes_df["contig"].isin(contig_l)], inplace=True)
genes_df.shape

genes_df = pd.merge(genes_df, cent_df, on="contig", how="left")

genes_df["TSS_in_arm"] = "cen"    
genes_df.loc[genes_df['TSS'] < genes_df['cen_start'], 'TSS_in_arm'] = "q"
genes_df.loc[genes_df['TSS'] > genes_df['cen_end']  , 'TSS_in_arm']   = "p"
display(genes_df.head(2))
genes_df["TSS_in_arm"].value_counts()

Unnamed: 0.1,Unnamed: 0,contig,strand,start,end,biotype,gene_id,gene_name,length,diffLength,TSS,cen_start,cen_end,TSS_in_arm
0,0,X,+,49529869,49529985,protein_coding,ENSG00000236737,GAGE12B,116,18,49529985,58100000,63800000,q
1,1,1,+,186401046,186401180,protein_coding,ENSG00000262180,OCLM,134,18,186401180,121700000,125100000,p


p      13093
q       6777
cen       64
Name: TSS_in_arm, dtype: int64

In [9]:
pd.crosstab(genes_df.TSS_in_arm, genes_df.strand)

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,42,22
p,6643,6450
q,3385,3392


In [24]:
for ch in sorted(contig_l):
    aux_df = genes_df[genes_df.contig == ch]
    display(ch)
    display(pd.crosstab(aux_df.TSS_in_arm, aux_df.strand))

'1'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,481,496
q,552,521


'10'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,3,0
p,306,268
q,75,79


'11'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,12,2
p,450,400
q,211,235


'12'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,1,1
p,375,369
q,138,150


'13'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,156,165


'14'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,319,293


'15'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,298,299


'16'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,200,181
q,255,216


'17'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,1,0
p,378,458
q,158,189


'18'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,0,1
p,100,101
q,40,26


'19'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,424,407
q,310,330


'2'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,9,4
p,418,346
q,232,238


'20'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,198,162
q,92,94


'21'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,98,118
q,6,10


'22'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,3,6
p,219,215


'3'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,5,2
p,281,289
q,248,250


'4'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,5,2
p,252,269
q,123,101


'5'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,2,1
p,407,319
q,73,83


'6'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,0,1
p,211,221
q,307,307


'7'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,328,293
q,142,154


'8'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,1,0
p,182,228
q,140,132


'9'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,281,282
q,86,129


'X'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
cen,0,2
p,263,256
q,184,146


'Y'

strand,+,-
TSS_in_arm,Unnamed: 1_level_1,Unnamed: 2_level_1
p,18,15
q,13,2
