In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
input_file = '../data/1011_pangenome/allORFs_pangenome.fasta'

# Walk the FASTA file once, keeping (record id, nucleotide string) pairs,
# then split them into the two parallel lists used to build the table.
id_seq_pairs = [(rec.id, str(rec.seq)) for rec in SeqIO.parse(input_file, "fasta")]
handle = [rec_id for rec_id, _ in id_seq_pairs]
seq = [nucleotides for _, nucleotides in id_seq_pairs]

In [3]:
# One row per FASTA record: its identifier and its nucleotide sequence.
df = pd.DataFrame(dict(handle=handle, seq=seq))

In [4]:
def _translate_orf(nucleotides):
    """Return the amino-acid translation of a nucleotide string as a plain str."""
    return str(Seq(str(nucleotides)).translate())


# Translate every ORF; stop codons appear as '*' in the resulting string.
df['aa'] = df['seq'].map(_translate_orf)



In [5]:
# Flag ORFs whose nucleotide length is not a multiple of 3
# (i.e. the sequence does not consist of whole codons).
df['odd_length'] = df['seq'].str.len().mod(3).ne(0)

In [6]:
# Premature stop codon: a '*' anywhere in the translation except the final
# position (the last residue is dropped first, so a terminal stop is ignored).
# Match '*' literally (regex=False) rather than via the pattern "\*", which is
# an invalid escape sequence in a normal string literal and triggers a
# DeprecationWarning/SyntaxWarning on recent Python versions.
df['premature_stop'] = df['aa'].str[:-1].str.contains('*', regex=False)

In [7]:
# Table dimensions as (rows, columns): one row per ORF, with the two input
# columns plus the translation and the two flag columns added so far.
df.shape

(7796, 5)

In [8]:
# ORFs whose length is not a multiple of 3.
df[df['odd_length']]

Unnamed: 0,handle,seq,aa,odd_length,premature_stop
43,44-PUT_AA_TRASP-R.14848-,CCCGTTGCATTCTTCATAATCCTTGTTCTATAGGTCCCATTATTAT...,PVAFFIILVL*VPLLFCTLLFLIFVLYLCFVAILEKEMKNI*TILV...,True,True
44,45-PUT_IRON_TRASP-R.14849-,TTCAACGACTGAGAAGTCAGCTTACTATTGAAAAGGCAGTGAATTT...,FND*EVSLLLKRQ*IWWLHGSLLDYVIMRKHDVFLQVHRCDNDSQH...,True,True
45,46-PUT_LAC_TRASP-R.14850-,GAAGTAAGGTGCGCACACTGAAGCTGTTCCCTATCGCTTATCAACC...,EVRCAH*SCSLSLINHWQVIQPSTVICTLHIKR*GNMIVACIIMLS...,True,True
1746,1799-YAL030W,ATGTCGTCATCTACTCCCTTTGACCCTTATGCTCTATCCGAGCACG...,MSSSTPFDPYALSEHDEERPQNVQSKSRTAELQAVSTESHRVPSRK...,True,True
1843,1896-YBL026W_NumOfGenes_2,ATGCTTTTCTTCTCCTTTTTCAAGACTTTAGTTGACCAAGAAGTGG...,MLFFSFFKTLVDQEVVVEVCS**FTSEFPLIQEN*RVSYINF*KKS...,True,True
...,...,...,...,...,...
7781,7834-Q0255,ATGATTAAATGAACAATAATTAATATTTACTTATTATTAATATTTT...,MIK*TIINIYLLLIFLIIKNNNNNNNYNNILKYNKDIDLYSIQSPY...,True,True
7790,7843-RDN58-1_NumOfGenes_2,AAACTTTCAACAACGGATCTCTTGGTTCTCGCATCGATGAAGAACG...,KLSTTDLLVLASMKNAAKCDT*CELQNSVNHRIFERTLRPLVFQGA...,True,True
7791,7844-YMRCTy1-4_NumOfGenes_31_NumOfGenes_68,TGTTGGAATAAAAATCCACTATCGTCTATCAACTAATAGTTATATT...,CWNKNPLSSIN**LYYQYIIIYGVKMMT*VMRSCHRC*RKLKRKD*...,True,True
7793,7846-YGRWTy3-1_NumOfGenes_6,TGTTGTATCTCAAAATGAGATATGTCAGTATGACAATACGTCACCC...,CCISK*DMSV*QYVTLNVHKTHMKQPYNKTNNMRQNPTFPS*TTQS...,True,True


In [9]:
# ORFs whose translation contains a stop codon before the final position.
df[df['premature_stop']]

Unnamed: 0,handle,seq,aa,odd_length,premature_stop
43,44-PUT_AA_TRASP-R.14848-,CCCGTTGCATTCTTCATAATCCTTGTTCTATAGGTCCCATTATTAT...,PVAFFIILVL*VPLLFCTLLFLIFVLYLCFVAILEKEMKNI*TILV...,True,True
44,45-PUT_IRON_TRASP-R.14849-,TTCAACGACTGAGAAGTCAGCTTACTATTGAAAAGGCAGTGAATTT...,FND*EVSLLLKRQ*IWWLHGSLLDYVIMRKHDVFLQVHRCDNDSQH...,True,True
45,46-PUT_LAC_TRASP-R.14850-,GAAGTAAGGTGCGCACACTGAAGCTGTTCCCTATCGCTTATCAACC...,EVRCAH*SCSLSLINHWQVIQPSTVICTLHIKR*GNMIVACIIMLS...,True,True
49,50-BIO6-R.8578-,GCTGCTTCCAGCACCTTCACTGGTATTGTCGTTCAATCCGAAGGTA...,AASSTFTGIVVQSEGMAAGLRTNALSTLAGIFILAFF*NECVTVLS...,False,True
1715,1768-YAL001C,ATGGTACTGACGATTTATCCTGACGAACTCGTACAAATAGTGTCTG...,MVLTIYPDELVQIVSDKIASNKGSMFMSHSPFRLRLGDKRTIL*KI...,False,True
...,...,...,...,...,...
7791,7844-YMRCTy1-4_NumOfGenes_31_NumOfGenes_68,TGTTGGAATAAAAATCCACTATCGTCTATCAACTAATAGTTATATT...,CWNKNPLSSIN**LYYQYIIIYGVKMMT*VMRSCHRC*RKLKRKD*...,True,True
7792,7845-YDRWTy2-3_NumOfGenes_13_NumOfGenes_28,TGTTGGAATAAAAATCAACTATCATCTACTAACTAGTATTTACGTT...,CWNKNQLSSTN*YLRY*YIIIYGVRR*RK**EIVI*ISGS*NARID...,False,True
7793,7846-YGRWTy3-1_NumOfGenes_6,TGTTGTATCTCAAAATGAGATATGTCAGTATGACAATACGTCACCC...,CCISK*DMSV*QYVTLNVHKTHMKQPYNKTNNMRQNPTFPS*TTQS...,True,True
7794,7847-YHLWTy4-1_NumOfGenes_3_NumOfGenes_6,TGTTGGAACGAGAGTAATTGATAGTGACATGAGTTGCTATGGTAAC...,CWNESN***HELLW*QSNAYIVY*CTTRIRLSVIAPIAEGMLNEKL...,True,True


In [10]:
# Size of the subset flagged by at least one of the two checks.
df[df['premature_stop'] | df['odd_length']].shape

(287, 5)

In [11]:
# Write every ORF flagged by either check to CSV (without the index column).
# The output file name suggests these are treated as candidate
# intron-containing ORFs.
flagged_orfs = df[df['premature_stop'] | df['odd_length']]
flagged_orfs.to_csv('pangenome_potential-introns.csv', index=False)

In [12]:
# Number of ORFs whose length is not a multiple of 3.
df['odd_length'].sum()

180

In [13]:
# Number of odd-length ORFs that also carry a premature stop codon.
(df['odd_length'] & df['premature_stop']).sum()

180

In [14]:
# NOTE(review): both operands are hard-coded literals, not values computed
# from `df` in this notebook. Presumably they are counts carried over from a
# different analysis (the 16-genome data set?) — TODO confirm, or replace
# with values derived from the data.
7727 - 6453

1274

In [16]:
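# Disabled: `df` in this notebook has no `category` column
# (its columns are handle, seq, aa, odd_length, premature_stop).
#df.query('odd_length')['category'].value_counts()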

In [19]:
# Check for odd-length ORFs that do NOT have a premature stop;
# a row count of zero means every odd-length ORF also has a premature stop.
df.query('odd_length and not premature_stop').shape

(0, 5)

In [None]:
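# Disabled: `df` in this notebook has no `category` column
# (its columns are handle, seq, aa, odd_length, premature_stop).
#df.query('odd_length').query('~premature_stop')['category'].value_counts()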

In [20]:
# Table dimensions as (rows, columns); the row count is the total number of
# ORFs, i.e. the denominator when reporting fractions of flagged ORFs.
df.shape

(7796, 5)

In [21]:
# Size of the subset of ORFs with a premature stop codon.
df[df['premature_stop']].shape

(287, 5)

In [24]:
# Fraction of pangenome ORFs whose length is not a multiple of 3 (about 2%).
len(df[df['odd_length']]) / len(df)

0.023088763468445357

In [25]:
# Fraction of pangenome ORFs with a premature stop codon (about 4%).
len(df[df['premature_stop']]) / len(df)

0.03681375064135454

In [26]:
# Fraction of pangenome ORFs flagged by either check: about 4%
# (vs 10% of the 16genomes).
len(df[df['odd_length'] | df['premature_stop']]) / len(df)

0.03681375064135454

In [None]:
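# Disabled: `df` in this notebook has no `category` column, and the counts in
# the title are hard-coded rather than computed from the data.
#df.query('premature_stop')['category'].value_counts().plot.bar(title='Premature stops (n=9504/108350)')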

In [None]:
# Table dimensions as (rows, columns); the row count is the total number of ORFs.
df.shape

In [None]:
# Size of the subset of ORFs with a premature stop codon.
df[df['premature_stop']].shape

In [None]:
# Bar chart of ORF-category counts per strain for ORFs with a premature stop.
# NOTE(review): requires `strain` and `category` columns, which are not created
# anywhere in the visible notebook (df has handle, seq, aa, odd_length,
# premature_stop) — confirm they exist before running this cell.
flagged = df.query('premature_stop')
data = flagged.groupby('strain', as_index=False)['category'].value_counts()
g = sns.barplot(x='category', y='count', hue='strain', data=data)
# Place the legend just outside the top-right corner so it does not cover bars.
g.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
g.set_ylim(0, 460)
# Derive the counts in the title from the data instead of hand-typed numbers,
# which go stale whenever the input changes.
g.set_title(f'Premature Stop Codon (n={len(flagged)}/{len(df)})');

In [None]:
# Size of the subset of ORFs whose length is not a multiple of 3.
df[df['odd_length']].shape

In [None]:
# Bar chart of ORF-category counts per strain for ORFs whose length is not a
# multiple of 3.
# NOTE(review): requires `strain` and `category` columns, which are not created
# anywhere in the visible notebook (df has handle, seq, aa, odd_length,
# premature_stop) — confirm they exist before running this cell.
flagged = df.query('odd_length')
data = flagged.groupby('strain', as_index=False)['category'].value_counts()
g = sns.barplot(x='category', y='count', hue='strain', data=data)
# Place the legend just outside the top-right corner so it does not cover bars.
g.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
# Derive the counts in the title from the data instead of hand-typed numbers,
# which go stale whenever the input changes.
g.set_title(f'Length NOT divisible by 3 (n={len(flagged)}/{len(df)})')
g.set_ylim(0, 460);

In [None]:
# Size of the subset of ORFs that fail both checks.
df[df['odd_length'] & df['premature_stop']].shape

In [None]:
# Bar chart of ORF-category counts per strain for ORFs that have both a
# premature stop and a length that is not a multiple of 3.
# NOTE(review): requires `strain` and `category` columns, which are not created
# anywhere in the visible notebook (df has handle, seq, aa, odd_length,
# premature_stop) — confirm they exist before running this cell.
flagged = df.query('odd_length and premature_stop')
data = flagged.groupby('strain', as_index=False)['category'].value_counts()
g = sns.barplot(x='category', y='count', hue='strain', data=data)
# Place the legend just outside the top-right corner so it does not cover bars.
g.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
g.set_xticklabels(g.get_xticklabels(), rotation=30)
# Derive the counts in the title from the data instead of hand-typed numbers,
# which go stale whenever the input changes.
g.set_title(f'Premature stop and length NOT divisible by 3 (n={len(flagged)}/{len(df)})')
g.set_ylim(0, 460);

In [None]:
# Three side-by-side bar charts (shared y axis) of ORF-category counts per
# strain, one panel per subset of flagged ORFs.
# NOTE(review): requires `strain` and `category` columns, which are not created
# anywhere in the visible notebook (df has handle, seq, aa, odd_length,
# premature_stop) — confirm they exist before running this cell.

# (query selecting the subset, title prefix) for each panel, left to right.
panels = [
    ('premature_stop', 'Premature Stop Codon'),
    ('odd_length', 'Length NOT divisible by 3'),
    ('odd_length and premature_stop', 'Premature stop AND odd length'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 5), sharey=True)

for ax, (condition, label) in zip(axes, panels):
    flagged = df.query(condition)
    data = flagged.groupby('strain', as_index=False)['category'].value_counts()
    g = sns.barplot(ax=ax, x='category', y='count', hue='strain', data=data)
    g.set_xticklabels(g.get_xticklabels(), rotation=30)
    g.set_ylim(0, 460)
    # Counts come from the data so the titles cannot drift out of date.
    g.set_title(f'{label} (n={len(flagged)}/{len(df)})')

# Keep a single legend for the whole figure: drop the per-panel legends on all
# but the last axes, and park the last one outside the plotting area.
# (plt.legend would act on the *current* axes, not the panel just drawn, so the
# axes are addressed explicitly here.)
for ax in axes[:-1]:
    if ax.legend_ is not None:
        ax.legend_.remove()
axes[-1].legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);

In [None]:
# Number of ORFs that have both an odd length and a premature stop codon.
(df['odd_length'] & df['premature_stop']).sum()