In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Parsing the variants.csv file for all C321 clones

In [2]:
mgdata = pd.read_csv("variants_gen1100toMG1655_smallfilesize.csv",dtype = {'IS_HET': str, 'AO': str, 'QA': str} )
mgdata.sort_values(by='EXPERIMENT_SAMPLE_LABEL')
c3data = pd.read_csv("variants_gen1100toC321_smallfilesize.csv",dtype = {'IS_HET': str, 'AO': str, 'QA': str} )
c3data.sort_values(by='EXPERIMENT_SAMPLE_LABEL')

Unnamed: 0,UID,POSITION,REF,ALT,EXPERIMENT_SAMPLE_LABEL,INFO_DP,INFO_PAIREDR,INFO_PQR,INFO_MQMR,INFO_AN,...,INFO_RPL,INFO_EFF_GT,INFO_AF,INFO_EFF_CLASS,INFO_MEANALT,INFO_EFF_RANK,INFO_EFF_CODING,INFO_EFF_TR,INFO_EFF_GENE,ALT.1
28697,6ff56711,404273,G,,C-A4-1,4683,0.986466,0.0,59.9646,170,...,,,,,,,,,,
15913,e066a940,230490,CAA,,C-A4-1,3336,0.983173,1242.0,59.9706,170,...,,,,,,,,,,
105848,5710f7cc,1569401,A,,C-A4-1,5431,0.988062,0.0,59.9502,170,...,,,,,,,,,,
301823,8660e7a2,4535440,C,,C-A4-1,5389,0.987210,0.0,59.9854,170,...,,,,,,,,,,
253655,de8d107d,3791603,C,,C-A4-1,4698,0.986870,0.0,59.9834,170,...,,,,,,,,,,
214123,21f7612b,3137113,GG,,C-A4-1,4719,0.989759,703.5,59.9762,170,...,,,,,,,,,,
105764,2813a323,1567303,T,,C-A4-1,4507,0.988259,0.0,59.9826,170,...,,,,,,,,,,
179572,ec77a789,2633501,A,,C-A4-1,5144,0.990989,0.0,59.9587,170,...,,,,,,,,,,
105673,bbdbfed1,1567178,T,,C-A4-1,5443,0.988493,0.0,59.9801,170,...,,,,,,,,,,
179658,738a26fd,2633746,A,,C-A4-1,1886,0.964668,0.0,59.8994,170,...,,,,,,,,,,


Figure out a bit of stuff about the variant set....looking for numbers of certain types of variants, read depth, etc.

Instance 1: Number of variant calls with read depth < min_read_depth
Instance 2: Number of variant calls with allele fracgion < min_allele_fraction 

In [3]:
long_ref = 7
min_read_depth = 7
min_read_depth_longref = 15
min_allele_fraction = 0.7
min_allele_fraction_longref = 0.9

mgvariants = mgdata[mgdata.ALT.notnull()]
c3variants = c3data[c3data.ALT.notnull()]
mg_avg_readdepth = mgdata.DP.mean()
c3_avg_readdepth = c3data.DP.mean()
print("total MG1655 variant calls: ",mgvariants.UID.count(),
      " -- at an average read depth of: ", round(mg_avg_readdepth,1))
print("variant MG1655 calls under min read depth: ",mgvariants[mgvariants.DP < min_read_depth].UID.count())
print("variant MG1655 calls under min allele fraction: ",mgvariants[mgvariants.AF < min_allele_fraction].UID.count())
print()
print("total C321 variant calls: ",c3variants.UID.count(),
      " -- at an average read depth of: ", round(c3_avg_readdepth,1))
print("variant C321 calls under min read depth: ",c3variants[c3variants.DP < min_read_depth].UID.count())
print("variant C321 calls under min allele fraction: ",c3variants[c3variants.AF < min_allele_fraction].UID.count())

total MG1655 variant calls:  47262  -- at an average read depth of:  58.5
variant MG1655 calls under min read depth:  1482
variant MG1655 calls under min allele fraction:  1814

total C321 variant calls:  28697  -- at an average read depth of:  58.8
variant C321 calls under min read depth:  807
variant C321 calls under min allele fraction:  865


Begin to cull the data.

Remove everything without a minimal read depth or allele fraction.  Apply more stringent criteria to variants with long strings of single nucleotides.

In [4]:
mvc = mgvariants[((mgvariants.DP > min_read_depth) & (mgvariants.REF.str.len() < long_ref)) |
                 ((mgvariants.DP > min_read_depth_longref) & (mgvariants.REF.str.len() > long_ref))]
mvc = mvc[((mvc.AF > min_allele_fraction) & (mvc.REF.str.len() < long_ref)) | 
          ((mvc.AF > min_allele_fraction_longref) & (mvc.REF.str.len() > long_ref))]
cvc = c3variants[((c3variants.DP > min_read_depth) & (c3variants.REF.str.len() < long_ref)) |
                 ((c3variants.DP > min_read_depth_longref) & (c3variants.REF.str.len() > long_ref))]
cvc = cvc[((cvc.AF > min_allele_fraction) & (cvc.REF.str.len() < long_ref)) | 
          ((cvc.AF > min_allele_fraction_longref) & (cvc.REF.str.len() > long_ref))]
print("total MG1655 culled variant calls: ",mvc.UID.count())
print("total C321 culled variant calls: ",cvc.UID.count())

total MG1655 culled variant calls:  43132
total C321 culled variant calls:  26604


Separate the culled variant calls by:

data set: ECNR2.1, C321.∆A, and C321.∆A_F11

generation: g0 vs g1,100

alignment: to MG1655 or to C321.deltaA

In [12]:
g0Emvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
g0Cmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
g0Fmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
Emvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
Cmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
Fmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
g0Ecvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
g0Ccvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
g0Fcvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
Ecvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
Ccvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
Fcvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
print("total gen0 MG1655 ENCR2.1 culled variant calls: ",g0Emvc.UID.count())
print("total gen0 MG1655 C321.∆A culled variant calls: ",g0Cmvc.UID.count())
print("total gen0 MG1655 C321.∆A_F11 culled variant calls: ",g0Fmvc.UID.count())
print()
print("total MG1655 to ENCR2.1 culled variant calls:",Emvc.UID.count()," -- per sample: ",
      round(Emvc.UID.count() / Emvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Emvc.UID.count() / Emvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Emvc.UID.count(),1),
      "\n   total unique variants: ",Emvc.UID.nunique()," -- not found in gen0: ",
      Emvc.UID[~Emvc.UID.isin(g0Emvc.UID)].nunique())
print("total MG1655 to C321 culled variant calls:",Cmvc.UID.count()," -- per sample: ",
      round(Cmvc.UID.count() / Cmvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Cmvc.UID.count() / Cmvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Cmvc.UID.count(),1),
      "\n   total unique variants: ",Cmvc.UID.nunique()," -- not found in gen0: ",
      Cmvc.UID[~Cmvc.UID.isin(g0Cmvc.UID)].nunique())
print("total MG1655 to C321_F11 culled variant calls:",Fmvc.UID.count()," -- per sample: ",
      round(Fmvc.UID.count() / Fmvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Fmvc.UID.count() / Fmvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Fmvc.UID.count(),1),
      "\n   total unique variants: ",Fmvc.UID.nunique()," -- not found in gen0: ",
      Fmvc.UID[~Fmvc.UID.isin(g0Fmvc.UID)].nunique())
print()
print("total gen0 C321 ENCR2.1 culled variant calls: ",g0Ecvc.UID.count())
print("total gen0 C321 C321.∆A culled variant calls: ",g0Ccvc.UID.count())
print("total gen0 C321 C321.∆A_F11 culled variant calls: ",g0Fcvc.UID.count())
print()
print("total C321 to ENCR2.1 culled variant calls:",Ecvc.UID.count()," -- per sample: ",
      round(Ecvc.UID.count() / Ecvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Ecvc.UID.count() / Ecvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Ecvc.UID.count(),1),
      "\n   total unique variants: ",Ecvc.UID.nunique()," -- not found in gen0: ",
      Ecvc.UID[~Ecvc.UID.isin(g0Ecvc.UID)].nunique())
print("total C321 to C321 culled variant calls:",Ccvc.UID.count()," -- per sample: ",
      round(Ccvc.UID.count() / Ccvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Ccvc.UID.count() / Ccvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Ccvc.UID.count(),1),
      "\n   total unique variants: ",Ccvc.UID.nunique()," -- not found in gen0: ",
      Ccvc.UID[~Ccvc.UID.isin(g0Ccvc.UID)].nunique())
print("total C321 to C321_F11 culled variant calls:",Fcvc.UID.count()," -- per sample: ",
      round(Fcvc.UID.count() / Fcvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Fcvc.UID.count() / Fcvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Fcvc.UID.count(),1),
      "\n   total unique variants: ",Fcvc.UID.nunique()," -- not found in gen0: ",
      Fcvc.UID[~Fcvc.UID.isin(g0Fcvc.UID)].nunique())

total gen0 MG1655 ENCR2.1 culled variant calls:  27
total gen0 MG1655 C321.∆A culled variant calls:  676
total gen0 MG1655 C321.∆A_F11 culled variant calls:  699

total MG1655 to ENCR2.1 culled variant calls: 1891  -- per sample:  67.5  -- and in excess of gen0:  40.5 
   total unique variants:  828  -- not found in gen0:  807
total MG1655 to C321 culled variant calls: 18886  -- per sample:  726.4  -- and in excess of gen0:  50.4 
   total unique variants:  1572  -- not found in gen0:  898
total MG1655 to C321_F11 culled variant calls: 20953  -- per sample:  748.3  -- and in excess of gen0:  49.3 
   total unique variants:  1538  -- not found in gen0:  840

total gen0 C321 ENCR2.1 culled variant calls:  670
total gen0 C321 C321.∆A culled variant calls:  25
total gen0 C321 C321.∆A_F11 culled variant calls:  84

total C321 to ENCR2.1 culled variant calls: 20045  -- per sample:  715.9  -- and in excess of gen0:  45.9 
   total unique variants:  1481  -- not found in gen0:  819
total C321 

Remove gen0 variants from the evolved populations.

(yaiO is a string of 9 Gs, so it's just miscalled)

In [6]:
threshold = 5
EmvcNEW = Emvc[~Emvc.UID.isin(g0Emvc.UID)]
CcvcNEW = Ccvc[~Ccvc.UID.isin(g0Ccvc.UID)]
FcvcNEW = Fcvc[~Fcvc.UID.isin(g0Fcvc.UID)]

ancestral_variants = pd.concat([Emvc[Emvc.UID.isin(g0Emvc.UID)].INFO_EFF_GENE.value_counts(),
                              Ccvc[Ccvc.UID.isin(g0Ccvc.UID)].INFO_EFF_GENE.value_counts(),
                              Fcvc[Fcvc.UID.isin(g0Fcvc.UID)].INFO_EFF_GENE.value_counts()], axis=1)
ancestral_variants.columns = ['ECNR2','C321','C321_F11']

threshold_genes_hit = pd.concat([EmvcNEW.INFO_EFF_GENE.value_counts(),CcvcNEW.INFO_EFF_GENE.value_counts(),
                               FcvcNEW.INFO_EFF_GENE.value_counts()], axis=1)
threshold_genes_hit.columns = ['ECNR2','C321','C321_F11']
threshold_genes_hit = threshold_genes_hit[(threshold_genes_hit.ECNR2 > threshold) | 
                                          (threshold_genes_hit.C321 > threshold) | 
                                          (threshold_genes_hit.C321_F11 > threshold)]

EmvcNEWTHRESHOLD = Emvc[Emvc.INFO_EFF_GENE.isin(threshold_genes_hit.index)]
CcvcNEWTHRESHOLD = Ccvc[Ccvc.INFO_EFF_GENE.isin(threshold_genes_hit.index)]
FcvcNEWTHRESHOLD = Fcvc[Fcvc.INFO_EFF_GENE.isin(threshold_genes_hit.index)]

print('removing the following ancestral hits from evolved populations: \n',ancestral_variants)
print('\nnumber of genes hit over threshold in evolved \nECNR2.1 populations:',
      threshold_genes_hit[threshold_genes_hit.ECNR2 > threshold].ECNR2.count(),
      ' --C321 poipulations:',
      threshold_genes_hit[threshold_genes_hit.C321 > threshold].C321.count(),
      ' --C321_F11 populations:',
      threshold_genes_hit[threshold_genes_hit.C321_F11 > threshold].C321_F11.count(),'\n')
print('genes hit: \n',threshold_genes_hit)

removing the following ancestral hits from evolved populations: 
       ECNR2  C321  C321_F11
aceE    NaN   NaN        28
adiC    NaN    26        28
bglH     28   NaN       NaN
bglX    NaN   NaN        28
citF    NaN    26        28
cpdA    NaN    26        28
cpxA    NaN   NaN        28
csgE    NaN   NaN        28
cstA    NaN   NaN        28
cyaA    NaN   NaN        28
dinJ    NaN   NaN        28
dusB    NaN    26        28
evgS    NaN   NaN        28
flgK    NaN   NaN        56
flk     NaN   NaN        28
focB    NaN   NaN        27
folA    NaN   NaN        16
ftsP    NaN   NaN        28
fucI    NaN   NaN        28
fucK     28   NaN       NaN
gadW    NaN   NaN        29
gatC     28   NaN       NaN
glnG    NaN   NaN        28
glpK    NaN   NaN        28
glpR     28   NaN       NaN
glxK    NaN   NaN        28
gspD     28   NaN       NaN
gtrA    NaN    26        28
hemA    NaN   NaN        28
hycI     28   NaN       NaN
...     ...   ...       ...
surA    NaN   NaN        28
tdh      2

Further analyze threshold variants, specifically looking for mutations in long strings of single nucleotides that could be sequencing errors.



In [7]:
temp = EmvcNEWTHRESHOLD[EmvcNEWTHRESHOLD.REF.str.len() > 6].sort_values(by = 'INFO_EFF_GENE')
temp

Unnamed: 0,UID,POSITION,REF,ALT,EXPERIMENT_SAMPLE_LABEL,INFO_DP,INFO_PAIREDR,INFO_PQR,INFO_MQMR,INFO_AN,...,INFO_RPL,INFO_EFF_GT,INFO_AF,INFO_EFF_CLASS,INFO_MEANALT,INFO_EFF_RANK,INFO_EFF_CODING,INFO_EFF_TR,INFO_EFF_GENE,ALT.1
125864,164b4d57,3664357,TGGGGGGT,TGGAGGGT,E-C1-1,6410,0.989029,11979.3,59.9709,174,...,30,1,0.017241,,1.0,1.0,CODING,b3515,gadW,TGGAGGGT
155947,73016812,3304806,GCCCCCCCA,GCCCCCCCCCA,E-E1-2,3938,0.989816,3846.33,59.9623,174,...,11,1,0.022988,,2.5,1.0,CODING,b3161,mtr,GCCCCCCCCCA
155948,73016812,3304806,GCCCCCCCA,GCCCCCCCCA,E-E1-2,3938,0.989816,3846.33,59.9623,174,...,105,2,0.097701,,1.59091,1.0,CODING,b3161,mtr,GCCCCCCCCA
152809,73016812,3304806,GCCCCCCCA,GCCCCCCCCA,E-E1-1,3938,0.989816,3846.33,59.9623,174,...,105,2,0.097701,,1.59091,1.0,CODING,b3161,mtr,GCCCCCCCCA
193735,73016812,3304806,GCCCCCCCA,GCCCCCCCCCA,E-H1-1,3938,0.989816,3846.33,59.9623,174,...,11,1,0.022988,,2.5,1.0,CODING,b3161,mtr,GCCCCCCCCCA
152810,73016812,3304806,GCCCCCCCA,GCCCCCCCCCA,E-E1-1,3938,0.989816,3846.33,59.9623,174,...,11,1,0.022988,,2.5,1.0,CODING,b3161,mtr,GCCCCCCCCCA
111736,73016812,3304806,GCCCCCCCA,GCCCCCCCCA,E-A2-2,3938,0.989816,3846.33,59.9623,174,...,105,2,0.097701,,1.59091,1.0,CODING,b3161,mtr,GCCCCCCCCA
109730,d23c7413,1190055,CGGGGGGA,CGGGGGGGA,E-A2-1,6451,0.990308,7531.5,59.9684,174,...,204,1,0.08046,,1.75,1.0,CODING,b1130,phoP,CGGGGGGGA
158674,d23c7413,1190055,CGGGGGGA,CGGGGGGGA,E-E1-2,6451,0.990308,7531.5,59.9684,174,...,204,1,0.08046,,1.75,1.0,CODING,b1130,phoP,CGGGGGGGA
114532,d23c7413,1190055,CGGGGGGA,CGGGGGGGA,E-A2-2,6451,0.990308,7531.5,59.9684,174,...,204,1,0.08046,,1.75,1.0,CODING,b1130,phoP,CGGGGGGGA


SyntaxError: invalid syntax (<ipython-input-8-16b579c424f5>, line 4)