In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Parsing the variants.csv file for all C321 clones

In [2]:
mgdata = pd.read_csv("variants_gen1100toMG1655_smallfilesize.csv",dtype = {'IS_HET': str, 'AO': str, 'QA': str} )
mgdata.sort_values(by='EXPERIMENT_SAMPLE_LABEL')
c3data = pd.read_csv("variants_gen1100toC321_smallfilesize.csv",dtype = {'IS_HET': str, 'AO': str, 'QA': str} )
c3data.sort_values(by='EXPERIMENT_SAMPLE_LABEL')

Unnamed: 0,UID,POSITION,REF,ALT,EXPERIMENT_SAMPLE_LABEL,INFO_DP,INFO_PAIREDR,INFO_PQR,INFO_MQMR,INFO_AN,...,INFO_RPL,INFO_EFF_GT,INFO_AF,INFO_EFF_CLASS,INFO_MEANALT,INFO_EFF_RANK,INFO_EFF_CODING,INFO_EFF_TR,INFO_EFF_GENE,ALT.1
28697,6ff56711,404273,G,,C-A4-1,4683,0.986466,0.0,59.9646,170,...,,,,,,,,,,
15913,e066a940,230490,CAA,,C-A4-1,3336,0.983173,1242.0,59.9706,170,...,,,,,,,,,,
105848,5710f7cc,1569401,A,,C-A4-1,5431,0.988062,0.0,59.9502,170,...,,,,,,,,,,
301823,8660e7a2,4535440,C,,C-A4-1,5389,0.987210,0.0,59.9854,170,...,,,,,,,,,,
253655,de8d107d,3791603,C,,C-A4-1,4698,0.986870,0.0,59.9834,170,...,,,,,,,,,,
214123,21f7612b,3137113,GG,,C-A4-1,4719,0.989759,703.5,59.9762,170,...,,,,,,,,,,
105764,2813a323,1567303,T,,C-A4-1,4507,0.988259,0.0,59.9826,170,...,,,,,,,,,,
179572,ec77a789,2633501,A,,C-A4-1,5144,0.990989,0.0,59.9587,170,...,,,,,,,,,,
105673,bbdbfed1,1567178,T,,C-A4-1,5443,0.988493,0.0,59.9801,170,...,,,,,,,,,,
179658,738a26fd,2633746,A,,C-A4-1,1886,0.964668,0.0,59.8994,170,...,,,,,,,,,,


Figure out a bit of stuff about the variant set....looking for numbers of certain types of variants, read depth, etc.

Instance 1: Number of variant calls with read depth < min_read_depth
Instance 2: Number of variant calls with allele fracgion < min_allele_fraction 

In [3]:
long_ref = 7
min_read_depth = 6
min_read_depth_longref = 14
min_allele_fraction = 0.7
min_allele_fraction_longref = 0.85

mgvariants = mgdata[mgdata.ALT.notnull()]
c3variants = c3data[c3data.ALT.notnull()]
mg_avg_readdepth = mgdata.DP.mean()
c3_avg_readdepth = c3data.DP.mean()
print("total MG1655 variant calls: ",mgvariants.UID.count(),
      " -- at an average read depth of: ", round(mg_avg_readdepth,1))
print("variant MG1655 calls under min read depth: ",mgvariants[mgvariants.DP < min_read_depth].UID.count())
print("variant MG1655 calls under min allele fraction: ",mgvariants[mgvariants.AF < min_allele_fraction].UID.count())
print()
print("total C321 variant calls: ",c3variants.UID.count(),
      " -- at an average read depth of: ", round(c3_avg_readdepth,1))
print("variant C321 calls under min read depth: ",c3variants[c3variants.DP < min_read_depth].UID.count())
print("variant C321 calls under min allele fraction: ",c3variants[c3variants.AF < min_allele_fraction].UID.count())

total MG1655 variant calls:  47262  -- at an average read depth of:  58.5
variant MG1655 calls under min read depth:  1244
variant MG1655 calls under min allele fraction:  1814

total C321 variant calls:  28697  -- at an average read depth of:  58.8
variant C321 calls under min read depth:  640
variant C321 calls under min allele fraction:  865


Begin to cull the data.

Remove everything without a minimal read depth or allele fraction.  Apply more stringent criteria to variants with long strings of single nucleotides.

In [4]:
mvc = mgvariants[((mgvariants.DP >= min_read_depth) & (mgvariants.REF.str.len() < long_ref)) |
                 ((mgvariants.DP >= min_read_depth_longref) & (mgvariants.REF.str.len() > long_ref))]
mvc = mvc[((mvc.AF >= min_allele_fraction) & (mvc.REF.str.len() < long_ref)) | 
          ((mvc.AF >= min_allele_fraction_longref) & (mvc.REF.str.len() > long_ref))]
cvc = c3variants[((c3variants.DP >= min_read_depth) & (c3variants.REF.str.len() < long_ref)) |
                 ((c3variants.DP >= min_read_depth_longref) & (c3variants.REF.str.len() > long_ref))]
cvc = cvc[((cvc.AF >= min_allele_fraction) & (cvc.REF.str.len() < long_ref)) | 
          ((cvc.AF >= min_allele_fraction_longref) & (cvc.REF.str.len() > long_ref))]
print("total MG1655 culled variant calls: ",mvc.UID.count())
print("total C321 culled variant calls: ",cvc.UID.count())

total MG1655 culled variant calls:  43412
total C321 culled variant calls:  26776


Separate the culled variant calls by:

data set: ECNR2.1, C321.∆A, and C321.∆A_F11

generation: g0 vs g1,100

alignment: to MG1655 or to C321.deltaA

In [5]:
g0Emvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
g0Cmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
g0Fmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
Emvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
Cmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
Fmvc = mvc[(mvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (mvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
g0Ecvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
g0Ccvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
g0Fcvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] == '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
Ecvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'E')]
Ccvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'C')]
Fcvc = cvc[(cvc.EXPERIMENT_SAMPLE_LABEL.str[3] != '0') & (cvc.EXPERIMENT_SAMPLE_LABEL.str[0] == 'F')]
print("total gen0 MG1655 ENCR2.1 culled variant calls: ",g0Emvc.UID.count())
print("total gen0 MG1655 C321.∆A culled variant calls: ",g0Cmvc.UID.count())
print("total gen0 MG1655 C321.∆A_F11 culled variant calls: ",g0Fmvc.UID.count())
print()
print("total MG1655 to ENCR2.1 culled variant calls:",Emvc.UID.count()," -- per sample: ",
      round(Emvc.UID.count() / Emvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Emvc.UID.count() / Emvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Emvc.UID.count(),1),
      "\n   total unique variants: ",Emvc.UID.nunique()," -- not found in gen0: ",
      Emvc.UID[~Emvc.UID.isin(g0Emvc.UID)].nunique())
print("total MG1655 to C321 culled variant calls:",Cmvc.UID.count()," -- per sample: ",
      round(Cmvc.UID.count() / Cmvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Cmvc.UID.count() / Cmvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Cmvc.UID.count(),1),
      "\n   total unique variants: ",Cmvc.UID.nunique()," -- not found in gen0: ",
      Cmvc.UID[~Cmvc.UID.isin(g0Cmvc.UID)].nunique())
print("total MG1655 to C321_F11 culled variant calls:",Fmvc.UID.count()," -- per sample: ",
      round(Fmvc.UID.count() / Fmvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Fmvc.UID.count() / Fmvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Fmvc.UID.count(),1),
      "\n   total unique variants: ",Fmvc.UID.nunique()," -- not found in gen0: ",
      Fmvc.UID[~Fmvc.UID.isin(g0Fmvc.UID)].nunique())
print()
print("total gen0 C321 ENCR2.1 culled variant calls: ",g0Ecvc.UID.count())
print("total gen0 C321 C321.∆A culled variant calls: ",g0Ccvc.UID.count())
print("total gen0 C321 C321.∆A_F11 culled variant calls: ",g0Fcvc.UID.count())
print()
print("total C321 to ENCR2.1 culled variant calls:",Ecvc.UID.count()," -- per sample: ",
      round(Ecvc.UID.count() / Ecvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Ecvc.UID.count() / Ecvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Ecvc.UID.count(),1),
      "\n   total unique variants: ",Ecvc.UID.nunique()," -- not found in gen0: ",
      Ecvc.UID[~Ecvc.UID.isin(g0Ecvc.UID)].nunique())
print("total C321 to C321 culled variant calls:",Ccvc.UID.count()," -- per sample: ",
      round(Ccvc.UID.count() / Ccvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Ccvc.UID.count() / Ccvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Ccvc.UID.count(),1),
      "\n   total unique variants: ",Ccvc.UID.nunique()," -- not found in gen0: ",
      Ccvc.UID[~Ccvc.UID.isin(g0Ccvc.UID)].nunique())
print("total C321 to C321_F11 culled variant calls:",Fcvc.UID.count()," -- per sample: ",
      round(Fcvc.UID.count() / Fcvc.EXPERIMENT_SAMPLE_LABEL.nunique(),1),
      " -- and in excess of gen0: ",
      round(Fcvc.UID.count() / Fcvc.EXPERIMENT_SAMPLE_LABEL.nunique()-g0Fcvc.UID.count(),1),
      "\n   total unique variants: ",Fcvc.UID.nunique()," -- not found in gen0: ",
      Fcvc.UID[~Fcvc.UID.isin(g0Fcvc.UID)].nunique())

total gen0 MG1655 ENCR2.1 culled variant calls:  27
total gen0 MG1655 C321.∆A culled variant calls:  678
total gen0 MG1655 C321.∆A_F11 culled variant calls:  704

total MG1655 to ENCR2.1 culled variant calls: 1912  -- per sample:  68.3  -- and in excess of gen0:  41.3 
   total unique variants:  837  -- not found in gen0:  816
total MG1655 to C321 culled variant calls: 19064  -- per sample:  733.2  -- and in excess of gen0:  55.2 
   total unique variants:  1583  -- not found in gen0:  907
total MG1655 to C321_F11 culled variant calls: 21027  -- per sample:  751.0  -- and in excess of gen0:  47.0 
   total unique variants:  1544  -- not found in gen0:  841

total gen0 C321 ENCR2.1 culled variant calls:  675
total gen0 C321 C321.∆A culled variant calls:  26
total gen0 C321 C321.∆A_F11 culled variant calls:  84

total C321 to ENCR2.1 culled variant calls: 20159  -- per sample:  720.0  -- and in excess of gen0:  45.0 
   total unique variants:  1491  -- not found in gen0:  824
total C321 

Remove gen0 variants from the evolved populations.  Then apply a threshold cutoff to look at interesting variants.  Only look at genes his multiple times across biological replicates.

(yaiO is a string of 9 Gs, so it's just miscalled)

In [6]:
threshold = 5
EmvcNEW = Emvc[~Emvc.UID.isin(g0Emvc.UID)]
CmvcNEW = Cmvc[~Cmvc.UID.isin(g0Cmvc.UID)]
FmvcNEW = Fmvc[~Fmvc.UID.isin(g0Fmvc.UID)]
CcvcNEW = Ccvc[~Ccvc.UID.isin(g0Ccvc.UID)]
FcvcNEW = Fcvc[~Fcvc.UID.isin(g0Fcvc.UID)]

ancestral_variants = pd.concat([Emvc[Emvc.UID.isin(g0Emvc.UID)].INFO_EFF_GENE.value_counts(),
                              Cmvc[Cmvc.UID.isin(g0Cmvc.UID)].INFO_EFF_GENE.value_counts(),
                              Fmvc[Fmvc.UID.isin(g0Fmvc.UID)].INFO_EFF_GENE.value_counts()], axis=1)
ancestral_variants.columns = ['ECNR2','C321','C321_F11']

threshold_genes_hit = pd.concat([EmvcNEW.INFO_EFF_GENE.value_counts(),CmvcNEW.INFO_EFF_GENE.value_counts(),
                               FmvcNEW.INFO_EFF_GENE.value_counts()], axis=1)
threshold_genes_hit.columns = ['ECNR2','C321','C321_F11']
threshold_genes_hit = threshold_genes_hit[(threshold_genes_hit.ECNR2 > threshold) | 
                                          (threshold_genes_hit.C321 > threshold) | 
                                          (threshold_genes_hit.C321_F11 > threshold)]

EmvcNEWTHRESHOLD = Emvc[Emvc.INFO_EFF_GENE.isin(threshold_genes_hit.index)]
CmvcNEWTHRESHOLD = Cmvc[Cmvc.INFO_EFF_GENE.isin(threshold_genes_hit.index)]
FmvcNEWTHRESHOLD = Fmvc[Fmvc.INFO_EFF_GENE.isin(threshold_genes_hit.index)]

print('removing the following ancestral hits from evolved populations: \n',ancestral_variants)
print('\nnumber of genes hit over threshold in evolved \nECNR2.1 populations:',
      threshold_genes_hit[threshold_genes_hit.ECNR2 > threshold].ECNR2.count(),
      ' --C321 poipulations:',
      threshold_genes_hit[threshold_genes_hit.C321 > threshold].C321.count(),
      ' --C321_F11 populations:',
      threshold_genes_hit[threshold_genes_hit.C321_F11 > threshold].C321_F11.count(),'\n')
print('genes hit: \n',threshold_genes_hit)

removing the following ancestral hits from evolved populations: 
       ECNR2  C321  C321_F11
aat     NaN    26        28
aceE    NaN   NaN        28
aceK    NaN    26        28
ackA    NaN    25        28
acrD    NaN    25        28
adeQ    NaN    26        28
adiC    NaN    26        28
aer     NaN    26        28
agaR    NaN    26        28
allB    NaN    26        28
allC    NaN    26        28
anmK    NaN    26        28
apt     NaN    26        28
aqpZ    NaN    26        28
araD    NaN    26        28
araJ    NaN    26        28
argO    NaN    26        28
arnD    NaN    26        28
arpB    NaN    26        28
ascB    NaN    26        28
aslB    NaN    26        28
asmA    NaN    26        28
astE    NaN    26        28
atpB    NaN    26        28
atpE    NaN    26        28
baeR    NaN    26        28
bamA    NaN    25        28
barA    NaN    26        28
bcsA    NaN    26        28
bdcR    NaN    51        56
...     ...   ...       ...
yneO     28   NaN       NaN
ynfA    Na

Further analyze threshold variants, specifically looking for mutations in long strings of single nucleotides that could be sequencing errors.



In [7]:
temp = EmvcNEWTHRESHOLD[EmvcNEWTHRESHOLD.REF.str.len() > 6].sort_values(by = 'INFO_EFF_GENE')
temp

Unnamed: 0,UID,POSITION,REF,ALT,EXPERIMENT_SAMPLE_LABEL,INFO_DP,INFO_PAIREDR,INFO_PQR,INFO_MQMR,INFO_AN,...,INFO_RPL,INFO_EFF_GT,INFO_AF,INFO_EFF_CLASS,INFO_MEANALT,INFO_EFF_RANK,INFO_EFF_CODING,INFO_EFF_TR,INFO_EFF_GENE,ALT.1
4790,73016812,3304806,GCCCCCCCA,GCCCCCCCCA,E-A2-2,3938,0.989816,3846.33,59.9623,174,...,105,2,0.097701,,1.59091,1.0,CODING,b3161,mtr,GCCCCCCCCA
4801,73016812,3304806,GCCCCCCCA,GCCCCCCCCA,E-E1-1,3938,0.989816,3846.33,59.9623,174,...,105,2,0.097701,,1.59091,1.0,CODING,b3161,mtr,GCCCCCCCCA
4802,73016812,3304806,GCCCCCCCA,GCCCCCCCCCA,E-E1-1,3938,0.989816,3846.33,59.9623,174,...,11,1,0.022988,,2.5,1.0,CODING,b3161,mtr,GCCCCCCCCCA
4803,73016812,3304806,GCCCCCCCA,GCCCCCCCCCA,E-E1-2,3938,0.989816,3846.33,59.9623,174,...,11,1,0.022988,,2.5,1.0,CODING,b3161,mtr,GCCCCCCCCCA
4804,73016812,3304806,GCCCCCCCA,GCCCCCCCCA,E-E1-2,3938,0.989816,3846.33,59.9623,174,...,105,2,0.097701,,1.59091,1.0,CODING,b3161,mtr,GCCCCCCCCA
4814,73016812,3304806,GCCCCCCCA,GCCCCCCCCCA,E-H1-1,3938,0.989816,3846.33,59.9623,174,...,11,1,0.022988,,2.5,1.0,CODING,b3161,mtr,GCCCCCCCCCA
98531,4e18d28c,1437222,CTTTTTTTTTA,CTTTTTTTTA,E-G1-2,4142,0.987342,3297.0,59.9662,174,...,291,1,0.109195,,1.36842,,CODING,b1378,pfo,CTTTTTTTTA
98512,4e18d28c,1437222,CTTTTTTTTTA,CTTTTTTTTA,E-B2-2,4142,0.987342,3297.0,59.9662,174,...,291,1,0.109195,,1.36842,,CODING,b1378,pfo,CTTTTTTTTA
98511,4e18d28c,1437222,CTTTTTTTTTA,CTTTTTTTTA,E-B2-1,4142,0.987342,3297.0,59.9662,174,...,291,1,0.109195,,1.36842,,CODING,b1378,pfo,CTTTTTTTTA
257756,d23c7413,1190055,CGGGGGGA,CGGGGGGGA,E-A2-1,6451,0.990308,7531.5,59.9684,174,...,204,1,0.08046,,1.75,1.0,CODING,b1130,phoP,CGGGGGGGA


# Building up a phylogenetic tree for each experiment

In [8]:
pd.set_option("display.max_rows",101)
pd.set_option("display.max_columns",101)
# create a dataframe containing all UIDs for each sample
i = 0
all_Emutations = pd.DataFrame()
for sample in EmvcNEW.EXPERIMENT_SAMPLE_LABEL.unique():
    Emutations = pd.DataFrame()
    Emutations = pd.DataFrame(EmvcNEW[EmvcNEW.EXPERIMENT_SAMPLE_LABEL == sample].UID)
    Emutations = Emutations.set_index(pd.Series(range(len(Emutations))))
    Emutations.columns = [sample]
    all_Emutations = pd.concat([all_Emutations,Emutations],axis=1)
    i += 1

i = 0
all_Cmutations = pd.DataFrame()
for sample in CmvcNEW.EXPERIMENT_SAMPLE_LABEL.unique():
    Cmutations = pd.DataFrame()
    Cmutations = pd.DataFrame(CmvcNEW[CmvcNEW.EXPERIMENT_SAMPLE_LABEL == sample].UID)
    Cmutations = Cmutations.set_index(pd.Series(range(len(Cmutations))))
    Cmutations.columns = [sample]
    all_Cmutations = pd.concat([all_Cmutations,Cmutations],axis=1)
    i += 1

i = 0
all_Fmutations = pd.DataFrame()
for sample in FmvcNEW.EXPERIMENT_SAMPLE_LABEL.unique():
    Fmutations = pd.DataFrame()
    Fmutations = pd.DataFrame(FmvcNEW[FmvcNEW.EXPERIMENT_SAMPLE_LABEL == sample].UID)
    Fmutations = Fmutations.set_index(pd.Series(range(len(Fmutations))))
    Fmutations.columns = [sample]
    all_Fmutations = pd.concat([all_Fmutations,Fmutations],axis=1)
    i += 1

# create a comparative dataframe that contains the number of shared UIDs for each sample pair
compare_Emutations = pd.DataFrame(index=all_Emutations.columns,columns=all_Emutations.columns)
for column in compare_Emutations.columns:
    for row in compare_Emutations.columns:
        testrow = all_Emutations[row].dropna()
        testcolumn = all_Emutations[column].dropna()
        test = testrow.isin(testcolumn)
        count = test.value_counts()
        if True in count:
            compare_Emutations[column][row] = count[True]

compare_Cmutations = pd.DataFrame(index=all_Cmutations.columns,columns=all_Cmutations.columns)
for column in compare_Cmutations.columns:
    for row in compare_Cmutations.columns:
        testrow = all_Cmutations[row].dropna()
        testcolumn = all_Cmutations[column].dropna()
        test = testrow.isin(testcolumn)
        count = test.value_counts()
        if True in count:
            compare_Cmutations[column][row] = count[True]

compare_Fmutations = pd.DataFrame(index=all_Fmutations.columns,columns=all_Fmutations.columns)
for column in compare_Fmutations.columns:
    for row in compare_Fmutations.columns:
        testrow = all_Fmutations[row].dropna()
        testcolumn = all_Fmutations[column].dropna()
        test = testrow.isin(testcolumn)
        count = test.value_counts()
        if True in count:
            compare_Fmutations[column][row] = count[True]
       
compare_Emutations = compare_Emutations.sort_index()
compare_Emutations = compare_Emutations.sort_index(axis = 1)
compare_Cmutations = compare_Cmutations.sort_index()
compare_Cmutations = compare_Cmutations.sort_index(axis = 1)
compare_Fmutations = compare_Fmutations.sort_index()
compare_Fmutations = compare_Fmutations.sort_index(axis = 1)
compare_Emutations
        

Unnamed: 0,E-A1-1,E-A1-2,E-A2-1,E-A2-2,E-B2-1,E-B2-2,E-C1-1,E-C1-2,E-C2-1,E-C2-2,E-D1-1,E-D1-2,E-D2-1,E-D2-2,E-E1-1,E-E1-2,E-E2-1,E-E2-2,E-F1-1,E-F1-2,E-F2-1,E-F2-2,E-G1-1,E-G1-2,E-H1-1,E-H1-2,E-H2-1,E-H2-2
E-A1-1,40.0,30.0,2,3,1,1,1,1,2,1,,,1,,,,2,3,1,1,,,,,,,1,
E-A1-2,30.0,35.0,2,3,1,1,1,1,2,1,,,1,,,,2,3,1,1,,,,,,,1,
E-A2-1,2.0,2.0,55,39,5,5,5,5,4,4,4.0,4.0,5,5.0,4.0,5.0,11,11,7,6,5.0,2.0,2.0,2.0,5.0,5.0,4,4.0
E-A2-2,3.0,3.0,39,53,5,5,5,5,6,5,4.0,4.0,5,4.0,6.0,6.0,11,12,7,6,4.0,2.0,2.0,2.0,6.0,5.0,5,4.0
E-B2-1,1.0,1.0,5,5,34,24,5,5,3,3,5.0,5.0,4,4.0,6.0,6.0,6,6,5,5,7.0,2.0,3.0,4.0,4.0,6.0,4,4.0
E-B2-2,1.0,1.0,5,5,24,31,5,5,3,3,5.0,5.0,4,4.0,6.0,6.0,5,5,5,5,7.0,2.0,2.0,3.0,4.0,5.0,4,4.0
E-C1-1,1.0,1.0,5,5,5,5,48,28,5,5,8.0,7.0,6,6.0,7.0,5.0,6,7,4,5,5.0,2.0,2.0,2.0,5.0,4.0,3,5.0
E-C1-2,1.0,1.0,5,5,5,5,27,38,4,4,6.0,6.0,6,6.0,5.0,4.0,6,6,4,4,4.0,2.0,2.0,2.0,4.0,4.0,4,5.0
E-C2-1,2.0,2.0,4,6,3,3,4,4,55,44,3.0,3.0,4,3.0,3.0,3.0,4,5,4,4,4.0,2.0,2.0,2.0,3.0,3.0,4,4.0
E-C2-2,1.0,1.0,4,5,3,3,5,5,45,54,3.0,3.0,3,3.0,3.0,3.0,4,4,4,4,4.0,2.0,2.0,2.0,3.0,3.0,3,5.0


Begin to analyze what's going on with some of the very frequently called hits.  

Sort by UID.  UIDs hit very frequently in patterns across the population may have been present in the founder lines.  

ECNR2.1 seems to especially have a few founder mutations, and interestingly well A1, uniquely, does not share the two most abundant mutations. Those two mutations are present in every other evolved line, but not the founder line, and are as follows: UID 6b18e433 - Gene purU AND UID a63282ad - Gene rusA. Then the third most abundant mutation is additionally not shared by well G1, and only present in one of two colonies from well F2, and is as follows: UID 5a07dac2 - Gene dcuS. Then the fourth most abundant mutation is additionally not shared by wells C2 and H2: UID 80fcf8a2 - Gene yeiE.

C321 has founder mutations that was screened out by variant quality analysis: UID a5616d4c - Gene glvC, UID 126680ba - Intergenic, UID 714efe6a - Gene lomR, UID 353d6035 - Gene xylB, UID a9216ab5 - Intergenic, UID 19fc57f6 - Intergenic, UID 606222fa - Intergenic, UID 7bb6e761 & 840b02b7 - Gene insB1.

C321_F11 has founder mutations that was screened out by variant quality analysis: UID b5381553 - Intergenic, UID 02b7ef3b - Intergenic, UID 714efe6a - Gene lomR, UID 7bb6e761 & 840b02b7 - Gene insB1.

Poor read quality alleles found in all three populations: UID 7f76dbc7 - Gene yneO, UID 2cde5396 - Intergenic, UID 77119983 - Intergenic, UID e0202b3b - Gene yaiO, UID 73016812 - Gene mtr.

Cluster of F populations that share founder mutations: wells E5/E6/F5/G5/G6. All wells share three unique mutations that are found nowhere else: eb59298a - Gene cmtB, 3db45b21 - Gene efeO, and 45ebd648 - Gene tyrR

Cluster of F populations that share founder muations: wells B5/B6/C6/H5. All wells share three unique mutations that are found nowhere else: 8a6ffc8b - Gene waaH, d3e9dd99 - Gene fryA, and 8a634e86 - Gene dcp.

Remove genes that appear to be parental variants: rusA, purU, and dcuS, from dataset.

In [9]:
problemic_UIDs = ['a63282ad','6b18e433','5a07dac2','80fcf8a2','a5616d4c','b5381553','02b7ef3b','126680ba','714efe6a',
                  '353d6035','a9216ab5','7f76dbc7','2cde5396','77119983','19fc57f6','606222fa','7bb6e761','840b02b7',
                  'e0202b3b','73016812','eb59298a','3db45b21','45ebd648','8a6ffc8b','8a634e86','d3e9dd99']

EmvcNEWc = EmvcNEW[~EmvcNEW.UID.isin(problemic_UIDs)]
CmvcNEWc = CmvcNEW[~CmvcNEW.UID.isin(problemic_UIDs)]
FmvcNEWc = FmvcNEW[~FmvcNEW.UID.isin(problemic_UIDs)]

# create a dataframe containing all UIDs for each sample
i = 0
all_EmutationsC = pd.DataFrame()
for sample in EmvcNEWc.EXPERIMENT_SAMPLE_LABEL.unique():
    Emutations = pd.DataFrame()
    Emutations = pd.DataFrame(EmvcNEWc[EmvcNEWc.EXPERIMENT_SAMPLE_LABEL == sample].UID)
    Emutations = Emutations.set_index(pd.Series(range(len(Emutations))))
    Emutations.columns = [sample]
    all_EmutationsC = pd.concat([all_EmutationsC,Emutations],axis=1)
    i += 1

i = 0
all_CmutationsC = pd.DataFrame()
for sample in CmvcNEWc.EXPERIMENT_SAMPLE_LABEL.unique():
    Cmutations = pd.DataFrame()
    Cmutations = pd.DataFrame(CmvcNEWc[CmvcNEWc.EXPERIMENT_SAMPLE_LABEL == sample].UID)
    Cmutations = Cmutations.set_index(pd.Series(range(len(Cmutations))))
    Cmutations.columns = [sample]
    all_CmutationsC = pd.concat([all_CmutationsC,Cmutations],axis=1)
    i += 1

i = 0
all_FmutationsC = pd.DataFrame()
for sample in FmvcNEWc.EXPERIMENT_SAMPLE_LABEL.unique():
    Fmutations = pd.DataFrame()
    Fmutations = pd.DataFrame(FmvcNEWc[FmvcNEWc.EXPERIMENT_SAMPLE_LABEL == sample].UID)
    Fmutations = Fmutations.set_index(pd.Series(range(len(Fmutations))))
    Fmutations.columns = [sample]
    all_FmutationsC = pd.concat([all_FmutationsC,Fmutations],axis=1)
    i += 1

# create a comparative dataframe that contains the number of shared UIDs for each sample pair
compare_EmutationsC = pd.DataFrame(index=all_EmutationsC.columns,columns=all_EmutationsC.columns)
for column in compare_EmutationsC.columns:
    for row in compare_EmutationsC.columns:
        testrow = all_EmutationsC[row].dropna()
        testcolumn = all_EmutationsC[column].dropna()
        test = testrow.isin(testcolumn)
        count = test.value_counts()
        if True in count:
            compare_EmutationsC[column][row] = count[True]

compare_CmutationsC = pd.DataFrame(index=all_CmutationsC.columns,columns=all_CmutationsC.columns)
for column in compare_CmutationsC.columns:
    for row in compare_CmutationsC.columns:
        testrow = all_CmutationsC[row].dropna()
        testcolumn = all_CmutationsC[column].dropna()
        test = testrow.isin(testcolumn)
        count = test.value_counts()
        if True in count:
            compare_CmutationsC[column][row] = count[True]

compare_FmutationsC = pd.DataFrame(index=all_FmutationsC.columns,columns=all_FmutationsC.columns)
for column in compare_FmutationsC.columns:
    for row in compare_FmutationsC.columns:
        testrow = all_FmutationsC[row].dropna()
        testcolumn = all_FmutationsC[column].dropna()
        test = testrow.isin(testcolumn)
        count = test.value_counts()
        if True in count:
            compare_FmutationsC[column][row] = count[True]
       
compare_EmutationsC = compare_EmutationsC.sort_index()
compare_EmutationsC = compare_EmutationsC.sort_index(axis = 1)
compare_CmutationsC = compare_CmutationsC.sort_index()
compare_CmutationsC = compare_CmutationsC.sort_index(axis = 1)
compare_FmutationsC = compare_FmutationsC.sort_index()
compare_FmutationsC = compare_FmutationsC.sort_index(axis = 1)

all_UIDs = pd.concat([all_EmutationsC.stack().value_counts(),all_CmutationsC.stack().value_counts(),
           all_FmutationsC.stack().value_counts()],axis = 1)
all_UIDs.columns = ['ECNR2','C321','C321_F11']
print("Culled gene variant UIDs that occur most frequently include: \n",
      all_UIDs[(all_UIDs.ECNR2 > 5) | (all_UIDs.C321 > 5) | (all_UIDs.C321_F11 > 5)].sort_values(by = 'ECNR2',ascending = False))


compare_FmutationsC

Culled gene variant UIDs that occur most frequently include: 
           ECNR2  C321  C321_F11
5ac9acba     15    15         4
40e04db3     10     2         9
4fa88f95     10     1         6
947e4eb7      8   NaN       NaN
d9b18ee8      7     6       NaN
dcb8adb0      7   NaN       NaN
e5ad4a3e      7     2         8
460e3a13      6   NaN       NaN
4ffd1247      6   NaN         4
c99348e8      6   NaN       NaN
9389f535      3     8       NaN
d7075859      2     3         6
a0bfa89f      1   NaN         6
2401d3be    NaN     9        22
800bc3d8    NaN     8       NaN
80ff041e    NaN     2         6
d3ded9fa    NaN     7       NaN


Unnamed: 0,F-A6-1,F-A6-2,F-B5-1,F-B5-2,F-B6-1,F-B6-2,F-C5-1,F-C5-2,F-C6-1,F-C6-2,F-D5-1,F-D5-2,F-D6-1,F-D6-2,F-E5-1,F-E5-2,F-E6-1,F-E6-2,F-F5-1,F-F5-2,F-F6-1,F-F6-2,F-G5-1,F-G5-2,F-G6-1,F-G6-2,F-H5-1,F-H5-2
F-A6-1,57.0,46.0,1,1,1.0,1.0,1.0,1.0,1.0,1.0,,,,,1.0,1.0,,,1.0,1.0,1.0,1.0,,2,,,2.0,
F-A6-2,46.0,52.0,1,1,1.0,1.0,1.0,1.0,1.0,1.0,,,,,1.0,1.0,,,1.0,,1.0,,,1,,,1.0,
F-B5-1,1.0,1.0,35,29,3.0,3.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,3,2.0,2.0,1.0,1.0
F-B5-2,1.0,1.0,29,39,3.0,3.0,2.0,2.0,1.0,1.0,3.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,3,2.0,2.0,1.0,1.0
F-B6-1,1.0,1.0,3,3,55.0,38.0,,,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1,2.0,2.0,2.0,2.0
F-B6-2,1.0,1.0,3,3,38.0,50.0,,,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2,2.0,2.0,2.0,2.0
F-C5-1,1.0,1.0,2,2,,,47.0,42.0,,,1.0,3.0,2.0,2.0,,,,,,,,,1.0,1,,,,
F-C5-2,1.0,1.0,2,2,,,41.0,59.0,,,1.0,2.0,2.0,2.0,,,,,,,,,1.0,1,,,,
F-C6-1,1.0,1.0,1,1,2.0,2.0,,,56.0,36.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,,
F-C6-2,1.0,1.0,1,1,2.0,2.0,,,36.0,56.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1,1.0,1.0,,1.0


Figure out newly culled set of threshold genes hit

Discount branched populations of C321:
Well B3 = D3
Well B4 = C4
Well C3 = E3
Well D4 = F4
Well E4 = G4
Well F3 = G3

Accomplish this by dividing all counts by 1.5 (to remove some excess counts, but keep some due to partial independence) except for the one unbranched well (A4).

In [10]:
adjustment_value = 1.25
adjusted_c321_counts = CmvcNEWc[CmvcNEWc.EXPERIMENT_SAMPLE_LABEL.str[2] == 'A'].INFO_EFF_GENE.value_counts()
adjusted_c321_counts *= (adjustment_value - 1)
adjusted_c321_counts = adjusted_c321_counts.add(CmvcNEWc.INFO_EFF_GENE.value_counts(),fill_value = 0)
adjusted_c321_counts /= adjustment_value
adjusted_c321_counts = adjusted_c321_counts.round()

threshold_genes_hitC = pd.concat([EmvcNEWc.INFO_EFF_GENE.value_counts(),adjusted_c321_counts,
                               FmvcNEWc.INFO_EFF_GENE.value_counts()], axis=1)
threshold_genes_hitC.columns = ['ECNR2','C321','C321_F11']
threshold_genes_hitC = threshold_genes_hitC[(threshold_genes_hitC.ECNR2 > threshold) | 
                                          (threshold_genes_hitC.C321 > threshold) | 
                                          (threshold_genes_hitC.C321_F11 > threshold)]

print('\nnumber of genes hit over threshold in evolved \nECNR2.1 populations:',
      threshold_genes_hitC[threshold_genes_hitC.ECNR2 > threshold].ECNR2.count(),
      ' --C321 poipulations:',
      threshold_genes_hitC[threshold_genes_hitC.C321 > threshold].C321.count(),
      ' --C321_F11 populations:',
      threshold_genes_hitC[threshold_genes_hitC.C321_F11 > threshold].C321_F11.count(),'\n')
print('genes hit: \n',threshold_genes_hitC)


number of genes hit over threshold in evolved 
ECNR2.1 populations: 10  --C321 poipulations: 14  --C321_F11 populations: 10 

genes hit: 
        ECNR2  C321  C321_F11
fimH     NaN     2        19
fis      NaN    16       NaN
flu       30     9        22
folA     NaN    10         4
insB1      3     1         7
lomR       4     6         4
mdtJ     NaN     2         8
ompT      10   NaN         1
oxyR     NaN     6        20
phoP       6     1         9
prfB       2    12        22
ptrB       3     6       NaN
purT     NaN     3        10
pykF      10     1         3
pyrC      10   NaN         2
recE       6   NaN         3
rlmB       2   NaN         6
rph       29    19        32
rpoB      13     2       NaN
rpoC      18     6         3
yahL     NaN     7         2
ycgB     NaN     6       NaN
ydfJ       5    10         4
yehB       1     7       NaN
yfiH       7     5       NaN
yhjK     NaN     7       NaN


In [11]:
Ftop_hits = pd.DataFrame()
for v in FmvcNEWc.EXPERIMENT_SAMPLE_LABEL.unique():
    hits = pd.Series(FmvcNEWc[(FmvcNEWc.EXPERIMENT_SAMPLE_LABEL == v) & 
                    (FmvcNEWc.INFO_EFF_GENE.isin(threshold_genes_hitC.index))].INFO_EFF_GENE.value_counts())
    hits.name = v
    Ftop_hits = pd.concat([Ftop_hits, hits], axis = 1)
Ftop_hits.sort_index().reindex_axis(sorted(Ftop_hits.columns),axis = 1)

Unnamed: 0,F-A6-1,F-A6-2,F-B5-1,F-B5-2,F-B6-1,F-B6-2,F-C5-1,F-C5-2,F-C6-1,F-C6-2,F-D5-1,F-D5-2,F-D6-1,F-D6-2,F-E5-1,F-E5-2,F-E6-1,F-E6-2,F-F5-1,F-F5-2,F-F6-1,F-F6-2,F-G5-1,F-G5-2,F-G6-1,F-G6-2,F-H5-1,F-H5-2
fimH,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,,,1.0,1.0,1.0,1.0
flu,1.0,1.0,1.0,1.0,1.0,1.0,,,,,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,
folA,1.0,1.0,,,,,,,,,,,,,,,,,,,1.0,1.0,,,,,,
insB1,,,,,,,,,,1.0,,,,1.0,,,,1.0,,1.0,1.0,1.0,,,,,,1.0
lomR,1.0,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,1.0,
mdtJ,,,,,,,1.0,1.0,,,,,,,,,,,,,,,1.0,1.0,1.0,,1.0,2.0
ompT,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,
oxyR,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0
phoP,,,,,1.0,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,1.0,,
prfB,,,1.0,1.0,1.0,1.0,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,


# Check TAG to TAAs

In [12]:
newTAApositions = pd.read_csv("mg1655v3_uag_locations.csv",dtype = {'IS_HET': str, 'AO': str, 'QA': str})

Cmvc.sort_values(by='POSITION')
Fmvc.sort_values(by='POSITION')

# Nested loop strategy to run through the 321 positions for all of the C321 strains
for x in range(0, 2):
    pos = newTAApositions[x:(x+1)]
    truth1 = all(Cmvc[Cmvc.POSITION == pos])
    truth2 = all(Fmvc[Fmvc.POSITION == pos])
    if truth1 == false or truth2 == false:
        print(pos)
# Any positions that are printed reflect at least one instance of a mutation at that position.
# If no positions are printed, then all TAG substitutions have remained as TAA.

OSError: File b'mg1655v3_uag_locations.csv' does not exist