In [1]:
import numpy as np
from IPython.display import HTML
from bokeh.plotting import output_notebook, show
import genomes_dnj.lct_interval.series_plots as dm
import genomes_dnj.lct_interval.anal_series as an
import genomes_dnj.lct_interval_snp_anal.lct_interval_snp_anal as snp_anal
import genomes_dnj.lct_interval.series_masks as sm
# Send Bokeh plots to the notebook output area, suppressing the Bokeh load banner.
output_notebook(hide_banner=True)
# Analysis object for series 64_1575; reused by every SNP-count cell below.
# NOTE(review): constructor semantics live in lct_interval_snp_anal — confirm there.
sa_64_1575 = snp_anal.series_anal_cls(dm.di_64_1575, sm.series_data)

<h3>EUR Root Fragments</h3>
<div style="width:700px">
<p>
The upper 400,000 base region of the EUR tree root includes the series 26_1414, 10_2206, 64_1575, and
7_1868.  These 4 series appear to have had a history as a single series of SNPs.  Samples that
express 26_1414 without 10_2206 are very uncommon.  But there are a substantial number of samples
that express 26_1414 and 10_2206 without 64_1575.  This notebook examines the fragments of
the 64_1575 SNPs that are often expressed by these samples in association with 26_1414 and
10_2206.
<p>
This notebook focuses on samples that express the most significant series in the EUR tree root
but not any of its descendants.  The first plot shows results for samples selected for the
expression of 26_1414 and 6_1503 but not 64_1575.  These criteria identified 38 samples.  All
of them expressed both 4_1699 and 10_2206.  None of them come from European populations.
The samples from overexpressed populations are the 20 American and the 15 East Asian.
</div>

In [2]:
# Per the narration above: select samples on 26_1414 that also express 6_1503
# (the "yes" list) but do not express 64_1575 (the "no" list).
# NOTE(review): the meaning of min_match is defined in series_plots — confirm there.
plt_obj = dm.superset_basis_yes_no(dm.di_26_1414, [dm.di_6_1503], [dm.di_64_1575], min_match=0.1)
plt = plt_obj.do_plot()
# Keep the mask of the selected samples; the 64_1575 analysis cells below take it as input.
am_not_64_1575 = plt_obj.plot_context.yes_allele_mask
show(plt)

In [3]:
# Display the HTML produced by the current plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.02,38,1.0,0,0.0,1,0.42,20,3.8,15,1.96,0,0.0,1,0.36,1,0.21
353462,135915358,79721,4,1699,0.02,38,1.0,0,0.0,1,0.42,20,3.8,15,1.96,0,0.0,1,0.36,1,0.21
353283,135771974,368330,6,1503,0.03,38,1.0,0,0.0,1,0.42,20,3.8,15,1.96,0,0.0,1,0.36,1,0.21
353797,136398174,75924,26,1414,0.03,38,1.0,0,0.0,1,0.42,20,3.8,15,1.96,0,0.0,1,0.36,1,0.21


<div style="width:700px">
<p>
The count data below shows the number of samples among the 38 identified above
that expressed different numbers of SNPs from the series 64_1575.
</div>

In [4]:
# Tabulate how many of the selected samples express each distinct number of 64_1575 SNPs.
# NOTE(review): judging by the displayed output and the later per-value cells, the first
# field of each record is the SNPs-per-sample value and the second is the number of
# samples, which looks swapped relative to the dtype field names — confirm in snp_anal.
count_data = sa_64_1575.unique_snps_per_allele(am_not_64_1575)
count_data

array([(30, 1), (31, 28), (32, 4), (37, 1), (39, 1), (40, 2), (57, 1)], 
      dtype=[('count', '<u2'), ('snps', '<u2')])

<div style="width:700px">
<p>
The array below shows the number of the 38 samples that express each of the 64 SNPs
in the series 64_1575.  The first 30 SNPs in 64_1575 are expressed by all 38 samples.
</div>

In [5]:
# For each SNP of series 64_1575, show how many of the selected samples express it.
sa_64_1575.alleles_per_snp(am_not_64_1575)

array([38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
       38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 37,  3,  7,  3,
        3,  3,  3,  3,  3,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        2,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2])

<div style="width:700px">
<p>
The most common case identified above is the 28 samples that express 31 series
64_1575 SNPs.  The data below shows that all of these samples express the first
31 64_1575 SNPs.  This pattern fits a single recombination event.  None of these
28 chromosomes express any series in the region beyond the common association of
6_1503, 4_1699, 26_1414, and 10_2206.
</div>

In [6]:
# Isolate the selected samples that express 31 of the 64_1575 SNPs.
# aps_31 (displayed) is the per-SNP count over those samples; am_31 feeds the next plot.
# NOTE(review): am_31 is presumably the allele mask of that subset — confirm in snp_anal.
aps_31, am_31 = sa_64_1575.snps_from_aps_value(31, am_not_64_1575)
aps_31

array([28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [7]:
# Plot the series associated with the samples in am_31.
# NOTE(review): the meaning of min_match=0.001 is defined in series_plots — confirm there.
plt_obj = dm.superset_allele_mask(am_31, min_match=0.001)
plt = plt_obj.do_plot()
show(plt)

In [8]:
# Display the HTML produced by the am_31 plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.01,28,1.0,0,0.0,1,0.57,14,3.61,13,2.31,0,0.0,0,0.0,0,0.0
353462,135915358,79721,4,1699,0.02,28,1.0,0,0.0,1,0.57,14,3.61,13,2.31,0,0.0,0,0.0,0,0.0
353283,135771974,368330,6,1503,0.02,28,1.0,0,0.0,1,0.57,14,3.61,13,2.31,0,0.0,0,0.0,0,0.0
353797,136398174,75924,26,1414,0.02,28,1.0,0,0.0,1,0.57,14,3.61,13,2.31,0,0.0,0,0.0,0,0.0


<div style="width:700px">
<p>
The data below is for 4 samples that express 32 64_1575 SNPs.  Again
all of the first 31 SNPs are expressed.  The samples all come from AMR or EAS
populations.  None of the 4 samples express the SNP immediately after the first 31, but all
express the one after it.  This pattern would fit an event that mutated that SNP in
an instance of the population derived from the recombination event identified above.
</div>

In [9]:
# Isolate the selected samples that express 32 of the 64_1575 SNPs.
# aps_32 (displayed) is the per-SNP count over those samples; am_32 feeds the next plot.
# NOTE(review): am_32 is presumably the allele mask of that subset — confirm in snp_anal.
aps_32, am_32 = sa_64_1575.snps_from_aps_value(32, am_not_64_1575)
aps_32

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [10]:
# Plot the series associated with the samples in am_32.
# NOTE(review): the meaning of min_match=0.001 is defined in series_plots — confirm there.
plt_obj = dm.superset_allele_mask(am_32, min_match=0.001)
plt = plt_obj.do_plot()
show(plt)

In [11]:
# Display the HTML produced by the am_32 plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.0,4,1.0,0,0.0,0,0.0,2,3.61,2,2.48,0,0.0,0,0.0,0,0.0
353462,135915358,79721,4,1699,0.0,4,1.0,0,0.0,0,0.0,2,3.61,2,2.48,0,0.0,0,0.0,0,0.0
353283,135771974,368330,6,1503,0.0,4,1.0,0,0.0,0,0.0,2,3.61,2,2.48,0,0.0,0,0.0,0,0.0
353797,136398174,75924,26,1414,0.0,4,1.0,0,0.0,0,0.0,2,3.61,2,2.48,0,0.0,0,0.0,0,0.0


<div style="width:700px">
<p>
The data below identifies another apparent recombination event that has left the
first 40 SNPs of 64_1575 associated with the lower region of the EUR tree root.
This event appears to have taken place in a South Asian population.
</div>

In [12]:
# Isolate the selected samples that express 40 of the 64_1575 SNPs.
# aps_40 (displayed) is the per-SNP count over those samples; am_40 feeds the next plot.
# NOTE(review): am_40 is presumably the allele mask of that subset — confirm in snp_anal.
aps_40, am_40 = sa_64_1575.snps_from_aps_value(40, am_not_64_1575)
aps_40

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
# Plot the series associated with the samples in am_40.
# NOTE(review): the meaning of min_match=0.001 is defined in series_plots — confirm there.
plt_obj = dm.superset_allele_mask(am_40, min_match=0.001)
plt = plt_obj.do_plot()
show(plt)

In [14]:
# Display the HTML produced by the am_40 plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,6.88,1,4.08
353462,135915358,79721,4,1699,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,6.88,1,4.08
353283,135771974,368330,6,1503,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,6.88,1,4.08
353797,136398174,75924,26,1414,0.0,2,1.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,6.88,1,4.08


<div style="width:700px">
<p>
The data below identifies one American sample that expresses the first 30 SNPs
of 64_1575.  Perhaps it is a separate recombination event.  Or perhaps it is a case
of an instance of the 31 SNP recombination event going through another genetic event
that mutated the last SNP back to the common value.
</div>

In [15]:
# Isolate the selected samples that express 30 of the 64_1575 SNPs.
# aps_30 (displayed) is the per-SNP count over those samples; am_30 feeds the next plot.
# NOTE(review): am_30 is presumably the allele mask of that subset — confirm in snp_anal.
aps_30, am_30 = sa_64_1575.snps_from_aps_value(30, am_not_64_1575)
aps_30

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [16]:
# Plot the series associated with the samples in am_30.
# NOTE(review): the meaning of min_match=0.001 is defined in series_plots — confirm there.
plt_obj = dm.superset_allele_mask(am_30, min_match=0.001)
plt = plt_obj.do_plot()
show(plt)

In [17]:
# Display the HTML produced by the am_30 plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353462,135915358,79721,4,1699,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353283,135771974,368330,6,1503,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353797,136398174,75924,26,1414,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0


<div style="width:700px">
<p>
The data below identifies one American sample that expresses the first 31 SNPs
and the last 6 of 64_1575.  It looks like another American instance of the 31 SNP mutation
that has experienced a second recombination event that restored 7_1868 and an associated
fragment of the end of 64_1575.
</div>

In [18]:
# Isolate the selected samples that express 37 of the 64_1575 SNPs.
# aps_37 (displayed) is the per-SNP count over those samples; am_37 feeds the next plot.
# NOTE(review): am_37 is presumably the allele mask of that subset — confirm in snp_anal.
aps_37, am_37 = sa_64_1575.snps_from_aps_value(37, am_not_64_1575)
aps_37

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

In [19]:
# Plot the series associated with the samples in am_37.
# NOTE(review): the meaning of min_match=0.001 is defined in series_plots — confirm there.
plt_obj = dm.superset_allele_mask(am_37, min_match=0.001)
plt = plt_obj.do_plot()
show(plt)

In [20]:
# Display the HTML produced by the am_37 plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
354170,136682274,93624,7,1868,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353462,135915358,79721,4,1699,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353283,135771974,368330,6,1503,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353797,136398174,75924,26,1414,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0


<div style="width:700px">
<p>
The data below shows another American sample that appears to be derived
from the 31 SNP recombination event.  It also expresses the last 7 SNPs
of 64_1575 and 1 in the middle.  It is most likely the result of another
event that recombined the 31 SNP fragment with 7_1868 and an overlapping
fragment of the upper part of 64_1575.  Somewhere in the history another
mutation generated the isolated middle SNP.
</div>

In [21]:
# Isolate the selected samples that express 39 of the 64_1575 SNPs.
# aps_39 (displayed) is the per-SNP count over those samples; am_39 feeds the next plot.
# NOTE(review): am_39 is presumably the allele mask of that subset — confirm in snp_anal.
aps_39, am_39 = sa_64_1575.snps_from_aps_value(39, am_not_64_1575)
aps_39

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])

In [22]:
# Plot the series associated with the samples in am_39.
# NOTE(review): the meaning of min_match=0.001 is defined in series_plots — confirm there.
plt_obj = dm.superset_allele_mask(am_39, min_match=0.001)
plt = plt_obj.do_plot()
show(plt)

In [23]:
# Display the HTML produced by the am_39 plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
354170,136682274,93624,7,1868,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353462,135915358,79721,4,1699,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353283,135771974,368330,6,1503,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353797,136398174,75924,26,1414,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0


<div style="width:700px">
<p>
The data below shows another American sample that has experienced a
recombination event that left it expressing the first 57 SNPs of 64_1575.
</div>

In [24]:
# Isolate the selected samples that express 57 of the 64_1575 SNPs.
# aps_57 (displayed) is the per-SNP count over those samples; am_57 feeds the next plot.
# NOTE(review): am_57 is presumably the allele mask of that subset — confirm in snp_anal.
aps_57, am_57 = sa_64_1575.snps_from_aps_value(57, am_not_64_1575)
aps_57

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])

In [25]:
# Plot the series associated with the samples in am_57.
# NOTE(review): the meaning of min_match=0.001 is defined in series_plots — confirm there.
plt_obj = dm.superset_allele_mask(am_57, min_match=0.001)
plt = plt_obj.do_plot()
show(plt)

In [26]:
# Display the HTML produced by the am_57 plot object as this cell's output.
HTML(plt_obj.get_html())

index,first,length,snps,alleles,alleles.1,matches,matches.1,afr,afr.1,afx,afx.1,amr,amr.1,eas,eas.1,eur,eur.1,sas,sas.1,sax,sax.1
353921,136501840,53819,10,2206,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353462,135915358,79721,4,1699,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353283,135771974,368330,6,1503,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
353797,136398174,75924,26,1414,0.0,1,1.0,0,0.0,0,0.0,1,7.22,0,0.0,0,0.0,0,0.0,0,0.0
