# Notebook 5: PCA within Streptanthus



In [1]:
import ipyrad.analysis as ipa
import numpy as np
import toyplot.svg
import toytree

### DATA

In [3]:
# Relative paths to the "seqs" and "snps" HDF5 files in the Strept_min4 output
# directory.
# NOTE(review): SEQS is not referenced by any cell visible in this notebook —
# confirm whether it is still needed.
SEQS = "../analysis-ipyrad/Strept_min4_outfiles/Strept_min4.seqs.hdf5"
# SNPS is the file passed as `data=` to the ipa.pca calls in this notebook.
SNPS = "../analysis-ipyrad/Strept_min4_outfiles/Strept_min4.snps.hdf5"

In [43]:
# Read the RAxML bipartitions tree, root it on the A_thaliana_TAIR10 tip, then
# prune the outgroup and reference tips in a single chained expression.
tre = (
    toytree.tree("../analysis-raxml/RAxML_bipartitions.strept-min20-r05")
    .root("A_thaliana_TAIR10")
    .drop_tips(["A_thaliana_TAIR10", "Sy_irio_NJ_3877", "reference", "S_irio"])
)
# Uncomment to view node idx labels (the population maps below pick clades by idx):
# tre.draw(node_labels="idx", use_edge_lengths=False, node_sizes=20);

### PCA within Streptanthus

In [39]:
# Population map for the within-Streptanthus PCA: population name -> list of
# sample names. Most entries are whatever tre.get_tip_labels(<node idx>) returns;
# 'campestris-2' is listed by hand.
# NOTE(review): the node idx values are tied to the tree loaded above — re-check
# them if the tree file, rooting, or dropped tips change.
# NOTE(review): confirm the two 'campestris-2' samples are not also returned by
# tre.get_tip_labels(143); otherwise they would belong to two populations.
PMAP = {
    'howellii': tre.get_tip_labels(154),
    'longisiliquus': tre.get_tip_labels(147),
    'barbatus': tre.get_tip_labels(146),
    'oliganthus': tre.get_tip_labels(145),
    'cordatus': tre.get_tip_labels(144),
    'campestris': tre.get_tip_labels(143),
    'campestris-2': ['S_campestris_NJ_3784', 'S_campestris_NJ_3785'],
    'juneae': tre.get_tip_labels(120),
    'bernardinus': tre.get_tip_labels(119),
}

# Subset map used for the narrower PCA. It is derived from PMAP instead of being
# re-typed so the two maps cannot drift apart; each list is copied so that
# editing one map's sample list never alters the other.
SMAP = {
    name: list(PMAP[name])
    for name in ('campestris', 'campestris-2', 'juneae', 'bernardinus')
}

### Within Streptanthus

In [51]:
# Configure the PCA on the SNP file using the full population map.
# minmap maps every population name in PMAP to 1 (presumably the minimum number
# of samples with data required per population — confirm against the ipyrad docs).
tool = ipa.pca(
    data=SNPS,
    imap=PMAP,
    minmap=dict.fromkeys(PMAP, 1),
    mincov=0.85,
    impute_method="sample",
)

# 25 replicates with a fixed seed so the run can be repeated
tool.run(subsample=1, nreplicates=25, seed=123)

# plot the result and keep the canvas/axes handles returned by draw
canvas, axes = tool.draw(size=9, cycle=9)

Samples: 55
Sites before filtering: 711484
Filtered (indels): 0
Filtered (bi-allel): 27589
Filtered (mincov): 690558
Filtered (minmap): 633011
Filtered (subsample invariant): 493091
Filtered (combined): 703242
Sites after filtering: 8242
Sites containing missing values: 8223 (99.77%)
Missing values in SNP matrix: 52227 (11.52%)
Imputation: 'sampled'; (0, 1, 2) = 87.9%, 3.3%, 8.8%
Subsampling SNPs: 648/8242


In [48]:
# Redraw the PCA from the current `tool`, switch on tick marks for both axes,
# and export the canvas as an SVG figure. toyplot.svg is imported in the
# notebook's top import cell rather than here.
# NOTE(review): this uses whichever `tool` is live in the kernel; the output
# file name suggests the PMAP / mincov=0.85 analysis — confirm before exporting.
canvas, axes = tool.draw(size=9, cycle=9)
axes.x.ticks.show = True
axes.y.ticks.show = True
toyplot.svg.render(canvas, "../figures/pca-strept-pmap-min85.svg")

### Other filtering settings (mincov, imap, minmap)

In [40]:
# Same PMAP analysis with mincov=0.75 instead of the 0.85 used earlier.
# NOTE: this rebinds `tool`, replacing the previous PCA object.
tool = ipa.pca(
    data=SNPS,
    imap=PMAP,
    minmap=dict.fromkeys(PMAP, 1),
    mincov=0.75,
    impute_method="sample",
)
# 25 seeded replicates, then plot (trailing `;` hides the returned objects)
tool.run(subsample=1, nreplicates=25, seed=123)
tool.draw(size=9, cycle=9);

Samples: 55
Sites before filtering: 711484
Filtered (indels): 0
Filtered (bi-allel): 27589
Filtered (mincov): 657318
Filtered (minmap): 633011
Filtered (subsample invariant): 493091
Filtered (combined): 691623
Sites after filtering: 19861
Sites containing missing values: 19842 (99.90%)
Missing values in SNP matrix: 178916 (16.38%)
Imputation: 'sampled'; (0, 1, 2) = 86.5%, 3.5%, 10.0%
Subsampling SNPs: 1397/19861


In [41]:
# PCA restricted to the populations in SMAP, with every population name mapped
# to 2 in minmap.
# NOTE: this rebinds `tool`, replacing the previous PCA object.
tool = ipa.pca(
    data=SNPS,
    imap=SMAP,
    minmap=dict.fromkeys(SMAP, 2),
    mincov=0.75,
    impute_method="sample",
)
# 25 seeded replicates, then plot (trailing `;` hides the returned objects)
tool.run(subsample=1, nreplicates=25, seed=123)
tool.draw(size=9, cycle=9);

Samples: 20
Sites before filtering: 711484
Filtered (indels): 0
Filtered (bi-allel): 7459
Filtered (mincov): 624631
Filtered (minmap): 621642
Filtered (subsample invariant): 662029
Filtered (combined): 701032
Sites after filtering: 10452
Sites containing missing values: 9940 (95.10%)
Missing values in SNP matrix: 30372 (14.53%)
Imputation: 'sampled'; (0, 1, 2) = 73.5%, 10.1%, 16.4%
Subsampling SNPs: 2152/10452


In [37]:
# Re-run 25 seeded replicates on the current `tool` and redraw.
# NOTE(review): the recorded output of this cell ("Subsampling SNPs: 1181/16561")
# does not match the `tool` built in the preceding cell ("2152/10452"), so it
# appears to have been executed against a different `tool` object — re-run the
# notebook in order to confirm.
tool.run(subsample=1, nreplicates=25, seed=123)
tool.draw(size=9, cycle=9);

Subsampling SNPs: 1181/16561


In [30]:
# Run 10 replicates on the current `tool` and show them with draw_panels.
# A fixed seed (the same value the other tool.run calls in this notebook use)
# makes the run repeatable after a kernel restart; previously no seed was passed.
tool.run(subsample=1, nreplicates=10, seed=123)
tool.draw_panels();

Subsampling SNPs: 1181/16561


### Broader sampling

In [10]:
# NOTE(review): DATA is a placeholder and is not used by the cells visible here.
DATA = "..."

# Population map for the broader sampling: integer population id -> the labels
# returned by tre.get_tip_labels(<node idx>).
# Populations 1-4 (node idx 192, 184, 181, 178) are currently disabled.
IMAP = {
    pop: tre.get_tip_labels(node_idx)
    for pop, node_idx in {5: 174, 6: 168, 7: 161, 8: 152, 9: 151}.items()
}

### Infer PCA structure

In [11]:
# PCA object for the broader sampling defined by IMAP; every population id in
# IMAP is mapped to 2 in minmap.
tet = ipa.pca(
    data=SNPS,
    imap=IMAP,
    minmap=dict.fromkeys(IMAP, 2),
    mincov=0.75,
    impute_method="sample",
)

Samples: 76
Sites before filtering: 711484
Filtered (indels): 0
Filtered (bi-allel): 47560
Filtered (mincov): 667677
Filtered (minmap): 605932
Filtered (subsample invariant): 307664
Filtered (combined): 687371
Sites after filtering: 24113
Sites containing missing values: 24107 (99.98%)
Missing values in SNP matrix: 329132 (17.96%)
Imputation: 'sampled'; (0, 1, 2) = 87.6%, 4.9%, 7.6%


In [18]:
# Run 20 replicates on `tet` and plot the result.
# A fixed seed (the same value the tool.run calls in this notebook use) makes
# the run repeatable after a kernel restart; previously no seed was passed.
tet.run(nreplicates=20, seed=123)
tet.draw();

Subsampling SNPs: 1146/24113
