## Viburnum Oreinotinus PCAs
PCA and other dimensionality-reduction methods (UMAP) applied to the ipyrad-assembled RAD SNP data.

In [1]:
import pandas as pd
import ipyrad.analysis as ipa
# record which ipyrad version provided the analysis tools used below
print(ipa.__version__)

1.0.0-alpha


### Path to ipyrad assembled RAD SNPs database

In [2]:
# Location of the ipyrad SNPs database (.snps.hdf5) passed as `data=` to
# every ipa.pca call in this notebook.
# NOTE(review): this is an absolute path under /home/deren; it must be
# adjusted to run the notebook on another machine.
DATA = "/".join(
    [
        "/home/deren/Documents/Viburnum-Oreinotinus/assembly_hic_feb",
        "full_dataset_outfiles",
        "full_dataset.snps.hdf5",
    ]
)

### Load metadata

In [3]:
# both CSV tables are read from the same Raw_data folder on GitHub
raw_data_url = (
    "https://raw.githubusercontent.com/eaton-lab/"
    "Orienotinus-phylogeny/main/Raw_data/"
)

# sample database; the columns used later in this notebook are
# full_dataset_withAyava, NameInAssembly and Last_valid_name
fulldata = pd.read_csv(raw_data_url + "oreinotinus_samples_database.csv")

# region table; the columns used later in this notebook are
# Species, RegionShort and Color
regions = pd.read_csv(raw_data_url + "oreinotinus_color_codes.csv")

In [4]:
# map every short region label to its color code
region_colors = dict(zip(regions["RegionShort"], regions["Color"]))

# show the mapping ordered by region label
dict(sorted(region_colors.items()))

{'CR': '#501644',
 'Chiapas/CA': '#d35f5f',
 'E.Col/Ven': '#5555ff',
 'E.Mex': '#a8ffaa',
 'Jam': '#ffd42a',
 'Oaxaca': '#87cdde',
 'S.Col/N.Ecu': '#de87cd',
 'S.Ecu/N.Peru': '#ff6600',
 'S.Peru/Bol': '#d4ff2a',
 'W.Col': '#66ff00',
 'W.Mex': '#00966D',
 'out': '#a09999'}

### Population assignments

In [5]:
# build a dict mapping region names to lists of samples
IMAP = {}

# keep only the rows flagged "1" for the full dataset
# (the flag is compared as a string, exactly as stored in the CSV)
in_full_dataset = fulldata[fulldata.full_dataset_withAyava == "1"]

for sample, species in zip(
    in_full_dataset.NameInAssembly, in_full_dataset.Last_valid_name
):
    # "dentatum" samples are left out of every population, so skip them
    # before the region lookup instead of after it
    if species == "dentatum":
        continue

    # RegionShort of the first `regions` row whose Species matches
    region_matches = regions.loc[regions.Species == species, "RegionShort"]
    if region_matches.empty:
        # fail with the offending names instead of a bare IndexError
        raise KeyError(
            "species {!r} (sample {!r}) has no row in `regions`".format(
                species, sample
            )
        )

    IMAP.setdefault(region_matches.iloc[0], []).append(sample)

In [6]:
# how many samples were assigned to each region, ordered by region label
{region: len(samples) for region, samples in sorted(IMAP.items())}

{'CR': 2,
 'Chiapas/CA': 51,
 'E.Col/Ven': 16,
 'E.Mex': 28,
 'Jam': 2,
 'Oaxaca': 31,
 'S.Col/N.Ecu': 6,
 'S.Ecu/N.Peru': 15,
 'S.Peru/Bol': 6,
 'W.Col': 19,
 'W.Mex': 2}

### PCA Analysis: Mexico and Central America

In [7]:
# regions included in the Mexico / Central America analysis
select = [
    "CR",
    "Chiapas/CA",
    "E.Mex",
    "W.Mex",
    "Jam",
    "Oaxaca",
]

# subset of IMAP restricted to those regions, in the order listed above
imap_mex_central_america = dict((region, IMAP[region]) for region in select)

In [8]:
# PCA tool for the Mexico / Central America populations only.
# mincov, minmap and minmaf are SNP filters; the number of SNPs removed by
# each one is reported in the filter statistics printed by this cell.
# cores=20 starts a 20-engine ipcluster (see the log line below).
tool = ipa.pca(
    data=DATA,
    imap=imap_mex_central_america,
    mincov=0.75,
    minmap=0.5,
    minmaf=0.02,
    cores=20,
    random_seed=123,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95mcluster.py  [0m [37m|[0m [1m[22mEstablishing parallel ipcluster: [0m[1m[0m20 engines.
[1mINFO[0m [37m|[0m [38;2;0;0;95mprogress.py [0m [37m|[0m [1m[22m[####################] SNP filtering[0m[1m[0m
[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22mfilter statistics:
samples                                          116
pre_filter_snps                              3419673
pre_filter_percent_missing                        67
filter_by_indels_present                           0
filter_by_non_biallelic                       129070
filter_by_mincov                             3392855
filter_by_minmap                             3153237
filter_by_invariant_after_subsampling        1611534
filter_by_minor_allele_frequency             2146978
post_filter_snps                                2931
post_filter_snp_containing_linkage_blocks        823
post_filter_percent_missing                       21
dtype: int64[

In [9]:
# run the PCA on subsampled unlinked SNPs (subsample=True)
# NOTE(review): the positional 50 is presumably the number of replicate
# runs -- confirm against the ipa.pca.run signature
tool.run(
    50,
    subsample=True,
    random_seed=333,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22msubsampled 823 unlinked SNPs.[0m[1m[0m


In [10]:
# draw the PCA result with one color per population, taken in the key order
# of the imap given to ipa.pca; the trailing ";" suppresses the cell's repr
tool.draw(
    size=8,
    centroids_only=True,
    colors=[region_colors[region] for region in imap_mex_central_america],
    # outfile="./figures/Oreinotinus-CA.svg",
);

In [12]:
# UMAP projection of the same populations, again on subsampled unlinked SNPs
tool.run_umap(
    n_neighbors=5,
    min_dist=0.8,
    subsample=True,
    random_seed=333,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22msubsampled 823 unlinked SNPs.[0m[1m[0m


In [15]:
# draw the result of the most recent tool run (the UMAP projection);
# unlike the PCA drawing above, centroids_only is not set here
tool.draw(
    size=8,
    colors=[region_colors[region] for region in imap_mex_central_america],
    # outfile="./figures/Oreinotinus-CA-umap.svg",
);

### PCA Analysis: South America

In [7]:
# regions included in the South America analysis
# (this reassigns `select` from the Mexico / Central America section)
select = [
    "W.Col",
    "S.Peru/Bol",
    "S.Ecu/N.Peru",
    "E.Col/Ven",
    "S.Col/N.Ecu",
]

# subset of IMAP restricted to those regions, in the order listed above
imap_south_america = dict((region, IMAP[region]) for region in select)

In [8]:
# PCA tool for the South American populations only; this rebinds `tool`,
# replacing the Mexico / Central America tool created earlier.
# Filter settings are the same as in the other PCA sections.
tool = ipa.pca(
    data=DATA,
    imap=imap_south_america,
    mincov=0.75,
    minmap=0.5,
    minmaf=0.02,
    cores=20,
    random_seed=123,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95mcluster.py  [0m [37m|[0m [1m[22mEstablishing parallel ipcluster: [0m[1m[0m20 engines.
[1mINFO[0m [37m|[0m [38;2;0;0;95mprogress.py [0m [37m|[0m [1m[22m[####################] SNP filtering[0m[1m[0m
[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22mfilter statistics:
samples                                           62
pre_filter_snps                              3419673
pre_filter_percent_missing                        57
filter_by_indels_present                           0
filter_by_non_biallelic                        72203
filter_by_mincov                             3083036
filter_by_minmap                             2670397
filter_by_invariant_after_subsampling        2381817
filter_by_minor_allele_frequency             2463579
post_filter_snps                               37737
post_filter_snp_containing_linkage_blocks      11006
post_filter_percent_missing                       19
dtype: int64[

In [9]:
# run the PCA on subsampled unlinked SNPs (subsample=True)
# NOTE(review): the positional 50 is presumably the number of replicate
# runs -- confirm against the ipa.pca.run signature
tool.run(
    50,
    subsample=True,
    random_seed=333,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22msubsampled 11006 unlinked SNPs.[0m[1m[0m


In [11]:
# draw the PCA result with one color per population, taken in the key order
# of imap_south_america; the trailing ";" suppresses the cell's repr
tool.draw(
    size=8,
    centroids_only=True,
    colors=[region_colors[region] for region in imap_south_america],
    # outfile="./figures/Oreinotinus-SA.svg",
);

In [13]:
# UMAP projection of the South American populations on subsampled unlinked SNPs
tool.run_umap(
    n_neighbors=3,
    min_dist=0.15,
    subsample=True,
    random_seed=333,
)

# draw the projection, one color per population in imap_south_america order
tool.draw(
    size=8,
    centroids_only=True,
    colors=[region_colors[region] for region in imap_south_america],
    # outfile="./figures/Oreinotinus-SA-umap.svg",
);

[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22msubsampled 11006 unlinked SNPs.[0m[1m[0m


### PCA Analysis: Full sampling

In [8]:
# PCA tool for every population in IMAP (full sampling); this rebinds `tool`.
# Filter settings are the same as in the regional PCA sections.
tool = ipa.pca(
    data=DATA,
    imap=IMAP,
    mincov=0.75,
    minmap=0.5,
    minmaf=0.02,
    cores=20,
    random_seed=123,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95mcluster.py  [0m [37m|[0m [1m[22mEstablishing parallel ipcluster: [0m[1m[0m20 engines.
[1mINFO[0m [37m|[0m [38;2;0;0;95mprogress.py [0m [37m|[0m [1m[22m[####################] SNP filtering[0m[1m[0m
[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22mfilter statistics:
samples                                          178
pre_filter_snps                              3419673
pre_filter_percent_missing                        63
filter_by_indels_present                           0
filter_by_non_biallelic                       193270
filter_by_mincov                             3357698
filter_by_minmap                             3225437
filter_by_invariant_after_subsampling         908359
filter_by_minor_allele_frequency             2130075
post_filter_snps                                5497
post_filter_snp_containing_linkage_blocks       1843
post_filter_percent_missing                       21
dtype: int64[

In [7]:
# NOTE(review): this cell makes the same ipa.pca call as the cell directly
# above it, yet its recorded filter statistics differ (post_filter_snps 5714
# here vs 5497 above). The tool.run cell below reports 1843 subsampled SNPs,
# which matches the cell above, so this cell's output looks stale.
# TODO: keep only one of the two cells and re-run the notebook from a fresh
# kernel.
tool = ipa.pca(
    data=DATA,
    imap=IMAP,
    mincov=0.75,
    minmap=0.5,
    minmaf=0.02,
    cores=20,
    random_seed=123,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95mcluster.py  [0m [37m|[0m [1m[22mEstablishing parallel ipcluster: [0m[1m[0m20 engines.
[1mINFO[0m [37m|[0m [38;2;0;0;95mprogress.py [0m [37m|[0m [1m[22m[####################] SNP filtering[0m[1m[0m
[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22mfilter statistics:
samples                                          178
pre_filter_snps                              3419673
pre_filter_percent_missing                        63
filter_by_indels_present                           0
filter_by_non_biallelic                       193270
filter_by_mincov                             3357698
filter_by_minmap                             2552916
filter_by_invariant_after_subsampling         908359
filter_by_minor_allele_frequency             2130075
post_filter_snps                                5714
post_filter_snp_containing_linkage_blocks       1917
post_filter_percent_missing                       21
dtype: int64[

In [9]:
# run the PCA on subsampled unlinked SNPs (subsample=True)
# NOTE(review): the positional 50 is presumably the number of replicate
# runs -- confirm against the ipa.pca.run signature
tool.run(
    50,
    subsample=True,
    random_seed=333,
)

[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22msubsampled 1843 unlinked SNPs.[0m[1m[0m


In [12]:
# draw the full-sampling PCA with one color per population, taken in the key
# order of IMAP; the trailing ";" suppresses the cell's repr
tool.draw(
    size=8,
    cycle=11,
    centroids_only=True,
    colors=[region_colors[region] for region in IMAP],
    # outfile="./figures/Oreinotinus-full.svg",
);

In [38]:
# UMAP projection of all populations on subsampled unlinked SNPs
# (this section uses random_seed=777, unlike the 333 used in the regional
# sections)
tool.run_umap(
    n_neighbors=55,
    min_dist=0.25,
    subsample=True,
    random_seed=777,
)

# draw the projection, one color per population in IMAP key order
tool.draw(
    size=8,
    cycle=11,
    colors=[region_colors[region] for region in IMAP],
    # outfile="./figures/Oreinotinus-full-umap.svg",
);

[1mINFO[0m [37m|[0m [38;2;0;0;95msnps_extract[0m [37m|[0m [1m[22msubsampled 1843 unlinked SNPs.[0m[1m[0m
