## Viburnum Oreinotinus PCAs
And other dimensionality reduction methods.

In [1]:
import os

import dbgdrive
import ipyrad.analysis as ipa
# Record the ipyrad.analysis version used to produce the outputs in this notebook.
print(ipa.__version__)

1.0.0-alpha


### Assembled dataset

In [2]:
# Location of the SNP database (.snps.hdf5) from the full-dataset assembly;
# every PCA tool in this notebook reads from this file.
DATA = "/".join([
    "/home/deren/Documents/Viburnum-Oreinotinus/assembly_hic_feb",
    "full_dataset_outfiles",
    "full_dataset.snps.hdf5",
])

### Metadata

In [3]:
# Load the per-sample metadata sheet.
# The API key is read from the environment instead of being embedded in the
# notebook: export GOOGLE_API_KEY before starting the kernel (a missing
# variable raises KeyError here rather than failing later inside dbgdrive).
# NOTE(review): the key that was previously committed in this cell should be
# treated as leaked and rotated.
fulldata = dbgdrive.get_database(
    sheet_name='sample-data',
    id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs',
    api_key=os.environ["GOOGLE_API_KEY"],
)

In [4]:
# Load the species -> region table.
# These regions correspond to the early-March 2021 version of the sheet, not
# to the late-March or April versions.
# The API key is read from the environment instead of being embedded in the
# notebook: export GOOGLE_API_KEY before starting the kernel.
# NOTE(review): the key that was previously committed in this cell should be
# treated as leaked and rotated.
regions = dbgdrive.get_database(
    sheet_name='regions',
    id_spreadsheet='1mnbZVtnI4AQDseWaawV2au4bNyFD_B9M0z8REMXbOcs',
    api_key=os.environ["GOOGLE_API_KEY"],
)

In [5]:
# Map each unique region name to its plotting color.
# The two columns are paired positionally with zip(), so the result does not
# depend on the table's index labels being 0..n-1. Rows with no region (None)
# are skipped; if a region repeats, the last color listed wins.
region_colors = {
    region: color
    for region, color in zip(regions["RegionUnique"], regions["ColorRegionUnique"])
    if region is not None
}

In [6]:
# Display the region -> color mapping so the palette is visible in the notebook.
region_colors

{'11th_area': '#00966D',
 'bol': '#d4ff2a',
 'central': '#d35f5f',
 'col_e': '#5555ff',
 'col_w': '#66ff00',
 'costa': '#501644',
 'ecu_n': '#de87cd',
 'jamaica': '#ffd42a',
 'mex_e': '#a8ffaa',
 'mex_w': '#87cdde',
 'out': '#a09999',
 'peru': '#ff6600'}

### Population assignments

In [7]:
# Build the population map: region name -> list of sample names as they
# appear in the assembly. A specimen's region is looked up via its species.
imap = {}
for _idx, row in fulldata.iterrows():
    # only specimens flagged as part of the full dataset are considered
    if not row["full_dataset_withAyava"]:
        continue
    # dentatum samples are left out of every population
    if row["Last_valid_name"] == "dentatum":
        continue
    species = row["Last_valid_name"]
    species_rows = regions[regions["Species"] == species]
    # first matching row decides the region (IndexError if the species is absent)
    region = species_rows["Region"].values[0]
    imap.setdefault(region, []).append(row["NameInAssembly"])

### PCA analysis: Mexico and Central America

In [8]:
# Regions analyzed in this section; the sub-imap keeps them in this order.
select = ["central", "mex_w", "mex_e", "11th_area", "jamaica", "costa"]
imap_central_america = dict((name, imap[name]) for name in select)

In [9]:
# PCA tool restricted to the samples in the Central America imap.
# mincov / minmap / minmaf are the SNP filter thresholds whose effects are
# reported in the "filter statistics" log printed by this cell.
tool = ipa.pca(
    data=DATA,
    imap=imap_central_america,
    mincov=0.75,
    minmap=0.5,
    minmaf=0.02,
    cores=20,
    random_seed=123,
)

Establishing parallel cluster (20 cores)
[####################] 100% 0:00:10 | SNP filtering        | ipa |


[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30mfilter statistics:
samples                                          116
pre_filter_snps                              3419673
pre_filter_percent_missing                        67
filter_by_indels_present                           0
filter_by_non_biallelic                       129070
filter_by_mincov                             3392855
filter_by_minmap                             3153237
filter_by_invariant_after_subsampling        1611534
filter_by_minor_allele_frequency             2146978
post_filter_snps                                2931
post_filter_snp_containing_linkage_blocks        823
post_filter_percent_missing                       21
dtype: int64[0m
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 74.6%, 16.6%, 8.8%[0m


In [14]:
# Run the PCA; the first argument (50) is passed positionally as in the original.
tool.run(50, subsample=True, random_seed=333)

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 823 unlinked SNPs.[0m


In [24]:
# Plot the PCA; one color per population, taken in imap key order.
tool.draw(
    colors=[region_colors[pop] for pop in imap_central_america],
    size=8,
    centroids_only=True,
    #outfile="./figures/Oreinotinus-CA.svg",
);

In [56]:
# UMAP embedding of the Central America subset.
tool.run_umap(
    n_neighbors=5,
    min_dist=0.8,
    subsample=True,
    random_seed=333,
)

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 823 unlinked SNPs.[0m


In [59]:
# Plot the UMAP result; one color per population, taken in imap key order.
tool.draw(
    colors=[region_colors[pop] for pop in imap_central_america],
    size=8,
    centroids_only=True,
    #outfile="./figures/Oreinotinus-CA-umap.svg",
);

### PCA Analysis: South America

In [60]:
# Regions analyzed in this section; the sub-imap keeps them in this order.
select = ["col_w", "bol", "peru", "col_e", "ecu_n"]
imap_south_america = dict((name, imap[name]) for name in select)

In [61]:
# PCA tool restricted to the samples in the South America imap, with the
# same filter thresholds as the Central America analysis.
tool = ipa.pca(
    data=DATA,
    imap=imap_south_america,
    mincov=0.75,
    minmap=0.5,
    minmaf=0.02,
    cores=20,
    random_seed=123,
)

Establishing parallel cluster (20 cores)
[####################] 100% 0:00:06 | SNP filtering        | ipa |


[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30mfilter statistics:
samples                                           62
pre_filter_snps                              3419673
pre_filter_percent_missing                        57
filter_by_indels_present                           0
filter_by_non_biallelic                        72203
filter_by_mincov                             3083036
filter_by_minmap                             2670397
filter_by_invariant_after_subsampling        2381817
filter_by_minor_allele_frequency             2463579
post_filter_snps                               37737
post_filter_snp_containing_linkage_blocks      11006
post_filter_percent_missing                       19
dtype: int64[0m
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 62.8%, 24.2%, 12.9%[0m


In [28]:
# Run the PCA; the first argument (50) is passed positionally as in the original.
tool.run(50, subsample=True, random_seed=333)

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 11006 unlinked SNPs.[0m


In [31]:
# Plot the PCA; one color per population, taken in imap key order.
tool.draw(
    colors=[region_colors[pop] for pop in imap_south_america],
    size=8,
    centroids_only=True,
    #outfile="./figures/Oreinotinus-SA.svg",
);

In [88]:
# UMAP embedding of the South America subset, drawn with the region colors.
tool.run_umap(n_neighbors=3, random_seed=333, subsample=True, min_dist=0.15)
tool.draw(
    colors=[region_colors[i] for i in imap_south_america],
    centroids_only=True,
    size=8,
    # distinct file name so that enabling this does not overwrite the PCA figure
    #outfile="./figures/Oreinotinus-SA-umap.svg",
);

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 11006 unlinked SNPs.[0m


### PCA Analysis: Full sampling

In [32]:
# PCA tool on every population in the imap (full sampling).
# Note minmap is the integer 1 here, unlike the 0.5 used for the regional subsets.
tool = ipa.pca(
    data=DATA,
    imap=imap,
    mincov=0.75,
    minmap=1,
    minmaf=0.02,
    cores=20,
    random_seed=123,
)

Establishing parallel cluster (20 cores)
[####################] 100% 0:00:15 | SNP filtering        | ipa |


[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30mfilter statistics:
samples                                          178
pre_filter_snps                              3419673
pre_filter_percent_missing                        63
filter_by_indels_present                           0
filter_by_non_biallelic                       193270
filter_by_mincov                             3357698
filter_by_minmap                             2552916
filter_by_invariant_after_subsampling         908359
filter_by_minor_allele_frequency             2130075
post_filter_snps                                5714
post_filter_snp_containing_linkage_blocks       1917
post_filter_percent_missing                       21
dtype: int64[0m
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 73.0%, 17.2%, 9.7%[0m


In [33]:
# Run the PCA; the first argument (50) is passed positionally as in the original.
tool.run(50, subsample=True, random_seed=333)

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 1917 unlinked SNPs.[0m


In [36]:
# Plot the full-sampling PCA; one color per population, in imap key order.
tool.draw(
    colors=[region_colors[pop] for pop in imap],
    size=8,
    cycle=11,
    centroids_only=True,
    #outfile="./figures/Oreinotinus-full.svg",
);

#### Full analysis using UMAP

In [39]:
# UMAP embedding of the full dataset, then the plot with region colors.
tool.run_umap(subsample=True, n_neighbors=55, random_seed=333)
tool.draw(
    colors=[region_colors[pop] for pop in imap],
    size=8,
    cycle=11,
    centroids_only=True,
    #outfile="./figures/Oreinotinus-full-umap.svg",
);

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 1917 unlinked SNPs.[0m


#### Full analysis using t-SNE

In [42]:
# t-SNE embedding of the full dataset, then the plot with region colors.
tool.run_tsne(subsample=True, perplexity=20, random_seed=333)
tool.draw(
    colors=[region_colors[pop] for pop in imap],
    size=8,
    cycle=11,
    centroids_only=True,
    #outfile="./figures/Oreinotinus-full-tsne.svg",
);

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 1917 unlinked SNPs.[0m


### Kmeans approach: imputation without a-priori assignments
This analysis is used to validate that the a-priori IMAP assignments of samples to populations do not significantly affect the results. Here the input IMAP is used only to select which samples are included in the analysis. First, SNPs are sampled allowing very little missing data (`kmeans_mincov_max=0.9`), a PCA is performed, and samples are assigned to populations by grouping into KMeans groups. This new imap grouping is then used in the next iteration to impute data by population with a lower mincov threshold, and the process continues over iterations (`kmeans_niters=5`) until we reach the target mincov (`mincov=0.75` here).

In [43]:
# PCA tool for the kmeans approach: the imap only selects which samples are
# included, and the three kmeans_* options configure the iterative
# clustering / imputation procedure described in the text above this cell.
tool = ipa.pca(
    data=DATA,
    imap=imap,
    mincov=0.75,
    minmap=1,
    minmaf=0.02,
    kmeans_clusters=4,
    kmeans_niters=5,
    kmeans_mincov_max=0.9,
    cores=20,
    random_seed=123,
)

Establishing parallel cluster (20 cores)
[####################] 100% 0:00:15 | SNP filtering        | ipa |


[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30mfilter statistics:
samples                                          178
pre_filter_snps                              3419673
pre_filter_percent_missing                        63
filter_by_indels_present                           0
filter_by_non_biallelic                       193270
filter_by_mincov                             3417640
filter_by_minmap                               23522
filter_by_invariant_after_subsampling         908359
filter_by_minor_allele_frequency                   0
post_filter_snps                                1271
post_filter_snp_containing_linkage_blocks        202
post_filter_percent_missing                        5
dtype: int64[0m
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 90.3%, 2.1%, 7.6%[0m
[1mipa[0m [37m|[0m

Establishing parallel cluster (20 cores)
[####################] 100% 0:00:15 | SNP filtering        | ipa |


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 93.7%, 2.3%, 3.9%[0m
[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 347 unlinked SNPs.[0m


Establishing parallel cluster (20 cores)
[####################] 100% 0:00:15 | SNP filtering        | ipa |


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 95.1%, 2.8%, 2.2%[0m
[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 918 unlinked SNPs.[0m


Establishing parallel cluster (20 cores)
[####################] 100% 0:00:15 | SNP filtering        | ipa |


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 94.7%, 3.4%, 1.8%[0m
[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 2052 unlinked SNPs.[0m


Establishing parallel cluster (20 cores)
[####################] 100% 0:00:15 | SNP filtering        | ipa |


[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30mfilter statistics:
samples                                          178
pre_filter_snps                              3419673
pre_filter_percent_missing                        63
filter_by_indels_present                           0
filter_by_non_biallelic                       193270
filter_by_mincov                             3357698
filter_by_minmap                              810648
filter_by_invariant_after_subsampling         908359
filter_by_minor_allele_frequency                   0
post_filter_snps                               41084
post_filter_snp_containing_linkage_blocks       4721
post_filter_percent_missing                       21
dtype: int64[0m
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
[1mipa[0m [37m|[0m [1msnps_imputer.py[0m [37m|[0m [30mImputation: sampled genotypes (0, 1, 2) = 94.1%, 3.9%, 1.9%[0m


In [44]:
# Single-replicate PCA on the kmeans-imputed data; all points are drawn
# because centroids_only is left disabled here.
tool.run(random_seed=123, nreplicates=1)
tool.draw(
    colors=[region_colors[pop] for pop in imap],
    #centroids_only=True,
    cycle=11,
    size=8,
);

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 4721 unlinked SNPs.[0m


In [49]:
# UMAP embedding of the kmeans-imputed data, then the plot with region colors.
tool.run_umap(subsample=True, n_neighbors=85, random_seed=333)
tool.draw(
    colors=[region_colors[pop] for pop in imap],
    size=8,
    cycle=11,
    centroids_only=True,
);

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 4721 unlinked SNPs.[0m


In [50]:
# t-SNE embedding of the kmeans-imputed data, then the plot with region colors.
tool.run_tsne(subsample=True, perplexity=20, random_seed=333)
tool.draw(
    colors=[region_colors[pop] for pop in imap],
    size=8,
    cycle=11,
    centroids_only=True,
);

[1mipa[0m [37m|[0m [1msnps_extracter.py[0m [37m|[0m [30msubsampled 4721 unlinked SNPs.[0m
