### Introduction
Script plots a PCA for the observed data, alongside a PCA using simulated data, utilising the medians of the priors.

### Imports
All imports occur here

In [1]:
import pandas as pd
import sim.model
from sim import sum_stats as ss
import time
import tskit
import msprime
import allel
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

### Choose parameters
Choose some parameters so it runs relatively quickly:

In [2]:
seq_features = sim.model.SeqFeatures(length=int(20e6), recombination_rate=1.8e-8, mutation_rate=6e-8)

slim_parameters = {
    'pop_size_domestic_1': 200,  # Population sizes are diploid.
    'pop_size_wild_1': 200,
    'pop_size_captive': 70,
    'mig_rate_captive': 0.005,  # Migration from wild -> captive
    'mig_length_wild': 34,
    'mig_rate_wild': 0.008,  # Rate of migration from domestic -> wildcats
    'captive_time': 28,  # Time captive population established in SLiM
    }

recapitate_parameters = {
        'pop_size_domestic_2': 1000,
        'pop_size_wild_2': 1000,
        'div_time': 30000,
        'mig_rate_post_split': 0.005,
        'mig_length_post_split': 5000,
        'bottleneck_time_wild': 3000,
        'bottleneck_strength_wild': 20000,
        'bottleneck_time_domestic': 3000,
        'bottleneck_strength_domestic': 20000,
    }

### Run simulation

In [3]:
start_time = time.time()

# Run model
s = sim.model.WildcatSimulation(seq_features=seq_features, random_seed=42)
command = s.slim_command(slim_parameters)
decap_trees = s.run_slim(command)
demographic_events = s.demographic_model(**recapitate_parameters)
tree_seq = s.recapitate(decap_trees, demographic_events)

# Print out useful bits and bobs
print("Simulation finished in {:.2f} s".format(time.time()-start_time))
print("Command ran: {}".format(command))
# tree_seq.slim_provenance.model_type = "WF"

Simulation finished in 1.03 s
Command ran: slim -d pop_size_domestic_1=200 -d pop_size_wild_1=200 -d pop_size_captive=70 -d mig_rate_captive=0.005 -d mig_length_wild=34 -d mig_rate_wild=0.008 -d captive_time=28 -d length=20000000 -d recombination_rate=1.8e-08  -d decap_trees_filename='"../output/decap_42.trees"' -s 40 slim_model.slim


### Sample population

In [4]:
samples = s.sample_nodes(tree_seq, [5, 30, 10])  # Match number of samples to the WGS data
tree_seq = tree_seq.simplify(samples=samples)
data = sim.model.collate_results(tree_seq)

### Calculate r2

In [5]:
ss.r2(np.array([1,2,3]),
     np.array([
         [1,2,3],
         [3,4,5],
         [5,6,6],
     ]))

array([1.  , 1.  , 0.75])

In [11]:
np.corrcoef(
    np.array([1,2,3]),
     np.array([
         [1,2,3],
         [3,4,5],
         [5,6,6],
     ])
)

array([1.  , 1.  , 0.75])

In [38]:
%%time

df1 = ss.r2_data(data.genotypes["all_pops"], data.positions,
               data.seq_length, bins = [0, 1e6, 2e6], labels = ["0_1", "1_2"])

Wall time: 5.7 s


In [39]:
%%time
df2 = ss.r2_new(data.genotypes["all_pops"], data.positions,
               data.seq_length, bins = [0, 1e6, 2e6], labels = ["0_1", "1_2"], comparison_mut_lim=500)

Wall time: 1.1 s


In [40]:
df1

Unnamed: 0,index,pos,dist,bins,r2
0,46016,1.193763e+07,6.146044e+05,0_1,0.690551
1,47188,1.224340e+07,9.203696e+05,0_1,0.680277
2,46157,1.197394e+07,6.509078e+05,0_1,0.638074
3,46308,1.201250e+07,6.894664e+05,0_1,0.638074
4,43970,1.138503e+07,6.200189e+04,0_1,1.000000
...,...,...,...,...,...
995,48106,1.246518e+07,1.395344e+06,1_2,0.623714
996,48461,1.255044e+07,1.480605e+06,1_2,0.623714
997,50152,1.297489e+07,1.905052e+06,1_2,0.388396
998,49044,1.270153e+07,1.631696e+06,1_2,0.530685


In [46]:
df2["dist"].max()

1999966.5014830604

In [30]:
r2_stats = df.groupby("bins")["r2"].agg([np.median, np.std]).add_prefix("r2_").to_dict()  # Just use more bins not iqr

{'r2_median': {'0_1': 0.8460380040271407, '1_2': 0.6050091994660701},
 'r2_std': {'0_1': 0.35371726764621453, '1_2': 0.2796929859440968}}

In [11]:
data.genotypes["all_pops"]

Unnamed: 0,index,pos,dist,bins,r2
0,10997,2.882448e+06,3.227119e+05,0_1,19.407141
1,12060,3.162462e+06,6.027266e+05,0_1,14.529696
2,11402,2.998494e+06,4.387580e+05,0_1,19.407141
3,10418,2.716303e+06,1.565676e+05,0_1,0.011494
4,12900,3.385033e+06,8.252977e+05,0_1,1.985512
...,...,...,...,...,...
1995,48054,1.244914e+07,1.686131e+06,1_2,2.133966
1996,49195,1.273361e+07,1.970601e+06,1_2,2.262104
1997,47566,1.232502e+07,1.562010e+06,1_2,2.133966
1998,46715,1.212066e+07,1.357654e+06,1_2,17.822370


In [82]:
g = data.genotypes["domestic"].compress(data.allele_counts["domestic"].is_segregating())
g[614:618]

Unnamed: 0,0,1,2,3,4
0,0/1,0/1,1/0,0/1,0/0
1,1/0,1/0,0/1,1/0,1/0
2,1/0,1/0,0/1,1/0,1/0
3,0/0,0/0,0/0,0/0,1/0


In [75]:
genotypes = data.genotypes["domestic"][data.allele_counts["domestic"].is_segregating()].to_n_alt()
genotypes.shape

(2764, 5)

In [56]:
x = np.array([0, 1, 0, 0, 0])
y = np.array([
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1],
     ])

ss.r2(x,y)

array([0.0625, 0.0625])

In [84]:
r2 = ss.r2(genotypes[0], genotypes[:613])
r2[~np.isnan(r2)].max()

2

In [109]:
genotypes_012 = genotypes

(array([False,  True]), array([2737,   27], dtype=int64))

In [155]:
def r2(x, y):
    n = len(x)
    num = n * np.inner(x, y) - np.sum(x) * np.sum(y, axis=1)
    den = np.sqrt(n * np.sum(x ** 2) - np.sum(x) ** 2) * np.sqrt(n * np.sum(y ** 2, axis=1) - np.sum(y, axis=1) ** 2)
    r2 = (num / den) ** 2
    return r2

In [158]:
r2(np.array([0, 0, 0, 1, 0]), np.array([[2, 2, 2, 1, 1]]))

array([2.04166667])

In [43]:
y-y.mean(axis=1)

ValueError: operands could not be broadcast together with shapes (2,4) (2,) 

In [7]:
%%time
df = ss.r2_new(data.genotypes["all_pops"], data.positions, data.allele_counts["all_pops"],
               data.seq_length, bins = [0, 1e6, 2e6], labels = ["0_1", "1_2"], comparison_mut_lim=1000)

Wall time: 1.01 s


In [18]:
ss.r2(np.array([3,2,1]), np.array([[3,2,1]]))

array([1.])

In [30]:
x = np.array([1, 2 , 3, 4])
y = np.sqrt(np.arange(12).reshape(3, 4))

In [31]:
r2_fast(x, y)

array([0.92461613, 0.99828357, 0.99943905])

In [32]:
ss.r2(x, y)

array([0.92461613, 0.99828357, 0.99943905])

In [None]:
# We will use rogers huff, (it's simple and should be faster to calculate)

# Params

genotypes_012 = data.genotypes["domestic"].to_n_alt()




In [9]:
"""Takes a scikit.allel.GenotypeArray and returns a df of r2 values for different bins.
See r2_stats for more info."""
haplotypes = genotypes.to_haplotypes()
iterate_length = bins[-1]
df_list = []
# Find max index to avoid choosing focal mutation at end of chromosome
max_idx = np.where(pos > seq_length - iterate_length)[0].min()
for i in range(0, n_focal_muts):
    focal_mut_idx = np.random.randint(0, max_idx)
    focal_mut_pos = pos[focal_mut_idx]

    next_muts_idx = np.where(np.logical_and(pos > focal_mut_pos, pos < focal_mut_pos + iterate_length))[0]

    df_i = pd.DataFrame({
        "index": next_muts_idx,
        "pos": pos[next_muts_idx],
    })
    df_i["dist"] = df_i["pos"] - focal_mut_pos
    df_i["bins"] = pd.cut(df_i["dist"], bins, labels=labels)

    df_i = df_i.groupby("bins").apply(
        lambda x: x.sample(n_iter_muts) if len(x) > n_iter_muts else x).reset_index(drop=True)

    df_i["r2"] = df_i["index"].apply(
        lambda x: pearsonr(haplotypes[focal_mut_idx], haplotypes[x])[0]**2)

    df_list.append(df_i)

results = pd.concat(df_list)

return results

NameError: name 'genotypes' is not defined

In [None]:
r2

### Check tsinfer ancestral state irrelevant toy example

In [None]:
import joblib
joblib.dump(np.array(genotypes), "../output/test_genotypes.joblib")
joblib.dump(pos, "../output/test_pos.joblib")

In [None]:
df = pd.DataFrame({"AB": [1,2,3], "AC": [5,4,5], "CC": [5,7,5]})

In [None]:
[col for col in list(df) if "A" in col]

In [None]:
positions = np.loadtxt("../data/e3.012.pos", delimiter="\t", usecols=1)
genotypes = np.loadtxt("../data/e3.012", delimiter="\t", usecols=range(1, len(positions)+1))
genotypes = genotypes.T
assert len(positions) == genotypes.shape[0]

# For now just assume that missings are ancestral
genotypes[genotypes == -1] = 0


In [None]:
# Cam read in with scikit allel but genotypes looks dodge
callset = allel.read_vcf("../data/e3.vcf")
pos = callset["variants/POS"]
genotypes = allel.GenotypeArray(callset["calldata/GT"])

In [None]:
callset["samples"][0:3]

In [None]:
def pca_pipeline(genotypes, pos, pop_list):
    genotypes, pos = ss.maf_filter(genotypes, pos)
    genotypes = genotypes.to_n_alt()  # 012 with ind as cols
    genotypes, pos = ss.ld_prune(genotypes, pos)
    pca_stats = ss.pca_stats(genotypes, pop_list)
    return pca_stats

In [None]:
sample_info = pd.read_csv("../data/e3_sample_info.csv", usecols=["NAME", "SOURCE"])

# Ensure that individuals are in same order (after 012 conversion)
assert np.all(sample_info["NAME"] == np.loadtxt("../data/e3.012.indv", dtype=str))

pca_pipeline(genotypes, pos, sample_info["SOURCE"].to_list())

In [None]:
import tsinfer

with tsinfer.SampleData(sequence_length=6) as sample_data:
    sample_data.add_site(0, [0, 1, 0, 0, 0], ["A", "T"])
    sample_data.add_site(1, [0, 0, 0, 1, 1], ["G", "C"])
    sample_data.add_site(2, [0, 1, 1, 0, 0], ["C", "A"])
    sample_data.add_site(3, [0, 1, 1, 0, 0], ["G", "C"])
    sample_data.add_site(4, [0, 0, 0, 1, 1], ["A", "C"])
    sample_data.add_site(5, [0, 1, 2, 0, 0], ["T", "G", "C"])

### Calculate summary statistics

In [None]:
def pca_pipeline(genotypes, pos, pop_list):
    genotypes, pos = ss.maf_filter(genotypes, pos)
    genotypes = genotypes.to_n_alt()  # 012 with ind as cols
    genotypes, pos = ss.ld_prune(genotypes, pos)
    pca_stats = ss.pca_stats(genotypes, pop_list)
    return pca_stats

In [None]:
summary_functions = [
    ss.tskit_stats(tree_seq, samples),
    ss.afs_stats(tree_seq, samples),
    ss.r2_stats(tree_seq, samples, [0, 1e6, 2e6, 4e6], labels=["0_1Mb", "1_2Mb", "2_4MB"]),
    ss.roh_stats(genotypes, pos, pop_list, seq_features.length),
    pca_pipeline(genotypes, pos, pop_list),
]

stats_dict = {"random_seed": sim.random_seed}  # Random seed acts as ID

for func in summary_functions:
    stat = func
    stats_dict = {**stats_dict, **stat}

In [None]:
stats_dict

### Caluculate summary statistics

In [None]:
samples = sim.sample_nodes(tree_seq, [4, 45, 46])  # Match number of samples to the WGS data
tree_seq = tree_seq.simplify(samples=np.concatenate(samples))

# Calculate summary statistics
def pca_pipeline(genotypes, pos):
    genotypes, pos = ss.maf_filter(genotypes, pos)
    genotypes = genotypes.to_n_alt()  # 012 with ind as cols
    genotypes, pos = ss.ld_prune(genotypes, pos)
    pca_stats = ss.pca_stats(genotypes)
    return pca_stats

genotypes = ss.genotypes(tree_seq)  # scikit-allel format
pos = ss.positions(tree_seq)

# Using a list to call function in for loop so we can use try/except (in case any functions fail)
summary_functions = [
    sum_stats.tskit_stats(),
    sum_stats.afs_stats(),
    sum_stats.r2_stats(),
    sum_stats.roh_stats(genotypes, pos),
    pca_pipeline(genotypes, pos),
]

stats_dict = {"random_seed": sim.random_seed}  # Random seed acts as ID

for func in summary_functions:
    stat = func
    stats_dict = {**stats_dict, **stat}

In [None]:
coverage_stats = {'domestic_roh_cov_median': 0.9996666, 'domestic_roh_cov_iqr': 0.00022224444444440827, 'wild_roh_cov_median': 0.6211887222222223, 'wild_roh_cov_iqr': 0.19288330555555555, 'captive_roh_cov_median': 0.8555888222222222, 'captive_roh_cov_iqr': 0.10507222222222223}
coverage_stats["all_pops_roh_cov_median"] = np.median()

In [None]:
np.all(prior_df[["mig_rate_wild", "mig_rate_post_split"]]<=1) & np.all(prior_df[["mig_rate_wild", "mig_rate_post_split"]]>=0)

### Calculate ROH

Below seems fine but we should probably filter minor alleles. With this a single mutation breaks a ROH... I think that is ok? Presumably v. informative for PODs? Can do once with singletons once without?

In [169]:
ss.

727185
81996


array([2, 6])

In [179]:
roh(genotypes)

array([126087.72165816,  39576.01810196,  24097.46138472, ...,
         2194.49066955,      0.        ,    853.37625222])

In [280]:
np.partition([3,2,1], 1)[1]

2

KeyError: 'Column not found: position'

Unnamed: 0_level_0,Unnamed: 1_level_0,position,position
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,amax
individual,roh_id,Unnamed: 2_level_2,Unnamed: 3_level_2
0,0,6.733423e+01,1.261551e+05
0,1,1.263148e+05,1.658908e+05
0,2,1.660940e+05,1.901914e+05
0,3,1.910586e+05,4.473508e+05
0,4,4.483197e+05,4.484538e+05
...,...,...,...
29,33043,1.999142e+07,1.999220e+07
29,33044,1.999237e+07,1.999526e+07
29,33045,1.999535e+07,1.999754e+07
29,33046,1.999773e+07,1.999773e+07


In [91]:
%%time
df.groupby('individual')['position'].diff(1)  # First value na as nothing to compare to

Wall time: 227 ms


0                 NaN
1           41.086620
2          189.024970
3          170.155704
4          253.787280
              ...    
2314285    191.943543
2314286     64.658906
2314287     29.507752
2314288     94.270981
2314289    729.597519
Name: position, Length: 2314290, dtype: float64

In [96]:
df.groupby(['individual')['position'].diff(1)  # First value na as nothing to compare to

(77143,)

In [None]:

df = roh_id.melt(id_vars="position", var_name="individual", value_name="roh_id")

df["roh_length"] = df.groupby('individual')['position'].diff(1)
df = df.dropna()  # Drops first "ROH" (as no previous heterozygote)
df = df.drop(columns = ["position", "roh_id"])
# pd.DataFrame({"population": sum_stats.individual_pop_list, "id": range(0, len(sum_stats.individual_pop_list))})

### PCA

#### Take a sample and get the genotypes

In [None]:
samples = sim.sample_nodes(tree_seq, [4, 45, 46])  # Match number of samples to the SNP data
tree_seq = tree_seq.simplify(samples=np.concatenate(samples))

In [None]:
genotypes = ss.genotypes(tree_seq)
pos = ss.positions(tree_seq)
pop_list = ss.pop_list(tree_seq)
samples = ss.sampled_nodes(tree_seq)

#### Check for LD

In [None]:
def plot_ld(gn, title):
    m = allel.rogers_huff_r(gn) ** 2
    ax = allel.plot_pairwise_ld(m)
    ax.set_title(title)

In [None]:
plot_ld(genotypes[:1000].to_n_alt(), 'Pairwise LD.')

#### Filter singletons and SNPs in LD
SNPs in LD can bias PCA.

In [None]:
genotypes, pos = ss.maf_filter(genotypes, pos)
genotypes, pos = ss.ld_prune(genotypes.to_n_alt(), pos)

In [None]:
plot_ld(genotypes[:1000], 'Pairwise LD after pruning')

### Plot both

In [None]:
sample_population = np.asarray(["domestic"]*4 + ["wild"]*45 + ["captive"]*46)
populations = ["domestic", "wild", "captive"]
pop_colours = ["#FF0000", "#FFA500", "#0000FF"]

# Simulated data pca
coords, model = allel.pca(genotypes, n_components=10, scaler='patterson')
sim_df = pd.DataFrame({"pc1": coords[:, 0], "pc2": coords[:, 1],
                       "population": sample_population, "simulated_or_observed": "simulated"})

# Real data pca
real_genotypes = np.loadtxt("../data/snps.012", delimiter=" ", skiprows=1)
real_genotypes = real_genotypes[:,1:].transpose()  # Get rid of index and convert individuals to columns
coords, model = allel.pca(real_genotypes, n_components=2, scaler='patterson')
real_df = pd.DataFrame({"pc1": coords[:, 0], "pc2": coords[:, 1],
                   "population": sample_population, "simulated_or_observed": "observed"})

# Combined data
combined_df = sim_df.append(real_df)

In [None]:
sns.set(style='darkgrid', font_scale=1.3)

g = sns.relplot(x="pc1", y="pc2",
                row="simulated_or_observed", hue="population",
                kind="scatter", data=combined_df,
                facet_kws=dict(sharex=False, sharey=False),
                aspect=1.4)

axes = g.axes.flatten()
axes[0].set_title("Simulated")
axes[1].set_title("Observed")

#g.savefig("../plots/simulated_vs_observed_pca.png", dpi=600)

### Scaling summary statistics

In [None]:
stats = pd.read_csv("../output/summary_stats.csv")
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_sum_stats = scaler.fit_transform(stats)
scaler.inverse_transform(scaled_sum_stats)