## simcat Analysis demo

In [1]:
# conda install multicore-tsne -c conda-forge
# conda install toyplot -c eaton-lab

In [2]:
import simcat
import toytree
import toyplot
import toyplot.svg
import numpy as np
from MulticoreTSNE import MulticoreTSNE as TSNE

### Load Database results

#### Set labels to unordered labels

In [9]:
# Baseline setup: unordered labels, with no masking or exclusion options passed.
ml1 = simcat.Analysis(name="test-111", workdir="./databases", ulabel=True)

subset data to: 1056 tests
Dataset: test-111
loaded counts matrix: (1056, 5, 16, 16)
scaled integers to floats by max count
reshaped into X: (1056, 1280)
loaded labels DataFrame: (1056, 6)
subset as y: (1056,)
added 5 features from abba-baba
split train/test data: (707, 1285)/(349, 1285)


#### Mask (set label to NaN) low migration and/or sister edges

In [19]:
# Masking setup: per the section heading, low-migration and/or sister edges
# keep their rows but have their label set to NaN.
ml2 = simcat.Analysis(
    workdir="./databases",
    name="test-111",
    ulabel=True,
    mask_sisters=True,
    mask_admixture_min=0.02,
)

subset data to: 1056 tests
Dataset: test-111
loaded counts matrix: (1056, 5, 16, 16)
scaled integers to floats by max count
reshaped into X: (1056, 1280)
loaded labels DataFrame: (1056, 6)
subset as y: (1056,)
added 5 features from abba-baba
split train/test data: (707, 1285)/(349, 1285)


#### Exclude low migration and/or sisters

In [20]:
# Exclusion setup: the exclude_* options are passed instead of the mask_* ones.
# NOTE(review): ulabel is not passed here, unlike the other three Analysis
# objects in this notebook -- confirm that is intentional.
ml3 = simcat.Analysis(
    workdir="./databases",
    name="test-111",
    exclude_sisters=True,
    exclude_admixture_min=0.05,
)

subset data to: 648 tests
Dataset: test-111
loaded counts matrix: (648, 5, 16, 16)
scaled integers to floats by max count
reshaped into X: (648, 1280)
loaded labels DataFrame: (648, 6)
subset as y: (648,)
added 5 features from abba-baba
split train/test data: (434, 1285)/(214, 1285)


#### Other setups

In [21]:
# Stricter exclusion setup: higher admixture threshold (0.1), sisters
# excluded, unordered labels.
ml4 = simcat.Analysis(
    workdir="./databases",
    name="test-111",
    ulabel=True,
    exclude_sisters=True,
    exclude_admixture_min=0.1,
)

subset data to: 576 tests
Dataset: test-111
loaded counts matrix: (576, 5, 16, 16)
scaled integers to floats by max count
reshaped into X: (576, 1280)
loaded labels DataFrame: (576, 6)
subset as y: (576,)
added 5 features from abba-baba
split train/test data: (385, 1285)/(191, 1285)


### Fit Classifier
This fits an ExtraTrees classifier that tries to classify the placement of edges on the tree. The reported score is a crude right/wrong answer; we may also want to report how far off each incorrectly placed edge is from its true position.

In [22]:
# Fit the classifier for the ml1 setup (unordered labels, nothing masked or
# excluded); the training and test/validation scores are printed below.
ml1.train_model()

training ExtraTrees model...
model score on training set: 1.000
model score on test/validation set: 0.751


In [23]:
# Fit the classifier for the ml2 setup (masked low-admixture / sister labels);
# the training and test/validation scores are printed below.
ml2.train_model()

training ExtraTrees model...
model score on training set: 1.000
model score on test/validation set: 0.722


In [24]:
# Fit the classifier for the ml3 setup (exclude_admixture_min=0.05, sisters
# excluded); the training and test/validation scores are printed below.
ml3.train_model()

training ExtraTrees model...
model score on training set: 1.000
model score on test/validation set: 0.860


In [25]:
# Fit the classifier for the ml4 setup (exclude_admixture_min=0.1, sisters
# excluded, unordered labels); the scores are printed below.
ml4.train_model()

training ExtraTrees model...
model score on training set: 1.000
model score on test/validation set: 0.979


### Visualize features

In [94]:
from sklearn.decomposition import PCA

In [95]:
# NOTE(review): `ml` was never assigned in any visible cell (only ml1..ml4
# exist), so this section raised NameError on Restart & Run All. ml2 is bound
# here because it matches the saved outputs (1056 tests, NaN labels among the
# non-sister rows) -- confirm this is the Analysis object originally used.
ml = ml2

# Rotate the feature matrix onto its principal components. n_components is
# left at its default, so no explicit dimensionality cut is applied here.
pX = PCA().fit_transform(ml.X)
pX.shape

(1056, 1056)

In [96]:
# Configure the multicore t-SNE embedder with a random start and a fixed seed.
tsne = TSNE(
    n_jobs=8,
    n_iter=20000,
    perplexity=50,
    init="random",
    random_state=123,
)

In [97]:
# Embed the PCA-transformed features (pX) rather than the raw ml.X matrix.
embedding = tsne.fit_transform(pX)

In [98]:
# greys = toyplot.color.brewer.palette("Greys", 3)
# toyplot.color.to_css(greys[1]), toyplot.color.to_css(greys[0])

In [99]:
# spec = toyplot.color.brewer.palette("Spectral", count=11)
# spec

In [103]:
# Brewer "Paired" palette; the style-dictionary cell further down pairs one
# colour from it with each label. The bare name displays the palette swatch.
colors = toyplot.color.brewer.palette("Paired")
colors

In [110]:
# Preview the unordered labels of the rows whose `sisters` flag is 0.
ml.df.loc[ml.df.sisters == 0, "ulabel"]

3       NaN
4       NaN
5       NaN
6       NaN
7       NaN
       ... 
1048    0,3
1049    0,3
1050    0,2
1051    0,2
1052    0,2
Name: ulabel, Length: 792, dtype: object

In [112]:
# Marker styles keyed by edge label. The "NaN" entry is the grey style used
# for points without a usable label.
cdict = {
    "NaN": {
        "fill": 'rgba(74.1%,74.1%,74.1%,1.000)',
        "stroke": "none",
    },
}

# Unordered labels of the non-sister rows. Only genuine string labels are
# kept: this drops the "NaN" placeholder and also any float NaN, which would
# otherwise fail on .split(","). Sorting makes the label -> colour assignment
# reproducible; iterating a set of strings changes order between kernel
# sessions, so colours used to shuffle from run to run.
labels = sorted(
    label
    for label in set(ml.df.ulabel[ml.df.sisters == 0])
    if isinstance(label, str) and label != "NaN"
)

# NOTE: zip() stops at the shorter input, so labels beyond the palette length
# get no style entry.
for label, color in zip(labels, colors):
    css_color = toyplot.color.to_css(color)

    # direction "->": filled marker with no outline
    cdict[label] = {
        "stroke": "none",
        "fill": css_color,
    }

    # direction "<-" (the two parts of the label swapped): hollow white
    # marker outlined in the same colour
    alt = "{1},{0}".format(*label.split(","))
    cdict[alt] = {
        "stroke": css_color,
        "stroke-width": 1.5,
        "fill": 'rgba(100%,100%,100%,1.000)',
    }

In [118]:
# Build one toyplot marker per row of ml.df, in index order.
# The maximum aprop is loop-invariant, so it is computed once here instead of
# being recomputed for every row.
max_aprop = ml.df.aprop.max()

markers = []
for idx in ml.df.index:
    if ml.df.sisters[idx]:
        # sister rows: small fixed-size marker in the grey "NaN" style
        mark = toyplot.marker.create(
            shape="o",
            size=3,
            mstyle=cdict["NaN"],
        )
    else:
        # other rows: styled by their label, with size growing from 3 to 13
        # as the row's aprop approaches the maximum
        mark = toyplot.marker.create(
            shape="o",
            size=3 + (ml.df.aprop[idx] / max_aprop) * 10,
            mstyle=cdict[ml.df.label[idx]],
        )
    markers.append(mark)

In [121]:
# Two-panel figure: t-SNE scatter on the left axes, tree on the right axes.
canvas = toyplot.Canvas(width=800, height=400)
ax0 = canvas.cartesian(bounds=(50, 350, 70, 330))
ax1 = canvas.cartesian(bounds=(450, 700, 100, 300))

# left panel: one point per row, styled by the prebuilt markers and titled
# with the row's label
ax0.scatterplot(
    embedding[:, 0],
    embedding[:, 1],
    title=ml.df.label,
    marker=markers,
)
ax0.x.label.text = "t-SNE axis 1"
ax0.y.label.text = "t-SNE axis 2"

# right panel: draw the tree onto ax1, then hide that panel's y axis
tre = toytree.tree(ml.tree)
tre.draw(axes=ax1, ts='c', node_sizes=18, node_colors="lightgrey")
ax1.y.show = False

# save image
#toyplot.svg.render(canvas, "./tsne-imb5-snps10K-11tests-withsis2.svg")

In [193]:
# Standalone versions of the two panels: the t-SNE scatter on its own
# 400x400 canvas, then the tree drawn separately.
toyplot.scatterplot(
    embedding[:, 0], embedding[:, 1],
    marker=markers,
    title=ml.df.label,
    width=400,
    height=400,
);
toytree.tree(ml.tree).draw(ts='c');

In [140]:
# NOTE(review): this cell repeats the standalone scatterplot + tree drawing of
# the preceding cell with the same arguments; it looks like a leftover and is
# a candidate for removal -- confirm before deleting.
toyplot.scatterplot(
    embedding[:, 0],
    embedding[:, 1],
    width=400, height=400,
    marker=markers,
    title=ml.df.label,
);
toytree.tree(ml.tree).draw(ts='c');