In [1]:
import os
# Run the notebook from the project root so that relative paths used further
# down (e.g. 'data/...') resolve; raises KeyError if PROJECT_ROOT is not set.
os.chdir(os.environ['PROJECT_ROOT'])

# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

# Show up to 100 columns / rows when a DataFrame is displayed.
# The full option keys are used on purpose: abbreviated patterns such as
# 'max.columns' are resolved by regex and become ambiguous on pandas versions
# that also register 'styler.render.max_columns' / 'styler.render.max_rows',
# where they raise "OptionError: Pattern matched multiple keys".
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Genotype -> geolocation

In [3]:
import os
import scipy.stats as st
import folium
from geolocation.config import TARGET_AREA
from folium.plugins import HeatMap
    
import matplotlib.pyplot as plt
import numpy as np

def plot_data(data, c=False):
    """Render ``data`` as a heat-map layer on a folium map.

    Args:
        data: points handed to ``HeatMap``.
        c: when truthy, ``data`` is first reduced to its 'lat' and 'long'
            columns.

    Returns:
        The ``folium.Map`` (location [54, 28], zoom 6, 'stamentoner' tiles)
        with the heat-map layer added.
    """
    heat_points = data[['lat', 'long']] if c else data
    base_map = folium.Map([54, 28], tiles='stamentoner', zoom_start=6)
    HeatMap(heat_points, radius=10).add_to(base_map)
    return base_map

def plot_results(data, point, dt):
    """Heat map of ``data`` with a circle drawn around ``point``.

    Args:
        data: points forwarded to ``plot_data``.
        point: frame with 'lat' and 'long' columns; each of its rows is
            unpacked as a positional argument of ``folium.Circle``.
        dt: threshold that scales the circle radius.

    Returns:
        The folium map produced by ``plot_data`` with the circle added.
    """
    result_map = plot_data(data)
    lat_long_rows = point[['lat', 'long']].values
    # NOTE(review): 1.1e5 presumably converts a threshold in degrees into
    # metres (~110 km per degree) — confirm.
    threshold_circle = folium.Circle(*lat_long_rows, radius=1.1e5 * dt)
    result_map.add_child(threshold_circle)
    return result_map

In [4]:
from geolocation.config import AVAILABLE_LOCI, DATASETS

# Read the train / validation / test splits from the paths registered in DATASETS.
X_train, X_val, X_test = (
    pd.read_excel(DATASETS[split]) for split in ('train', 'val', 'test')
)

# Shapes of the three splits, in the same order.
tuple(frame.shape for frame in (X_train, X_val, X_test))

((1169, 41), (62, 41), (65, 41))

In [5]:
from geolocation.geolocation import Geolocator
from geolocation.distance import D
from geolocation.metrics import collect_q_metrics, collect_test_metrics
from geolocation.candidates_ranking import RankingGeneratorGDLinkage

In [None]:
# Locator built on the training split with only the 'grid' candidates
# generator configured; no ranking generator is set in this cell.
locator = Geolocator(X_train).set_candidates_generator('grid', grid_size=20)

# Bayesian optimization

In [None]:
# Interactive IPython source lookup for collect_q_metrics; leftover from
# development and not needed when the notebook is run top to bottom.
??collect_q_metrics

In [None]:
from functools import partial

def F(train, test, dt, k, **params):
    """Objective used to tune the 'gdlink' ranking generator.

    Builds a ``Geolocator`` on ``train`` with the 'grid' candidates generator
    (``grid_size=20``) and the 'gdlink' ranking generator configured by
    ``params``, locates every sample of ``test`` and returns a single metric.

    Args:
        train: data the locator is built on.
        test: samples to locate and score.
        dt: forwarded to ``collect_test_metrics``; also part of the metric key.
        k: forwarded to ``batch_locate`` and ``collect_test_metrics``; also
            part of the metric key.
        **params: keyword arguments for the 'gdlink' ranking generator.

    Returns:
        The ``hit_count@{k}_{dt}`` entry of the collected test metrics.
    """
    gdlink_locator = Geolocator(train).set_candidates_generator('grid', grid_size=20)
    gdlink_locator = gdlink_locator.set_ranking_generator('gdlink', **params)
    located = gdlink_locator.batch_locate(test, k)
    metrics = collect_test_metrics(dt, k, test, located)
    metric_name = f'hit_count@{k}_{dt}'
    return metrics[metric_name]

def get_F(train, test, dt, k):
    """Bind the data and evaluation settings of ``F``.

    Returns:
        A ``functools.partial`` of ``F`` with ``train``, ``test``, ``dt`` and
        ``k`` fixed as keyword arguments, so that only the ranking parameters
        remain to be supplied by the caller.
    """
    fixed_arguments = dict(train=train, test=test, dt=dt, k=k)
    return partial(F, **fixed_arguments)

In [None]:
from bayes_opt import BayesianOptimization
from functools import partial
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

**Estimate thresholds**

In [6]:
from geolocation.geolocation import Geolocator

# Three reference locators, all built on the training split.

# 'grid' candidates (grid_size=20) ranked by the 'random' generator.
locator_r = Geolocator(X_train).set_candidates_generator('grid', grid_size=20)
locator_r = locator_r.set_ranking_generator('random')

# 'f' candidates ranked by the 'best_match' generator.
locator_bm = Geolocator(X_train).set_candidates_generator('f')
locator_bm = locator_bm.set_ranking_generator('best_match')

# 'f' candidates ranked by the 'greedy' generator.
locator_g = Geolocator(X_train).set_candidates_generator('f')
locator_g = locator_g.set_ranking_generator('greedy')

In [296]:
# Sweep every (dt, k) combination and show the validation metrics of the
# three reference locators side by side: row 0 = random, 1 = best_match,
# 2 = greedy.
for dt in [0.25, 0.5, 1, 2]:
    for k in [1, 5, 10, 20, 30]:
        print(dt, k)
        bc_r = locator_r.batch_locate(X_val, k)
        bc_bm = locator_bm.batch_locate(X_val, k)
        bc_g = locator_g.batch_locate(X_val, k)
        # One transposed single-row frame per locator, stacked in a fixed order.
        metric_rows = [
            pd.DataFrame(collect_test_metrics(dt, k, X_val, candidates)).T
            for candidates in (bc_r, bc_bm, bc_g)
        ]
        results = pd.concat(metric_rows).reset_index(drop=True)
        display(results)

0.25 1


Unnamed: 0,hit@1_0.25,hit_count@1_0.25,min_dist@1_0.25,median_dist@1_0.25:,compactness@1_0.25,coverage,mean_c_repeats
0,0.0,0.0,3.678854,3.678854,0.0,58.0,1.068966
1,0.112903,0.112903,2.697057,2.697057,0.0,49.0,1.265306
2,0.080645,0.080645,2.747747,2.747747,0.0,31.0,2.0


0.25 5


Unnamed: 0,hit@5_0.25,hit_count@5_0.25,min_dist@5_0.25,median_dist@5_0.25:,compactness@5_0.25,coverage,mean_c_repeats
0,0.032258,0.032258,1.542957,3.43492,2.6078,219.0,1.415525
1,0.112903,0.516129,2.624528,2.708826,0.090546,190.0,1.631579
2,0.290323,0.322581,1.051528,2.773751,1.908214,160.0,1.9375


0.25 10


Unnamed: 0,hit@10_0.25,hit_count@10_0.25,min_dist@10_0.25,median_dist@10_0.25:,compactness@10_0.25,coverage,mean_c_repeats
0,0.032258,0.032258,1.1285,3.427362,2.66457,315.0,1.968254
1,0.112903,1.0,2.533378,2.722187,0.148163,278.0,2.230216
2,0.435484,0.5,0.594743,2.913447,2.218704,252.0,2.460317


0.25 20


Unnamed: 0,hit@20_0.25,hit_count@20_0.25,min_dist@20_0.25,median_dist@20_0.25:,compactness@20_0.25,coverage,mean_c_repeats
0,0.096774,0.096774,0.694131,3.303913,2.691321,379.0,3.271768
1,0.129032,2.064516,2.354036,2.718883,0.230513,348.0,3.563218
2,0.66129,0.983871,0.254887,2.932963,2.223539,349.0,3.553009


0.25 30


Unnamed: 0,hit@30_0.25,hit_count@30_0.25,min_dist@30_0.25,median_dist@30_0.25:,compactness@30_0.25,coverage,mean_c_repeats
0,0.080645,0.080645,0.569096,3.292612,2.711699,397.0,4.685139
1,0.129032,3.112903,2.148231,2.702412,0.329313,385.0,4.831169
2,0.790323,1.483871,0.160619,2.924541,2.233738,377.0,4.933687


0.5 1


Unnamed: 0,hit@1_0.5,hit_count@1_0.5,min_dist@1_0.5,median_dist@1_0.5:,compactness@1_0.5,coverage,mean_c_repeats
0,0.032258,0.032258,3.359326,3.359326,0.0,57.0,1.087719
1,0.112903,0.112903,2.697057,2.697057,0.0,49.0,1.265306
2,0.080645,0.080645,2.747747,2.747747,0.0,31.0,2.0


0.5 5


Unnamed: 0,hit@5_0.5,hit_count@5_0.5,min_dist@5_0.5,median_dist@5_0.5:,compactness@5_0.5,coverage,mean_c_repeats
0,0.064516,0.064516,1.466406,3.308617,2.377173,215.0,1.44186
1,0.112903,0.564516,2.624528,2.708826,0.090546,190.0,1.631579
2,0.322581,0.354839,1.051528,2.773751,1.908214,160.0,1.9375


0.5 10


Unnamed: 0,hit@10_0.5,hit_count@10_0.5,min_dist@10_0.5,median_dist@10_0.5:,compactness@10_0.5,coverage,mean_c_repeats
0,0.258065,0.258065,0.974739,3.379577,2.707698,319.0,1.943574
1,0.112903,1.048387,2.533378,2.722187,0.148163,278.0,2.230216
2,0.516129,0.580645,0.594743,2.913447,2.218704,252.0,2.460317


0.5 20


Unnamed: 0,hit@20_0.5,hit_count@20_0.5,min_dist@20_0.5,median_dist@20_0.5:,compactness@20_0.5,coverage,mean_c_repeats
0,0.435484,0.580645,0.588664,3.1995,2.681686,390.0,3.179487
1,0.129032,2.129032,2.354036,2.718883,0.230513,348.0,3.563218
2,0.790323,1.193548,0.254887,2.932963,2.223539,349.0,3.553009


0.5 30


Unnamed: 0,hit@30_0.5,hit_count@30_0.5,min_dist@30_0.5,median_dist@30_0.5:,compactness@30_0.5,coverage,mean_c_repeats
0,0.435484,0.532258,0.583004,3.333345,2.759688,397.0,4.685139
1,0.129032,3.177419,2.148231,2.702412,0.329313,385.0,4.831169
2,0.887097,1.774194,0.160619,2.924541,2.233738,377.0,4.933687


1 1


Unnamed: 0,hit@1_1,hit_count@1_1,min_dist@1_1,median_dist@1_1:,compactness@1_1,coverage,mean_c_repeats
0,0.096774,0.096774,3.598799,3.598799,0.0,60.0,1.033333
1,0.129032,0.129032,2.697057,2.697057,0.0,49.0,1.265306
2,0.129032,0.129032,2.747747,2.747747,0.0,31.0,2.0


1 5


Unnamed: 0,hit@5_1,hit_count@5_1,min_dist@5_1,median_dist@5_1:,compactness@5_1,coverage,mean_c_repeats
0,0.435484,0.516129,1.304404,3.25108,2.472831,241.0,1.286307
1,0.129032,0.645161,2.624528,2.708826,0.090546,190.0,1.631579
2,0.516129,0.709677,1.051528,2.773751,1.908214,160.0,1.9375


1 10


Unnamed: 0,hit@10_1,hit_count@10_1,min_dist@10_1,median_dist@10_1:,compactness@10_1,coverage,mean_c_repeats
0,0.564516,0.725806,1.016112,3.273152,2.577111,312.0,1.987179
1,0.145161,1.306452,2.533378,2.722187,0.148163,278.0,2.230216
2,0.693548,1.225806,0.594743,2.913447,2.218704,252.0,2.460317


1 20


Unnamed: 0,hit@20_1,hit_count@20_1,min_dist@20_1,median_dist@20_1:,compactness@20_1,coverage,mean_c_repeats
0,0.790323,1.306452,0.713066,3.376207,2.746334,377.0,3.289125
1,0.209677,2.870968,2.354036,2.718883,0.230513,348.0,3.563218
2,0.903226,2.354839,0.254887,2.932963,2.223539,349.0,3.553009


1 30


Unnamed: 0,hit@30_1,hit_count@30_1,min_dist@30_1,median_dist@30_1:,compactness@30_1,coverage,mean_c_repeats
0,0.870968,1.983871,0.605509,3.248308,2.747911,400.0,4.65
1,0.290323,4.516129,2.148231,2.702412,0.329313,385.0,4.831169
2,0.951613,3.354839,0.160619,2.924541,2.233738,377.0,4.933687


2 1


Unnamed: 0,hit@1_2,hit_count@1_2,min_dist@1_2,median_dist@1_2:,compactness@1_2,coverage,mean_c_repeats
0,0.16129,0.16129,3.515749,3.515749,0.0,57.0,1.087719
1,0.387097,0.387097,2.697057,2.697057,0.0,49.0,1.265306
2,0.387097,0.387097,2.747747,2.747747,0.0,31.0,2.0


2 5


Unnamed: 0,hit@5_2,hit_count@5_2,min_dist@5_2,median_dist@5_2:,compactness@5_2,coverage,mean_c_repeats
0,0.741935,1.258065,1.395591,3.289079,2.470556,214.0,1.448598
1,0.419355,1.919355,2.624528,2.708826,0.090546,190.0,1.631579
2,0.83871,1.725806,1.051528,2.773751,1.908214,160.0,1.9375


2 10


Unnamed: 0,hit@10_2,hit_count@10_2,min_dist@10_2,median_dist@10_2:,compactness@10_2,coverage,mean_c_repeats
0,0.983871,2.564516,1.006509,3.239726,2.561497,318.0,1.949686
1,0.451613,3.774194,2.533378,2.722187,0.148163,278.0,2.230216
2,0.951613,3.209677,0.594743,2.913447,2.218704,252.0,2.460317


2 20


Unnamed: 0,hit@20_2,hit_count@20_2,min_dist@20_2,median_dist@20_2:,compactness@20_2,coverage,mean_c_repeats
0,0.967742,4.387097,0.896442,3.354866,2.756023,374.0,3.315508
1,0.5,7.887097,2.354036,2.718883,0.230513,348.0,3.563218
2,0.983871,6.048387,0.254887,2.932963,2.223539,349.0,3.553009


2 30


Unnamed: 0,hit@30_2,hit_count@30_2,min_dist@30_2,median_dist@30_2:,compactness@30_2,coverage,mean_c_repeats
0,1.0,7.193548,0.582485,3.260487,2.697321,398.0,4.673367
1,0.532258,11.854839,2.148231,2.702412,0.329313,385.0,4.831169
2,0.983871,9.241935,0.160619,2.924541,2.233738,377.0,4.933687


**Bayesian optimization (Bopt)**

In [42]:
# Tune the 'gdlink' ranking parameters on the validation split for every
# (dt, k) pair by maximising the objective built with get_F. Every
# optimisation step is appended to a per-(dt, k) JSON log file.
for dt in [0.5, 1, 2]:
    print(f"DT = {dt}")
    # File-name-safe form of the threshold (e.g. 0.5 -> '05'); it does not
    # depend on k, so it is computed once per dt.
    dt_str = f'{dt}'.replace('.', '')
    for k in [5, 30, 50]:
        print(f'K = {k}')
        optimizer = BayesianOptimization(
            f=get_F(X_train, X_val, dt, k),
            pbounds=RankingGeneratorGDLinkage(X_train).get_params(),
            verbose=2,
            random_state=1,
        )
        log_file = f'data/bayes_opt_logs/b_opt_log_dt_{dt_str}_k_{k}.json'
        # Make sure the log directory exists before the logger writes to it;
        # otherwise a fresh checkout fails on the first logged step.
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        logger = JSONLogger(path=log_file)
        optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

        # 50 initial probes followed by 100 optimisation iterations.
        optimizer.maximize(
            init_points=50,
            n_iter=100,
        )
        print(optimizer.max)

DT = 0.5
K = 5
{'target': 0.8064516129032258, 'params': {'d_alpha': 1.7619625557505525, 'd_t': 0.332063574366837, 'g_mean': 0.1309968448109169, 'g_std': 8.285416229139239, 'g_t': 0.34473665268329345}}
K = 30
{'target': 3.096774193548387, 'params': {'d_alpha': 2.263812219300573, 'd_t': 0.0, 'g_mean': 1.0, 'g_std': 5.553768064012728, 'g_t': 0.0}}
K = 50
{'target': 4.32258064516129, 'params': {'d_alpha': 3.7017287879059517, 'd_t': 0.0, 'g_mean': 0.0, 'g_std': 6.508822047333712, 'g_t': 0.8527412921380608}}
DT = 1
K = 5
{'target': 1.3870967741935485, 'params': {'d_alpha': 1.7863792775070197, 'd_t': 0.0, 'g_mean': 0.5053368041991528, 'g_std': 1.4132516677755633, 'g_t': 0.0}}
K = 30
{'target': 7.645161290322581, 'params': {'d_alpha': 0.6001014008251204, 'd_t': 0.5504265265329386, 'g_mean': 0.1662535096462875, 'g_std': 6.424477638720284, 'g_t': 0.6021081830896102}}
K = 50
{'target': 11.129032258064516, 'params': {'d_alpha': 0.40302695880081496, 'd_t': 0.6640206048635708, 'g_mean': 0.0489581825

# Evaluate

In [11]:
import os
import json


# Collect, for every (dt, k) pair, the probe with the highest 'target' from
# the corresponding optimisation log and tabulate its parameters.
# NOTE(review): the optimisation cell of this notebook writes
# 'b_opt_log_dt_<dt>_k_<k>.json', whereas this cell reads
# 'probes_dt_<dt>_k_<k>_v2.json' — confirm the logs were renamed on disk.
best_rows = []
for dt in [0.5, 1, 2]:
    dt_str = f'{dt}'.replace('.', '')
    for k in [5, 30, 50]:
        log_file = f'data/bayes_opt_logs/probes_dt_{dt_str}_k_{k}_v2.json'

        # The log holds one JSON document per line; blank lines are skipped.
        with open(log_file, 'r') as f:
            probes = [json.loads(line) for line in f if line.strip()]
        if not probes:
            raise ValueError(f'No probes found in {log_file}')

        # max() returns the first probe among ties, like the previous
        # descending stable sort followed by [0].
        best_probe = max(probes, key=lambda probe: probe['target'])

        best_rows.append({
            'dt': dt,
            'k': k,
            'target': best_probe['target'],
            **best_probe['params'],
        })

# One row per (dt, k) pair, built in a single DataFrame construction.
results = pd.DataFrame(best_rows)
results

Unnamed: 0,dt,k,target,d_alpha,d_t,g_mean,g_std,g_t
0,0.5,5,0.806452,1.761963,0.332064,0.130997,8.285416,0.344737
1,0.5,30,3.096774,2.263812,0.0,1.0,5.553768,0.0
2,0.5,50,4.322581,3.701729,0.0,0.0,6.508822,0.852741
3,1.0,5,1.387097,1.786379,0.0,0.505337,1.413252,0.0
4,1.0,30,7.645161,0.600101,0.550427,0.166254,6.424478,0.602108
5,1.0,50,11.129032,0.403027,0.664021,0.048958,6.411477,0.820671
6,2.0,5,3.080645,0.19367,0.678836,0.211628,3.38992,0.491573
7,2.0,30,16.419355,0.137693,0.735726,0.13209,3.395357,0.367609
8,2.0,50,25.274194,0.835008,0.161852,0.357083,4.515601,0.539859


In [300]:
# Candidates from the 'greedy' locator for every validation sample; the
# second argument (10) is what other cells of this notebook pass as `k`.
bc_g = locator_g.batch_locate(X_val, 10)

In [301]:
# Inspect one validation sample: its candidates as a heat map plus a circle
# (dt = 1) around the coordinates stored in the sample's own row.
i = 22
query_row = X_val.iloc[[i]].reset_index()
plot_results(bc_g[i], query_row, 1)

In [227]:
# Hand-picked parameters for the 'gdlink' ranking generator.
gdlink_params = dict(d_alpha=1, d_t=0.4, g_mean=0.5, g_std=5, g_t=0.5)

locator = Geolocator(X_train).set_candidates_generator('grid', grid_size=20)
locator = locator.set_ranking_generator('gdlink', **gdlink_params)

# NOTE: `k` is not defined in this cell; it is whatever an earlier cell left
# in the kernel.
bc = locator.batch_locate(X_val, k)

In [281]:
# Validation metrics in a fixed order: random, best_match, greedy, gdlink.
# `dt` and `k` are taken from the kernel state left by earlier cells.
for candidates in (bc_r, bc_bm, bc_g, bc):
    display(collect_test_metrics(dt, k, X_val, candidates))

hit@5_0.5               0.064516
hit_count@5_0.5         0.064516
min_dist@5_0.5          1.538298
median_dist@5_0.5:      3.328163
compactness@5_0.5       2.367309
coverage              211.000000
mean_c_repeats          1.469194
dtype: float64

hit@5_0.5              0.112903
hit_count@5_0.5        0.532258
min_dist@5_0.5         2.392057
median_dist@5_0.5:     2.687745
compactness@5_0.5      0.265607
coverage              86.000000
mean_c_repeats         3.604651
dtype: float64

hit@5_0.5              0.306452
hit_count@5_0.5        0.322581
min_dist@5_0.5         0.823155
median_dist@5_0.5:     2.921352
compactness@5_0.5      2.151586
coverage              73.000000
mean_c_repeats         4.246575
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.225806
min_dist@5_0.5         1.314612
median_dist@5_0.5:     2.347907
compactness@5_0.5      2.573274
coverage              10.000000
mean_c_repeats        31.000000
dtype: float64

In [239]:
from itertools import product

# Evaluate the 'gdlink' ranking at every combination of two evenly spaced
# values per parameter (presumably the low / high end of each bound — the
# same dict is used as `pbounds` for the Bayesian optimiser).
params = RankingGeneratorGDLinkage(X_train).get_params()
value_grid = [np.linspace(*bounds, 2) for bounds in params.values()]
for p in product(*value_grid):
    # Pair every parameter name with the value chosen for this combination.
    gdlink_kwargs = dict(zip(params, p))
    locator = Geolocator(X_train).set_candidates_generator('grid', grid_size=20)
    locator = locator.set_ranking_generator('gdlink', **gdlink_kwargs)
    # `dt` and `k` come from the kernel state left by earlier cells.
    bc = locator.batch_locate(X_val, k)
    display(collect_test_metrics(dt, k, X_val, bc))

hit@5_0.5              0.145161
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.907336
median_dist@5_0.5:     2.250519
compactness@5_0.5      0.286910
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.907336
median_dist@5_0.5:     2.250519
compactness@5_0.5      0.286910
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.907336
median_dist@5_0.5:     2.250519
compactness@5_0.5      0.286910
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.338710
min_dist@5_0.5         1.900651
median_dist@5_0.5:     2.245657
compactness@5_0.5      0.286898
coverage               8.000000
mean_c_repeats        38.750000
dtype: float64

hit@5_0.5              0.258065
hit_count@5_0.5        0.564516
min_dist@5_0.5         1.944041
median_dist@5_0.5:     2.294204
compactness@5_0.5      0.305548
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.322581
hit_count@5_0.5        0.629032
min_dist@5_0.5         1.814790
median_dist@5_0.5:     2.289688
compactness@5_0.5      0.483363
coverage               7.000000
mean_c_repeats        44.285714
dtype: float64

hit@5_0.5              0.258065
hit_count@5_0.5        0.564516
min_dist@5_0.5         1.944041
median_dist@5_0.5:     2.294204
compactness@5_0.5      0.305548
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.306452
hit_count@5_0.5        0.677419
min_dist@5_0.5         1.770413
median_dist@5_0.5:     2.306948
compactness@5_0.5      0.565768
coverage              12.000000
mean_c_repeats        25.833333
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.290323
min_dist@5_0.5         1.888390
median_dist@5_0.5:     2.236658
compactness@5_0.5      0.320769
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.873340
median_dist@5_0.5:     2.240443
compactness@5_0.5      0.340329
coverage               6.000000
mean_c_repeats        51.666667
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.290323
min_dist@5_0.5         1.888390
median_dist@5_0.5:     2.236658
compactness@5_0.5      0.320769
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.338710
min_dist@5_0.5         1.796673
median_dist@5_0.5:     2.258571
compactness@5_0.5      0.430641
coverage              11.000000
mean_c_repeats        28.181818
dtype: float64

hit@5_0.5              0.419355
hit_count@5_0.5        0.677419
min_dist@5_0.5         1.661434
median_dist@5_0.5:     2.303538
compactness@5_0.5      0.774402
coverage               6.000000
mean_c_repeats        51.666667
dtype: float64

hit@5_0.5              0.403226
hit_count@5_0.5        0.677419
min_dist@5_0.5         1.688413
median_dist@5_0.5:     2.297751
compactness@5_0.5      0.887976
coverage               7.000000
mean_c_repeats        44.285714
dtype: float64

hit@5_0.5              0.419355
hit_count@5_0.5        0.693548
min_dist@5_0.5         1.658928
median_dist@5_0.5:     2.299919
compactness@5_0.5      0.790866
coverage               6.000000
mean_c_repeats        51.666667
dtype: float64

hit@5_0.5              0.338710
hit_count@5_0.5        0.709677
min_dist@5_0.5         1.829607
median_dist@5_0.5:     2.271097
compactness@5_0.5      0.777385
coverage              12.000000
mean_c_repeats        25.833333
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.907336
median_dist@5_0.5:     2.250519
compactness@5_0.5      0.286910
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.907336
median_dist@5_0.5:     2.250519
compactness@5_0.5      0.286910
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.907336
median_dist@5_0.5:     2.250519
compactness@5_0.5      0.286910
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.225806
min_dist@5_0.5         1.314612
median_dist@5_0.5:     2.347907
compactness@5_0.5      2.573274
coverage              10.000000
mean_c_repeats        31.000000
dtype: float64

hit@5_0.5              0.258065
hit_count@5_0.5        0.564516
min_dist@5_0.5         1.944041
median_dist@5_0.5:     2.294204
compactness@5_0.5      0.305548
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.322581
hit_count@5_0.5        0.629032
min_dist@5_0.5         1.784496
median_dist@5_0.5:     2.286491
compactness@5_0.5      0.506725
coverage               7.000000
mean_c_repeats        44.285714
dtype: float64

hit@5_0.5              0.258065
hit_count@5_0.5        0.564516
min_dist@5_0.5         1.944041
median_dist@5_0.5:     2.294204
compactness@5_0.5      0.305548
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.225806
min_dist@5_0.5         1.314612
median_dist@5_0.5:     2.347907
compactness@5_0.5      2.573274
coverage              10.000000
mean_c_repeats        31.000000
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.290323
min_dist@5_0.5         1.888390
median_dist@5_0.5:     2.236658
compactness@5_0.5      0.320769
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.306452
min_dist@5_0.5         1.855956
median_dist@5_0.5:     2.240979
compactness@5_0.5      0.346849
coverage               6.000000
mean_c_repeats        51.666667
dtype: float64

hit@5_0.5              0.145161
hit_count@5_0.5        0.290323
min_dist@5_0.5         1.888390
median_dist@5_0.5:     2.236658
compactness@5_0.5      0.320769
coverage               5.000000
mean_c_repeats        62.000000
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.225806
min_dist@5_0.5         1.314612
median_dist@5_0.5:     2.347907
compactness@5_0.5      2.573274
coverage              10.000000
mean_c_repeats        31.000000
dtype: float64

hit@5_0.5              0.419355
hit_count@5_0.5        0.677419
min_dist@5_0.5         1.661434
median_dist@5_0.5:     2.303538
compactness@5_0.5      0.774402
coverage               6.000000
mean_c_repeats        51.666667
dtype: float64

hit@5_0.5              0.387097
hit_count@5_0.5        0.645161
min_dist@5_0.5         1.716923
median_dist@5_0.5:     2.300205
compactness@5_0.5      0.886298
coverage               7.000000
mean_c_repeats        44.285714
dtype: float64

hit@5_0.5              0.419355
hit_count@5_0.5        0.693548
min_dist@5_0.5         1.658928
median_dist@5_0.5:     2.299919
compactness@5_0.5      0.790866
coverage               6.000000
mean_c_repeats        51.666667
dtype: float64

hit@5_0.5              0.161290
hit_count@5_0.5        0.225806
min_dist@5_0.5         1.314612
median_dist@5_0.5:     2.347907
compactness@5_0.5      2.573274
coverage              10.000000
mean_c_repeats        31.000000
dtype: float64

In [None]:
# Display the validation split for inspection.
X_val

In [40]:
# import json

# def generate_example(row, k):
#     genotype = row[AVAILABLE_LOCI].sample(k).astype('float')
#     print(genotype)
#     return {
#         'meta_data': dict(row[['population', 'id', 'region', 'lat', 'long']]),
#         'genotype': dict(genotype)
#     }
    
    
# with open("data/apps/geolocation/sample3.json", "w") as write_file:
#     json.dump(generate_example(examples.iloc[2], 30), write_file)

# HDBSCAN

In [152]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as data
%matplotlib inline
# Seaborn appearance for this section: 'poster' context, 'white' style and
# the short colour codes enabled.
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()

# Keyword arguments kept in one place for later plotting calls
# (presumably scatter plots — TODO confirm).
plot_kwds = dict(alpha=0.5, s=80, linewidths=0)

In [195]:
# hdbscan is not imported by any cell above, so import it here to keep this
# cell runnable on a fresh kernel.
import hdbscan

# Fit HDBSCAN on 400 sampled rows of the AVAILABLE_LOCI columns of X_train.
# The sample is seeded so that the clustering is reproducible between runs.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(
    X_train[AVAILABLE_LOCI].sample(400, random_state=1)
)

In [196]:
# Per-sample outlier scores of the fitted clusterer.
clusterer.outlier_scores_

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.