In [None]:
import os
os.chdir(os.environ['PROJECT_ROOT'])

%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from populations.dataset import STRDatasetsHandler
# Show up to 100 columns / rows when a DataFrame is displayed.
# Use the fully-qualified option keys: abbreviated patterns such as 'max.columns'
# are resolved by regex search and become ambiguous (OptionError) on pandas
# versions that also register 'styler.render.max_columns' / 'styler.render.max_rows'.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

**Survey (correct region, lat/long)**

In [None]:
# Read both versions of the survey export (paths are relative to the working directory).
survey_v1, survey_v2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/apps/map/survey_data_v1.csv',
        'data/apps/map/survey_data_v2.csv',
    )
)

**Dataset handler**

In [None]:
# Create the dataset handler used by every following cell (patching, joining,
# one-hot encoding and dataset lookup all go through `dh`).
dh = STRDatasetsHandler()

**Bel Pop patch**

In [None]:
# Patch the 'bel_pop' dataset in place and report its frame shape before/after.
# NOTE(review): this reaches into the handler's private `_datasets` / `_df`
# attributes, and re-running the cell applies the patches a second time.
bel_pop_ds = dh._datasets['bel_pop']
print(f'Shape before patching {bel_pop_ds._df.shape}')
bel_pop_ds.patch_ids('bel_pop_')
bel_pop_ds.patch_meta_data(survey_v2, ['id', 'population'])
print(f'Shape after patching {bel_pop_ds._df.shape}')

**Autosomal 2020 patch**

In [None]:
# Patch the 'autosomal_2020' dataset with survey_v2 metadata (matching on 'id')
# and report its frame shape before/after.
# NOTE(review): relies on the handler's private `_datasets` / `_df` attributes.
autosomal_2020_ds = dh._datasets['autosomal_2020']
print(f'Shape before patching {autosomal_2020_ds._df.shape}')
autosomal_2020_ds.patch_meta_data(survey_v2, ['id'])
print(f'Shape after patching {autosomal_2020_ds._df.shape}')

* **All Bel data**
* **New Bel data & Us data**

In [None]:
# Build the two combined datasets used below:
#   'bel_data'      <- 'bel_pop' + 'autosomal_2020'
#   'new_bel_vs_us' <- 'us_data_2016' + 'autosomal_2020'
bel_data_parts = ['bel_pop', 'autosomal_2020']
new_bel_vs_us_parts = ['us_data_2016', 'autosomal_2020']
dh.join_datasets('bel_data', bel_data_parts)
dh.join_datasets('new_bel_vs_us', new_bel_vs_us_parts)

In [None]:
# Create a '<id>_one_hot' variant for every dataset known to the handler.
# The keys are copied with list() before looping; presumably one_hot_encode
# registers its result in `dh._datasets` (later cells fetch '..._one_hot' ids
# via dh.get) — TODO confirm.
# Ids that already end with the suffix are skipped so that re-running this cell
# does not encode the encoded datasets again ('..._one_hot_one_hot').
ONE_HOT_SUFFIX = '_one_hot'
for ds_id in list(dh._datasets.keys()):
    if ds_id.endswith(ONE_HOT_SUFFIX):
        continue
    dh.one_hot_encode(ds_id, f'{ds_id}{ONE_HOT_SUFFIX}')

In [None]:
# Display what the handler reports as available — presumably the ids of all
# datasets registered so far, including the joined and one-hot ones (confirm).
dh.available_datasets()

In [None]:
# Display the handler's own summary of its datasets (the cell output is
# whatever describe() returns or prints).
dh.describe()

### Belarusian population

**source**

In [None]:
from populations.viz_utils import vizualize_pca

# PCA view of the combined Belarusian data, labelled by 'source':
# first the raw dataset (with one_hot=True), then the pre-encoded one-hot dataset.
vizualize_pca(
    dh.get('bel_data'),
    'source',
    one_hot=True,
)
vizualize_pca(
    dh.get('bel_data_one_hot'),
    'source',
)

**region**

In [None]:
# PCA view of the combined Belarusian data, labelled by 'region':
# first the raw dataset (with one_hot=True), then the pre-encoded one-hot dataset.
vizualize_pca(
    dh.get('bel_data'),
    'region',
    one_hot=True,
)
vizualize_pca(
    dh.get('bel_data_one_hot'),
    'region',
)

**US data**

In [None]:
# PCA view of the US 2016 data, labelled by 'population':
# first the pre-encoded one-hot dataset, then the raw dataset (with one_hot=True).
vizualize_pca(
    dh.get('us_data_2016_one_hot'),
    'population',
)
vizualize_pca(
    dh.get('us_data_2016'),
    'population',
    one_hot=True,
)

**Bel vs US**

In [None]:
# PCA view of the joined new-Bel + US one-hot dataset, labelled first by
# 'source' and then by 'population'.
vizualize_pca(
    dh.get('new_bel_vs_us_one_hot'),
    'source',
)
vizualize_pca(
    dh.get('new_bel_vs_us_one_hot'),
    'population',
)

## 1 vs 1 clf

In [None]:
from sklearn.model_selection import StratifiedKFold

# Cross-validation splitter shared by all classifier cells below:
# 10 stratified folds, shuffled with a fixed seed.
folds = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

**Bel data**

In [None]:
from populations.clf_utils import one_vs_one_clfs, one_vs_all_clfs
from populations.viz_utils import plot_heatmap

# One-vs-one classifiers on the Belarusian data with 'region' as the label,
# run once on the raw dataset and once on its one-hot variant.
# Results are presumably test ROC AUC scores per label pair (going by the
# variable names) — confirm against clf_utils.
test_roc_auc = one_vs_one_clfs(dh.get('bel_data'), 'region', folds)
test_roc_auc_one_hot = one_vs_one_clfs(dh.get('bel_data_one_hot'), 'region', folds)

In [None]:
# Heatmaps of the Belarusian 'region' one-vs-one results (raw, then one-hot).
# NOTE(review): `test_roc_auc*` are reassigned by later sections, so this cell
# only matches its titles when run right after the Bel data classifier cell.
plot_heatmap(
    test_roc_auc,
    'Bel data "region" 1vs1 results',
)
plot_heatmap(
    test_roc_auc_one_hot,
    'Bel data (one-hot) "region" 1vs1 results',
)

In [None]:
# NOTE(review): this cell draws the same two heatmaps as the cell directly
# before it (same variables, same titles) — confirm whether it is an
# accidental copy that can be dropped.
plot_heatmap(
    test_roc_auc,
    'Bel data "region" 1vs1 results',
)
plot_heatmap(
    test_roc_auc_one_hot,
    'Bel data (one-hot) "region" 1vs1 results',
)

**US**

In [None]:
# One-vs-one classifiers on the US 2016 data with 'population' as the label,
# run once on the raw dataset and once on its one-hot variant.
# This overwrites the `test_roc_auc*` results of the previous section.
test_roc_auc = one_vs_one_clfs(dh.get('us_data_2016'), 'population', folds)
test_roc_auc_one_hot = one_vs_one_clfs(dh.get('us_data_2016_one_hot'), 'population', folds)

In [None]:
# Heatmaps of the US 'population' one-vs-one results (raw, then one-hot).
plot_heatmap(
    test_roc_auc,
    'US data "population" 1vs1 results',
)
plot_heatmap(
    test_roc_auc_one_hot,
    'US data (one-hot) "population" 1vs1 results',
)

**New Bel vs US**

In [None]:
# One-vs-one classifiers on the joined new-Bel + US data with 'source' as the
# label, run once on the raw dataset and once on its one-hot variant.
# This overwrites the `test_roc_auc*` results of the previous section.
test_roc_auc = one_vs_one_clfs(dh.get('new_bel_vs_us'), 'source', folds)
test_roc_auc_one_hot = one_vs_one_clfs(dh.get('new_bel_vs_us_one_hot'), 'source', folds)


In [None]:
# Heatmaps of the US vs. new Belarusian 'source' one-vs-one results.
# The second plot uses the one-hot results, so its title is marked '(one-hot)'
# like the other heatmap cells; previously both plots had the same title.
plot_heatmap(test_roc_auc, 'US data vs Bel data results')
plot_heatmap(test_roc_auc_one_hot, 'US data vs Bel data (one-hot) results')

## 1 vs all clf

In [None]:
# One-vs-all classifiers on the raw Belarusian data with 'region' as the label;
# the call's return value is shown as the cell output.
one_vs_all_clfs(dh.get('bel_data'), 'region', folds)