In [None]:
import pandas as pd
import numpy as np
import os

import seaborn as sns
from matplotlib import pyplot as plt

import optuna
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, train_test_split

import optuna

In [None]:
# Seed NumPy's legacy global RNG so that any np.random draws are repeatable.
# NOTE(review): the splitters in this notebook either pass their own random_state
# or are built without shuffle, so this seed may not affect the written splits — confirm.
np.random.seed(404)

In [None]:
# Directory that receives every split CSV written by this notebook.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
split_root = '/home/k/k202141/rootgit/AI4EO-MapYourCity/data/AI4EO-MapYourCity/splits/'

In [None]:
# Load the train and test tables of the building-age dataset.
# NOTE(review): absolute, user-specific paths — adjust before running elsewhere.
train_df = pd.read_csv('/home/k/k202141/rootgit/AI4EO-MapYourCity/data/AI4EO-MapYourCity/v1/building-age-dataset/train/train-set.csv')
test_df = pd.read_csv('/home/k/k202141/rootgit/AI4EO-MapYourCity/data/AI4EO-MapYourCity/v1/building-age-dataset/test/test-set.csv')
# Print column names, dtypes and non-null counts of the training table.
train_df.info()

In [None]:
# Fraction of test-set rows belonging to each country_id.
test_df['country_id'].value_counts() / len(test_df)

## Split frame for development

In [None]:
# Reference distribution: share of test-set rows per country.
# The column names are set explicitly so that later cells can rely on
# ref['country_id'] and ref['count'] regardless of how the installed pandas
# version names the result of value_counts().
ref = (
    (test_df['country_id'].value_counts() / len(test_df))
    .rename_axis('country_id')
    .reset_index(name='count')
)
ref

In [None]:
# Target number of rows for the development set.
n = 1500
# Preview of the per-country row counts implied by the test-set shares (rounded).
# NOTE(review): the sampling cell below uses int(count*n), i.e. truncation, so the
# counts actually drawn can be lower than the rounded values printed here.
print(np.round(ref['count'] * n))

In [None]:
# Collect the pids of the development set: for every country in `ref`, draw a
# label-stratified sample whose size follows that country's share of the test set.
dev_pids = []

for country_id, count in ref[['country_id', 'count']].itertuples(index=False):
    print(country_id)
    is_country = train_df['country_id'] == country_id
    country_df = train_df[is_country]
    print(len(country_df))

    # train_test_split returns (train part, test part); only the test part,
    # sized int(count*n), is kept as this country's development sample.
    df1 = train_test_split(country_df, test_size=int(count*n),
                           stratify=country_df['label'], random_state=1312)[1]

    dev_pids += list(df1['pid'])

In [None]:
# Total number of pids drawn for the development set.
len(dev_pids)

In [None]:
# Boolean mask over the rows of train_df whose pid was drawn for the development set.
# Series.isin does the membership test in one vectorised pass instead of scanning
# the dev_pids list once per row.
ix = train_df['pid'].isin(dev_pids).to_numpy()

# Split off the development rows. This rebinds train_df to the remaining rows, so
# re-running this cell without reloading the data would produce an empty dev_df.
dev_df = train_df[ix]
train_df = train_df[~ix]

dev_df.shape, train_df.shape

In [None]:
# Overlay the label counts of the remaining training rows and of the development
# rows on the same axes.
train_label_counts = train_df['label'].value_counts()
dev_label_counts = dev_df['label'].value_counts()

label_ax = sns.barplot(train_label_counts)
sns.barplot(dev_label_counts, ax=label_ax);

In [None]:
# Compare the country mix of the test set and the development set as densities
# drawn on the same axes.
country_ax = sns.histplot(test_df['country_id'], stat='density')
sns.histplot(dev_df['country_id'], stat='density', ax=country_ax);

In [None]:
# Persist the development set next to the split folders (row index not written).
dev_df.to_csv(os.path.join(split_root, 'dev-set.csv'), index=False)

In [None]:
def check_distribution(skey, variable, split='train', root=None):
    """Collect per-fold value counts of one column from the CSVs of a saved split.

    Parameters
    ----------
    skey : str
        Name of the split sub-directory to inspect.
    variable : str
        Column whose values are counted in every fold file.
    split : str, default 'train'
        Which part of each fold to read; selects files named
        ``split_{split}_{i}.csv``.
    root : str, optional
        Directory containing the split sub-directories. Defaults to the
        notebook-level ``split_root``.

    Returns
    -------
    pandas.DataFrame
        The ``value_counts`` tables of all folds stacked on top of each other,
        each row tagged with its fold number in a ``fold`` column.

    Raises
    ------
    ValueError
        If the directory contains no matching fold file.

    Notes
    -----
    Fold files are expected to be numbered contiguously starting at 0.
    """
    if root is None:
        root = split_root
    # Use the directory of the requested split (skey), not whatever the
    # notebook-level split_key currently happens to be.
    fold_dir = os.path.join(root, skey)

    prefix = f'split_{split}_'
    n_folds = sum(
        1 for name in os.listdir(fold_dir)
        if name.startswith(prefix) and name.endswith('.csv')
    )

    dfs = []
    for i in range(n_folds):
        df = pd.read_csv(os.path.join(fold_dir, f'{prefix}{i}.csv'))
        dfs.append(df[variable].value_counts().to_frame().reset_index().assign(fold=i))

    if not dfs:
        raise ValueError(f"No '{prefix}*.csv' files found in {fold_dir}")

    return pd.concat(dfs)

## Random split, stratified by labels

In [None]:
split_key = 'random_stratified'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# Five label-stratified folds; no shuffle or random_state is passed, so
# scikit-learn's defaults apply.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Build the label-stratified folds and record the label counts of every
# training part in check_dfs.
check_dfs = []
fold_iter = skf.split(train_df['pid'], train_df['label'])

for i, (train_idx, valid_idx) in enumerate(fold_iter):
    print(i, len(valid_idx))

    cv_train = train_df.iloc[train_idx].assign(fold=i)
    cv_valid = train_df.iloc[valid_idx].assign(fold=i)

    label_counts = cv_train['label'].value_counts().to_frame().reset_index()
    check_dfs.append(label_counts.assign(fold=i))

    # NOTE(review): writing is disabled for this split, so the CSVs read by
    # check_distribution must already exist on disk — confirm this is intended.
    #cv_train.to_csv(os.path.join(split_root, split_key, f'split_train_{i}.csv'), index=False)
    #cv_valid.to_csv(os.path.join(split_root, split_key, f'split_valid_{i}.csv'), index=False)

In [None]:
# Per-fold counts of the training parts: labels in the upper panel, cities in
# the lower one, both on a shared y axis.
fig, axs = plt.subplots(2, 1, sharey=True)

for ax, variable in zip(axs, ['label', 'city_id']):
    check_df = check_distribution('random_stratified', variable)
    sns.barplot(data=check_df, x=variable, y='count', hue='fold', ax=ax)

plt.show()

## Random split, stratified by labels and grouped by cities

In [None]:
split_key = 'random_stratified_labels_cities'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# Five folds, stratified on the labels passed to split() while keeping every
# group (passed as `groups` below) within a single validation fold.
skf = StratifiedGroupKFold(n_splits=5)

In [None]:
# Write one train/valid CSV pair per fold; city_id is the grouping variable,
# labels are the stratification target.
fold_dir = os.path.join(split_root, split_key)
folds = skf.split(train_df['pid'], train_df['label'], groups=train_df['city_id'])

for i, (train_idx, valid_idx) in enumerate(folds):
    cv_train = train_df.iloc[train_idx].assign(fold=i)
    cv_valid = train_df.iloc[valid_idx].assign(fold=i)
    print(len(cv_valid))

    cv_train.to_csv(os.path.join(fold_dir, f'split_train_{i}.csv'), index=False)
    cv_valid.to_csv(os.path.join(fold_dir, f'split_valid_{i}.csv'), index=False)

In [None]:
# Per-fold counts of the training parts: labels in the upper panel, cities in
# the lower one, both on a shared y axis.
fig, axs = plt.subplots(2, 1, sharey=True)

for ax, variable in zip(axs, ['label', 'city_id']):
    check_df = check_distribution(split_key, variable)
    sns.barplot(data=check_df, x=variable, y='count', hue='fold', ax=ax)

plt.show()

## Random split, stratified by labels and grouped by cities, no HUN

In [None]:
split_key = 'random_stratified_labels_cities_noHUN'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# Five folds, stratified on the labels passed to split() while keeping every
# group (passed as `groups` below) within a single validation fold.
skf = StratifiedGroupKFold(n_splits=5)

In [None]:
# Keep every training row except those of country HUN; show both shapes to see
# how many rows were removed.
is_hun = train_df['country_id'] == "HUN"
country_df = train_df[~is_hun]
country_df.shape, train_df.shape

In [None]:
# Write one train/valid CSV pair per fold for the HUN-free subset; city_id is
# the grouping variable, labels are the stratification target.
fold_dir = os.path.join(split_root, split_key)
folds = skf.split(country_df['pid'], country_df['label'], groups=country_df['city_id'])

for i, (train_idx, valid_idx) in enumerate(folds):
    cv_train = country_df.iloc[train_idx].assign(fold=i)
    cv_valid = country_df.iloc[valid_idx].assign(fold=i)

    cv_train.to_csv(os.path.join(fold_dir, f'split_train_{i}.csv'), index=False)
    cv_valid.to_csv(os.path.join(fold_dir, f'split_valid_{i}.csv'), index=False)

In [None]:
# Per-fold counts of the training parts: labels in the upper panel, cities in
# the lower one, both on a shared y axis.
fig, axs = plt.subplots(2, 1, sharey=True)

for ax, variable in zip(axs, ['label', 'city_id']):
    check_df = check_distribution(split_key, variable)
    sns.barplot(data=check_df, x=variable, y='count', hue='fold', ax=ax)

plt.show()

## Random split, stratified by labels and grouped by cities, 7 folds

In [None]:
split_key = 'random_stratified_labels_cities_7'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# Seven folds, stratified on the labels passed to split() while keeping every
# group (passed as `groups` below) within a single validation fold.
skf = StratifiedGroupKFold(n_splits=7)

In [None]:
# Write one train/valid CSV pair per fold; city_id is the grouping variable,
# labels are the stratification target.
fold_dir = os.path.join(split_root, split_key)
folds = skf.split(train_df['pid'], train_df['label'], groups=train_df['city_id'])

for i, (train_idx, valid_idx) in enumerate(folds):
    cv_train = train_df.iloc[train_idx].assign(fold=i)
    cv_valid = train_df.iloc[valid_idx].assign(fold=i)

    cv_train.to_csv(os.path.join(fold_dir, f'split_train_{i}.csv'), index=False)
    cv_valid.to_csv(os.path.join(fold_dir, f'split_valid_{i}.csv'), index=False)

In [None]:
# Per-fold counts of the training parts: labels in the upper panel, cities in
# the lower one, both on a shared y axis.
fig, axs = plt.subplots(2, 1, sharey=True)

for ax, variable in zip(axs, ['label', 'city_id']):
    check_df = check_distribution(split_key, variable)
    sns.barplot(data=check_df, x=variable, y='count', hue='fold', ax=ax)

plt.show()

## Split by countries

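Leave-one-country-out: each fold holds out all rows of one country for validation and trains on the rows of all remaining countries.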

In [None]:
# Distinct country ids present in the training table, in order of first appearance.
country_ids = train_df['country_id'].unique()
country_ids

In [None]:
split_key = 'leave_one_country_out'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# One fold per country: that country's rows form the validation part and all
# other rows form the training part.
fold_dir = os.path.join(split_root, split_key)

for i, country_id in enumerate(country_ids):
    valid_idx = train_df['country_id'] == country_id
    train_idx = ~valid_idx

    cv_train = train_df[train_idx].assign(fold=i)
    cv_valid = train_df[valid_idx].assign(fold=i)

    print(i, train_idx.sum(), valid_idx.sum())

    cv_train.to_csv(os.path.join(fold_dir, f'split_train_{i}.csv'), index=False)
    cv_valid.to_csv(os.path.join(fold_dir, f'split_valid_{i}.csv'), index=False)

In [None]:
# Per-fold counts of the training parts: labels in the upper panel, cities in
# the lower one, both on a shared y axis.
fig, axs = plt.subplots(2, 1, sharey=True)

for ax, variable in zip(axs, ['label', 'city_id']):
    check_df = check_distribution(split_key, variable)
    sns.barplot(data=check_df, x=variable, y='count', hue='fold', ax=ax)

plt.show()

## Use one country only

In [None]:
# Display the country ids again for reference while defining this split.
country_ids

In [None]:
split_key = 'use_only_one_country'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# Dictionary based on actual counts
train_cities = dict(QCD=['JVVQZ', 'AJSUR'],
                    PNN=['U8MZD'],
                    HUN=['3TIYD'],
                    FMW=['H8ZYW', 'THRYV', 'O8WX6'])

In [None]:
# One fold per country, using only that country's rows: the cities listed in
# train_cities form the training part, its remaining cities the validation part.
for i, c in enumerate(country_ids):
    # EMA and NEA have no entry in train_cities and are skipped. The fold index
    # still advances for them, so the written file numbers can have gaps.
    if c in ['EMA', 'NEA']:
        continue
    print('  ' + c)

    country_df = train_df[train_df['country_id'] == c]

    # Boolean mask instead of a query string assembled from the list's repr.
    in_train_city = country_df['city_id'].isin(train_cities[c])

    cv_train = country_df[in_train_city].assign(fold=i)
    cv_valid = country_df[~in_train_city].assign(fold=i)
    cv_valid.to_csv(os.path.join(split_root, split_key, f'split_valid_{i}.csv'), index=False)
    cv_train.to_csv(os.path.join(split_root, split_key, f'split_train_{i}.csv'), index=False)

## Random split, stratified by labels and grouped by cities, drop EMA and NEA

In [None]:
split_key = 'random_stratified_labels_cities_noEMA_noNEA'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# Five folds, stratified on the labels passed to split() while keeping every
# group (passed as `groups` below) within a single validation fold.
skf = StratifiedGroupKFold(n_splits=5)

In [None]:
# Remove the EMA and NEA rows before splitting, as the split name states.
# Splitting the unfiltered train_df would just reproduce the plain
# labels-and-cities split under a different folder name.
no_ema_nea_df = train_df[~train_df['country_id'].isin(['EMA', 'NEA'])]

fold_dir = os.path.join(split_root, split_key)
folds = skf.split(no_ema_nea_df['pid'], no_ema_nea_df['label'],
                  groups=no_ema_nea_df['city_id'])

# Write one train/valid CSV pair per fold; city_id is the grouping variable,
# labels are the stratification target.
for i, (train_idx, valid_idx) in enumerate(folds):
    cv_train = no_ema_nea_df.iloc[train_idx].assign(fold=i)
    cv_valid = no_ema_nea_df.iloc[valid_idx].assign(fold=i)

    cv_train.to_csv(os.path.join(fold_dir, f'split_train_{i}.csv'), index=False)
    cv_valid.to_csv(os.path.join(fold_dir, f'split_valid_{i}.csv'), index=False)

In [None]:
# Per-fold counts of the training parts: labels in the upper panel, cities in
# the lower one, both on a shared y axis.
fig, axs = plt.subplots(2, 1, sharey=True)

for ax, variable in zip(axs, ['label', 'city_id']):
    check_df = check_distribution(split_key, variable)
    sns.barplot(data=check_df, x=variable, y='count', hue='fold', ax=ax)

plt.show()

## Use one country only, but stratified 5-fold

In [None]:
# Display the country ids again for reference while defining this split.
country_ids

In [None]:
split_key = 'use_only_one_country_5-fold'

# Create the output folder for this split. exist_ok=True makes re-running the
# cell harmless, and missing parent folders are created as well.
split_dir = os.path.join(split_root, split_key)
os.makedirs(split_dir, exist_ok=True)
print(split_dir)

In [None]:
# Running fold index across all countries. It uses its own name so the
# notebook-level `n` (the development-set size used by the sampling cell)
# is not overwritten by this cell.
fold_id = 0

# The same splitter configuration is used for every country, so build it once.
skf = StratifiedKFold(n_splits=5)
fold_dir = os.path.join(split_root, split_key)

for c in country_ids:
    # EMA and NEA are excluded from the single-country splits.
    if c in ['EMA', 'NEA']:
        continue
    print('  ' + c)

    country_df = train_df[train_df['country_id'] == c]

    # Five label-stratified folds within this country; files are numbered
    # consecutively over all countries.
    for train_idx, valid_idx in skf.split(country_df['pid'], country_df['label']):
        print(fold_id, len(valid_idx))

        cv_train = country_df.iloc[train_idx].assign(fold=fold_id)
        cv_valid = country_df.iloc[valid_idx].assign(fold=fold_id)
        cv_train.to_csv(os.path.join(fold_dir, f'split_train_{fold_id}.csv'), index=False)
        cv_valid.to_csv(os.path.join(fold_dir, f'split_valid_{fold_id}.csv'), index=False)
        fold_id += 1
    
