In [None]:
import pandas as pd
import os
import glob

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import numpy as np

from matplotlib import pyplot as plt
import rasterio

In [None]:
import pprint

In [None]:
#pprint.pprint(os.listdir('../logs/experiments/runs/'))

In [None]:
# Name of the experiment to evaluate; later cells use it to build the
# evaluation log directory and the submission file names.
expname = 'topmodal_swin_05-01_A'

In [None]:
# Directory containing the runs of this experiment. The evaluations/ location is
# active; the experiments/ alternative is kept commented out.
#expdir = f'../logs/experiments/runs/{expname}/'
expdir = f'../logs/evaluations/runs/{expname}/'
# Show the directory entries as the cell output.
os.listdir(expdir)

In [None]:
# Full path of every entry found in the experiment directory.
all_logdirs = []
for entry in os.listdir(expdir):
    all_logdirs.append(os.path.join(expdir, entry))

In [None]:
# Find, for every fold index, the log directories that hold test predictions.
# `logdirs` and `folds` are filled in parallel (one entry per match).
logdirs = []
folds = []

fold_key = None
# Reset on every run so that a config left over from a previous execution of
# this cell is never reused when no prediction file is found.
config_tree = []

for fold in range(20):
    prediction_file = f'test_predictions_fold_{fold}.csv'
    for logdir in all_logdirs:
        if not os.path.exists(os.path.join(logdir, prediction_file)):
            continue

        print(fold, logdir)
        logdirs.append(logdir)
        folds.append(fold)

        # Keep the config lines of the most recently matched run; the
        # fold_key is parsed from them further down.
        with open(os.path.join(logdir, 'config_tree.log')) as f:
            config_tree = f.readlines()

In [None]:
# Display the lines read from config_tree.log (the loop above keeps only the
# file of the last matching log directory).
config_tree

In [None]:
# Scan the config lines for the first one mentioning 'fold_key' and take its
# last space-separated token. Lines mentioning 'jpg' that come before it are echoed.
for line in config_tree:
    if 'fold_key' not in line:
        if 'jpg' in line:
            print(line)
        continue
    fold_key = line.strip().rsplit(' ', 1)[-1]
    break
print(fold_key)

In [None]:
# Load the test-set table and show how many rows each country contributes.
test_set_csv = '../data/AI4EO-MapYourCity/v1/building-age-dataset/test/test-set.csv'
test_set = pd.read_csv(test_set_csv)
test_set['country_id'].value_counts()

In [None]:
# Fold index (as a string) -> country code: four countries, five consecutive
# folds each ('0'-'4', '5'-'9', '10'-'14', '15'-'19').
countries_fold = {
    str(fold): country
    for group, country in enumerate(('QCD', 'PNN', 'HUN', 'FMW'))
    for fold in range(5 * group, 5 * group + 5)
}

In [None]:
# Load the test and validation predictions of every discovered fold.
all_test_df = []
all_valid_df = []

all_pids = []

# Normalise the prediction column name; frames that have no
# 'predicted_labels' column are left unchanged by rename().
rnd = {'predicted_labels':'predicted_label'}

for fold, logdir in zip(folds, logdirs):
    tt = pd.read_csv(os.path.join(logdir, f'test_predictions_fold_{fold}.csv')).rename(columns=rnd)
    vv = pd.read_csv(os.path.join(logdir, f'valid_predictions_fold_{fold}.csv')).rename(columns=rnd)
    # Validation split table of this fold; it is joined to the predictions on 'pid' below.
    dd = pd.read_csv(f'../data/AI4EO-MapYourCity/splits/{fold_key}/split_valid_{fold}.csv')

    # restrict to single country if trained like this
    if fold_key == 'use_only_one_country_5-fold':
        country = countries_fold[str(fold)]
        print('Use only test set from country', country)
        country_pids = test_set.loc[test_set['country_id'] == country, 'pid'].values
        print(len(country_pids), ' samples')
        # Vectorised membership test; .copy() gives an independent frame so the
        # 'fold' column assignment below does not write into a slice of the original.
        tt = tt[tt['pid'].isin(country_pids)].copy()

    tt['fold'] = fold
    vv['fold'] = fold

    all_pids.append(tt['pid'].sort_values().values)

    print(len(tt['pid']), 'samples attached to test set')
    all_test_df.append(tt)
    # If the split table also has a 'fold' column, the merge suffixes the two as
    # 'fold_x' (set above) and 'fold_y' (from the split table).
    all_valid_df.append(pd.merge(vv, dd, on='pid'))

In [None]:
# Stack the per-fold test predictions and store the labels as integers.
test_df = pd.concat(all_test_df).astype({'predicted_label': int})
# One row per (pid, fold) pair as the cell output.
test_df.pivot_table(index=['pid', 'fold'])

In [None]:
# Save the per-fold predictions, tagged with the experiment name.
all_folds_csv = f'../submissions/all_folds/{expname}.csv'
tagged_test_df = test_df.assign(experiment=expname)
tagged_test_df.to_csv(all_folds_csv, index=False)

In [None]:
def get_best_class(sdf):
    '''
    Return the majority vote of the predicted labels for every pid.

    If several labels are tied for the most votes, the tied label closest to
    the mean of that pid's predictions is chosen (the smaller label if the
    distances are equal as well).

    Parameters
    ----------
    sdf : pandas.DataFrame
        One row per vote; must provide the columns 'pid' and 'predicted_label'.

    Returns
    -------
    pandas.DataFrame
        One row per pid with the columns 'pid' and 'predicted_label',
        sorted by 'pid'.
    '''

    spids = []
    slbls = []
    # Vote group by group. sort=False keeps the pids in order of first
    # appearance until the final sort_values.
    for pid, labels in sdf.groupby('pid', sort=False)['predicted_label']:
        modes = labels.mode()

        if len(modes) > 1:
            # Tie between several labels: pick the one nearest to the mean.
            distance = (modes - labels.mean()).abs().to_numpy()
            winner = modes.iloc[distance.argmin()]
        else:
            winner = modes.iloc[0]

        spids.append(pid)
        slbls.append(winner)

    return pd.DataFrame(dict(pid=spids, predicted_label=slbls)).sort_values('pid')

In [None]:
# Collapse the per-fold test predictions into one voted label per pid.
submission_df = get_best_class(test_df)

In [None]:
# Write the voted labels as this experiment's submission file, then preview
# the first 40 rows as the cell output.
submission_df.to_csv(f'../submissions/{expname}.csv', index=False)
submission_df.head(40)

In [None]:
# Number of submitted samples per predicted label, drawn as a bar chart.
label_counts = submission_df['predicted_label'].value_counts().to_frame().reset_index()
sns.barplot(label_counts, x='predicted_label', y='count');

In [None]:
# Submission files (relative to ../submissions/) whose label distributions are
# compared below; the last entry is the submission written by this notebook.
reference_submissions = ['streetview_swinv2_trafo.csv',
                         'topview_swinv2_trafo.csv',
                         'merged_topstreet_swin_04-16_C.csv',
                         'topview_street_sentinel2_04-23_B.csv',
                         f'{expname}.csv',
                        ]

In [None]:
# Per-submission label counts, stacked into one long table with a 'source'
# column naming the file each count came from.
submission_counts = []
for fname in reference_submissions:
    labels = pd.read_csv(os.path.join('../submissions/', fname))['predicted_label']
    submission_counts.append(labels.value_counts().to_frame().assign(source=fname))
all_submissions = pd.concat(submission_counts).reset_index()

In [None]:
# Compare the predicted-label distributions of the listed submissions (one hue per file).
sns.barplot(all_submissions, x='predicted_label', y='count', hue='source');

In [None]:
#assert test_df['pid'].value_counts().std() == 0 # all pids in all folds

In [None]:
# Peek at the merged validation frame of the second loaded fold
# (index 1, so at least two folds must have been loaded).
all_valid_df[1]

In [None]:
# Stack the validation predictions (already merged with their split tables)
# of all folds and preview the first rows.
valid_df = pd.concat(all_valid_df)
valid_df.head()

## Accuracy

In [None]:
# Confusion matrices over all validation samples: raw counts (C) and
# normalised over the true classes (Cn).
true_labels = valid_df['label']
pred_labels = valid_df['predicted_label']
C = confusion_matrix(true_labels, pred_labels)
Cn = confusion_matrix(true_labels, pred_labels, normalize='true')

In [None]:
# Heatmap of the raw confusion counts, with true classes on the y-axis.
heatmap_ax = sns.heatmap(C, annot=True, fmt='.0f')
heatmap_ax.set_ylabel('True class')
heatmap_ax.set_xlabel('Predicted class');

In [None]:
# Overall accuracy, plus the mean of the diagonal of the true-normalised
# confusion matrix Cn.
# NOTE(review): the second value is printed under the label 'MAP' — confirm
# that this is the intended name for the mean per-class score.
y_true = valid_df['label'].values
y_pred = valid_df['predicted_label'].values
acc = accuracy_score(y_true, y_pred)
per_class_score = np.diag(Cn)
print(f'Accuracy score: {acc:.4f}')
print(f'MAP:            {per_class_score.mean():.4f}')

In [None]:
# Per-country accuracy and per-class diagonal of the normalised confusion matrix.
country_ids = valid_df['country_id'].unique()

# Pass a fixed class list to confusion_matrix so every country yields exactly
# one value per class, even if a class does not occur in that country's
# samples (otherwise the matrix shrinks and no longer matches `classes`).
class_ids = list(range(7))

plot_dfs = []

for country in country_ids:
    country_df = valid_df.loc[valid_df['country_id'] == country]
    cacc = accuracy_score(country_df['label'].values, country_df['predicted_label'].values)

    # Diagonal of the true-normalised matrix, one entry per class in class_ids.
    cmat = confusion_matrix(country_df['label'], country_df['predicted_label'],
                            labels=class_ids, normalize='true').diagonal()

    plot_dfs.append(pd.DataFrame(dict(country=country, classes=class_ids, cmat=cmat)))

    # NOTE(review): the value labelled 'MCA' is accuracy_score (overall accuracy
    # of this country's samples), not the mean of cmat — confirm the label.
    print(f'Country ID: {country}, MCA = {cacc:.4f}, count = {len(country_df)}')

In [None]:
# Grouped bar chart: per-class diagonal value, one bar colour per country.
plot_df = pd.concat(plot_dfs)
bar_ax = sns.barplot(plot_df, x='classes', y='cmat', hue='country')
bar_ax.legend(ncol=2)
plt.show()

In [None]:
# Accuracy of the validation predictions per fold. 'fold_y' is the suffixed
# fold column that the merge of predictions and split table took from the split table.
# Dedicated names are used so that the `folds` list built during log discovery
# (and zipped with `logdirs` when loading predictions) is not overwritten.
valid_folds = valid_df['fold_y'].unique()
for valid_fold in valid_folds:
    fold_df = valid_df.loc[valid_df['fold_y'] == valid_fold]
    fold_acc = accuracy_score(fold_df['label'].values, fold_df['predicted_label'].values)

    # NOTE(review): the value labelled 'MCA' is accuracy_score — confirm the label.
    print(f'Fold: {valid_fold}, MCA = {fold_acc:.4f}, count = {len(fold_df)}')

## Inspect mis-classified samples

In [None]:
def inspect_misclassified(tclass, pclass, country_id=None):
    '''
    Pick a random misclassified validation sample and display its three images.

    Parameters
    ----------
    tclass : true label to select (matched against valid_df['label']).
    pclass : predicted label to select (matched against valid_df['predicted_label']).
    country_id : optional; when given, only samples with this country_id are considered.

    Reads the module-level ``valid_df``. Prints the number of matching samples
    and the pid / country_id / city_id of the chosen one, then shows a figure
    with the street view, the orthophoto and the Sentinel-2 image.
    Returns None; when no sample matches, nothing is plotted.
    '''

    input_path = "../data/AI4EO-MapYourCity/v1/building-age-dataset/"
    train_path = input_path + "train/data/"

    # Choose a building by pid:
    if country_id is None:
        mismatched_df = valid_df.query('label==@tclass and predicted_label==@pclass')
    else:
        mismatched_df = valid_df.query('label==@tclass and predicted_label==@pclass and country_id==@country_id')
    print(f'{len(mismatched_df)} samples were classified as {pclass} but are {tclass}')

    # np.random.randint(0) raises a ValueError, so stop when nothing matches.
    if mismatched_df.empty:
        return

    pid = mismatched_df['pid'].iloc[np.random.randint(len(mismatched_df))]

    print(valid_df.loc[valid_df['pid'] == pid][['pid', 'country_id', 'city_id']])
    street = plt.imread(f"{train_path}{pid}/street.jpg")
    orthophoto = plt.imread(f"{train_path}{pid}/orthophoto.tif")
    # Close the raster dataset as soon as its bands have been read.
    with rasterio.open(f"{train_path}{pid}/s2_l2a.tif") as src:
        s2 = src.read()
    # Move the band axis last: (bands, rows, cols) -> (rows, cols, bands).
    s2 = np.transpose(s2,[1,2,0])

    # Show the 3 modalities - street view, orthophoto and Sentinel-2

    fig, axs = plt.subplots(figsize=(15, 15), nrows=1, ncols = 3)
    axs = axs.flatten()
    axs[0].imshow(street)
    axs[1].imshow(orthophoto)
    # Band indices 3, 2, 1 are displayed as the colour channels, scaled by 3e-4
    # (presumably red/green/blue reflectance brought into display range — TODO confirm).
    axs[2].imshow(s2[...,[3,2,1]]*3e-4)

    axs[0].set_title("Street")
    axs[1].set_title("Orthophoto")
    axs[2].set_title("Sentinel-2-L2A")

    plt.show()
    

In [None]:
# Define paths to data

# input_path = "directory with MapYourCity image files"
#test_path = input_path + "test/data/"


In [None]:
# Show a random validation sample whose true class is 0 but which was predicted as class 6.
inspect_misclassified(0, 6)

## Merge streetview

In [None]:
# Load the three single-modality submissions; each prediction column is renamed
# after its modality so the tables can be merged side by side.
df1 = pd.read_csv('../submissions/topview_swin_04-19_A.csv').rename(columns={'predicted_label':'topview_label'})
df3 = pd.read_csv('../submissions/sentinel2_patch_swin_04-20_A.csv').rename(columns={'predicted_label':'sentinel2_label'})
df2 = pd.read_csv('../submissions/streetview_swin_04-19_A.csv').rename(columns={'predicted_label':'streetview_label'})
# Table with an 'is_valid_streetview' column, used below as a row mask for df2.
is_valid_streetview = pd.read_csv('valid_streetview.csv')
# Output path of the merged submission.
target = '../submissions/merged_topstreet_swin_04-19_A.csv'

In [None]:
# Keep only the streetview rows flagged as valid. The boolean mask is aligned
# by row index, so this assumes valid_streetview.csv lists its rows in the same
# order as the streetview submission — TODO confirm.
df2 = df2[is_valid_streetview['is_valid_streetview']]

In [None]:
# Outer-join the three tables. No `on` is given, so pandas joins on the column
# names the frames share (presumably just 'pid' — verify). Rows missing from the
# filtered streetview table end up with NaN in 'streetview_label'.
df = pd.merge(df3, pd.merge(df1, df2, 'outer'), 'outer')

In [None]:
# Use the streetview prediction wherever one exists and fall back to the
# topview prediction where it is missing (NaN). When both labels agree the
# choice makes no difference, so this selects the same label per row as a
# row-by-row comparison, without the slow per-row .iloc lookups.
predicted_label = df['streetview_label'].fillna(df['topview_label']).tolist()
        

In [None]:
# Attach the chosen labels as integers, drop the two per-modality columns they
# were chosen from, and write the merged submission.
# NOTE(review): 'sentinel2_label' is not dropped and is therefore written to the
# file as well — confirm this is intended.
final_labels = np.array(predicted_label).astype(int)
df = df.assign(predicted_label=final_labels).drop(columns=['topview_label', 'streetview_label'])
df.to_csv(target, index=False)

In [None]:
# NOTE(review): looks like a scratch cell unrelated to the analysis above.
a = dict(b=3, c=4)

In [None]:
# First value of the dict in insertion order (3 for the dict defined above).
list(a.values())[0]