In [1]:
import pandas as pd

In [2]:
# Read the combined training-metrics CSV from S3 into a DataFrame
combined_results_uri = "s3://w210-poverty-mapper/modeling/results/combined_results.csv"
metrics = pd.read_csv(combined_results_uri)

In [3]:
# Preview the first rows to sanity-check the load
metrics.head()

Unnamed: 0,split_name,num_classes,bin_method,pretrained,freeze_layers,epochs,learning_rate,gamma,step_size,batch_size,...,val_precision,val_recall,val_f1,test_precision,test_recall,test_f1,split,split_group,max_val_f1,top_n_max_val_f1
0,leave_one_out_bangladesh_10k_1d,5,across,resnet18,no,10,0.0001,0.1,10,448,...,,,,,,,leave_one_out_bangladesh,leave_one_out,,0
1,leave_one_out_bangladesh_10k_1d,5,across,resnet18,no,10,0.0001,0.1,10,448,...,,,,,,,leave_one_out_bangladesh,leave_one_out,,0
2,leave_one_out_bangladesh_10k_1d,5,across,resnet18,no,10,0.0001,0.1,10,448,...,,,,,,,leave_one_out_bangladesh,leave_one_out,,0
3,leave_one_out_bangladesh_10k_1d,5,across,resnet18,no,10,0.0001,0.1,10,448,...,,,,,,,leave_one_out_bangladesh,leave_one_out,,0
4,leave_one_out_bangladesh_10k_1d,5,across,resnet18,no,10,0.0001,0.1,10,448,...,,,,,,,leave_one_out_bangladesh,leave_one_out,,0


In [4]:
# Derive a "country" column by deleting the split-type text from "split".
# The patterns are unanchored, so they are removed wherever they occur
# (e.g. "south_central_leave_one_out_v3" becomes "south_central_v3").
split_type_patterns = ["within_country_", "leave_one_out_"]
metrics["country"] = metrics["split"].replace(regex=split_type_patterns, value="")

In [5]:
# Show the distinct values of the derived "country" column
pd.unique(metrics["country"])

array(['bangladesh', 'nepal', 'philippines', 'tajikistan', 'timor_leste',
       'south_central_cross_country_v1', 'south_central_cross_country_v2',
       'south_central_cross_country_v3', 'south_central_v1',
       'south_central_v2', 'south_central_v3', 'south_central_v4',
       'southeast_cross_country_v1', 'southeast_v1'], dtype=object)

In [6]:
# Add data threshold column: the last two "_"-separated tokens of split_name
# (e.g. "leave_one_out_bangladesh_5k_50d" -> "5k_50d").
# Vectorised .str accessors replace the row-wise apply; a missing split_name
# stays NaN instead of raising, and an empty frame is handled cleanly.
metrics["data_thresholds"] = (
    metrics["split_name"].str.rsplit("_", n=2).str[-2:].str.join("_")
)

In [7]:
# Show the distinct data-threshold labels
pd.unique(metrics["data_thresholds"])

array(['10k_1d', '10k_50d', '5k_50d'], dtype=object)

In [8]:
# Show the distinct split groups
pd.unique(metrics["split_group"])

array(['leave_one_out', 'similar geography', 'within_country'],
      dtype=object)

In [9]:
# Restrict to the rows of interest: num_classes == 2, the "5k_50d" data
# thresholds, freeze_layers == "no", and the two "*_even" bin methods
is_two_class = metrics["num_classes"] == 2
is_5k_50d = metrics["data_thresholds"] == "5k_50d"
is_not_frozen = metrics["freeze_layers"] == "no"
is_even_binned = metrics["bin_method"].isin(["within_even", "across_even"])
subset = metrics.loc[is_two_class & is_5k_50d & is_not_frozen & is_even_binned]

In [10]:
# Show the distinct bin methods remaining after filtering
pd.unique(subset["bin_method"])

array(['across_even', 'within_even'], dtype=object)

In [11]:
# Highest validation F1 for every (split, id) pair, stored as "max_val_f1_final"
grouped = (
    subset
    .groupby(["split", "id"])["val_f1"]
    .max()
    .reset_index()
    .rename(columns={"val_f1": "max_val_f1_final"})
)

def get_top_n(col, in_name, out_name, k):
    """Flag the rows whose ``in_name`` value is among the ``k`` largest.

    Args:
        col: DataFrame to rank (despite the name, a frame, not a single column).
        in_name: Name of the numeric column to rank by.
        out_name: Name of the 0/1 indicator column to add.
        k: Number of top values to flag. If ``k`` exceeds the number of
            non-missing values, every non-missing row is flagged. ``k < 1``
            flags nothing.

    Returns:
        A copy of ``col`` with ``out_name`` added: 1 where the value is greater
        than or equal to the k-th largest non-missing value, else 0. Ties with
        the threshold are all flagged, so more than ``k`` rows may be 1.
        Rows with a missing value are always 0. The input frame is not modified.
    """
    scores = col[in_name]
    # Missing values are excluded from the ranking so they cannot corrupt the
    # sort order or become the threshold.
    ranked = scores.dropna().sort_values(ascending=False)

    # Work on a copy: mutating a group handed out by groupby.apply is unsupported.
    out = col.copy()
    if ranked.empty or k < 1:
        out[out_name] = 0
        return out

    thresh = ranked.iloc[min(k, len(ranked)) - 1]
    # NaN >= thresh evaluates to False, so missing scores are flagged 0.
    out[out_name] = (scores >= thresh).astype(int)
    return out

# Flag, within each split, the (split, id) rows whose max_val_f1_final is the
# highest for that split (k=1; get_top_n compares with >=, so ties are all flagged).
# Groups are iterated explicitly rather than via groupby(...).apply(...): the
# apply form is version-dependent (group keys may be added to the index and the
# grouping column may be excluded), which can break the merge on "split" below.
# Each group is copied so get_top_n never writes into a groupby slice.
flagged_groups = [
    get_top_n(split_rows.copy(), "max_val_f1_final", "best_val_f1", 1)
    for _, split_rows in grouped.groupby("split")
]
# pd.concat raises if there are no groups, i.e. the filtered subset is empty.
grouped = pd.concat(flagged_groups)

# Attach the per-(split, id) maximum and the best-run flag to every row of subset
joined = subset.merge(grouped, on=["split", "id"], how="outer")

In [12]:
# Show the distinct "country" values among rows flagged as best validation F1
joined.loc[joined["best_val_f1"] == 1, "country"].unique()

array(['bangladesh', 'nepal', 'philippines', 'timor_leste',
       'south_central_cross_country_v1', 'south_central_cross_country_v2',
       'south_central_cross_country_v3', 'south_central_v3',
       'south_central_v4', 'southeast_cross_country_v1', 'southeast_v1',
       'tajikistan'], dtype=object)

In [13]:
# Show the distinct ids among rows flagged as best validation F1
joined.loc[joined["best_val_f1"] == 1, "id"].unique()

array([ 44, 125, 210, 347, 391, 411, 413, 468, 470, 491, 510, 569, 671,
       771, 861, 953])

In [14]:
# Keep only the rows flagged as best validation F1
is_best_val_f1 = joined["best_val_f1"] == 1
joined_subset = joined[is_best_val_f1]

In [15]:
# Order rows by id, then by epoch within each id
joined_subset = joined_subset.sort_values(["id", "epoch"])

In [16]:
# Collapse each id to one row (an id has several rows, one per epoch).
# After the preceding sort by id and epoch, this keeps each id's lowest-epoch row.
# NOTE(review): any per-epoch columns in the kept row therefore come from that
# first epoch, not necessarily the epoch that reached max_val_f1_final — confirm
# this is intended.
joined_subset = joined_subset.drop_duplicates(subset="id", keep="first")

In [17]:
# Display the selected rows (one per id after de-duplication)
joined_subset

Unnamed: 0,split_name,num_classes,bin_method,pretrained,freeze_layers,epochs,learning_rate,gamma,step_size,batch_size,...,test_recall,test_f1,split,split_group,max_val_f1,top_n_max_val_f1,country,data_thresholds,max_val_f1_final,best_val_f1
70,leave_one_out_bangladesh_5k_50d,2,across_even,resnet50,no,30,0.001,0.1,10,64,...,0.083945,0.147262,leave_one_out_bangladesh,leave_one_out,0.603927,1,bangladesh,5k_50d,0.603927,1
100,leave_one_out_nepal_5k_50d,2,across_even,resnet50,no,20,0.0001,0.1,10,64,...,0.290323,0.289474,leave_one_out_nepal,leave_one_out,0.32816,1,nepal,5k_50d,0.32816,1
220,leave_one_out_philippines_5k_50d,2,across_even,resnet50,no,20,0.001,0.1,10,64,...,0.505376,0.106155,leave_one_out_philippines,leave_one_out,0.15122,1,philippines,5k_50d,0.15122,1
440,leave_one_out_timor_leste_5k_50d,2,across_even,resnet50,no,30,0.0001,0.1,10,64,...,0.062189,0.108225,leave_one_out_timor_leste,leave_one_out,0.486369,1,timor_leste,5k_50d,0.486369,1
540,south_central_cross_country_v1_5k_50d,2,across_even,resnet50,no,30,0.0001,0.1,10,64,...,0.4,0.473684,south_central_cross_country_v1,similar geography,0.542373,1,south_central_cross_country_v1,5k_50d,0.542373,1
640,south_central_cross_country_v2_5k_50d,2,across_even,resnet50,no,30,0.0001,0.1,10,64,...,0.729483,0.735069,south_central_cross_country_v2,similar geography,0.741176,1,south_central_cross_country_v2,5k_50d,0.741176,1
700,south_central_cross_country_v3_5k_50d,2,across_even,resnet50,no,20,0.0001,0.1,10,64,...,0.702985,0.734217,south_central_cross_country_v3,similar geography,0.737389,1,south_central_cross_country_v3,5k_50d,0.737389,1
870,south_central_leave_one_out_v3_5k_50d,2,across_even,resnet50,no,30,0.001,0.1,10,64,...,0.441935,0.253939,south_central_leave_one_out_v3,similar geography,0.308571,1,south_central_v3,5k_50d,0.308571,1
920,south_central_leave_one_out_v4_5k_50d,2,across_even,resnet50,no,20,0.001,0.1,10,64,...,0.590625,0.266197,south_central_leave_one_out_v4,similar geography,0.284091,1,south_central_v4,5k_50d,0.284091,1
1040,southeast_cross_country_v1_5k_50d,2,across_even,resnet50,no,30,0.0001,0.1,10,64,...,0.45122,0.512111,southeast_cross_country_v1,similar geography,0.573134,1,southeast_cross_country_v1,5k_50d,0.573134,1


In [18]:
# Write the top-validation-F1 rows to S3 as CSV, omitting the DataFrame index
top_val_f1_uri = "s3://w210-poverty-mapper/modeling/results/combined_results_top_val_f1.csv"
joined_subset.to_csv(top_val_f1_uri, index=False)