In [80]:
import pandas as pd

version = 'v2.4'
# Read the feature table for this data version and convert its 'date' column to datetime.
cm_features = (
    pd.read_csv(f'../data/cm_features_{version}.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
# Distinct (country_id, ccode) pairs, used later to attach ccode to frames keyed by country_id.
country_id_to_ccode = cm_features.loc[:, ['country_id', 'ccode']].drop_duplicates()
cm_features

Unnamed: 0,month_id,country_id,gleditsch_ward,ged_sb,ged_ns,ged_os,acled_sb,acled_sb_count,acled_os,ged_sb_tsum_24,...,region23_Northern America,region23_Northern Europe,region23_South America,region23_South-Eastern Asia,region23_Southern Africa,region23_Southern Asia,region23_Southern Europe,region23_Western Africa,region23_Western Asia,region23_Western Europe
0,121,1,110,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,121,2,115,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,121,3,52,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,121,4,101,0,0,12,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,121,7,160,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68273,532,242,510,0,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
68274,532,243,600,0,0,0,4,1,0,13,...,0,0,0,0,0,0,0,0,0,0
68275,532,244,435,0,0,0,0,0,0,6,...,0,0,0,0,0,0,0,1,0,0
68276,532,245,625,116,4,107,350,195,309,5762,...,0,0,0,0,0,0,0,0,0,0


In [81]:
import numpy as np

# Build one training CSV per prediction year. For each year the feature table is cut off
# at 1 October of the preceding year, extended with two placeholder months and the twelve
# months of actuals, and every feature row is paired with the `ged_sb` value observed
# `prediction_window` months later (column `ged_sb_<prediction_window>`).
# TODO: Figure out partial creation for 2024
prediction_years = [2018, 2019, 2020, 2021, 2022, 2023]
# Number of months between a feature row and the outcome month it is paired with.
prediction_window = 14
column_name = f'ged_sb_{prediction_window}'
for prediction_year in prediction_years:
    print(f'Prediction year: {prediction_year}')
    # Keep only feature rows dated on or before 1 October of the year preceding the prediction year.
    features_to_oct = pd.Timestamp(year=prediction_year - 1, month=10, day=1)
    cm_features_year = cm_features[cm_features['date'] <= features_to_oct]
    # get last month_id
    last_month_id = cm_features_year['month_id'].max()
    print(f'Last month_id: {last_month_id}')
    last_month_cm_features = cm_features_year[cm_features_year['month_id'] == last_month_id]
    # Create a two-month buffer: copies of the last feature month, re-labelled as the next
    # two month_ids, with the outcome blanked out. They bridge the months between the last
    # feature month and the first month of actuals.
    two_month_buffer_features = pd.concat([
        last_month_cm_features.assign(month_id=last_month_id + counter, ged_sb=np.nan)
        for counter in range(1, 3)
    ])
    buffer_month_ids = two_month_buffer_features['month_id'].unique()
    print(f"two_month_buffer_features: {buffer_month_ids}")
    # read actuals for this year; 'outcome' is renamed to ged_sb and the index is turned into columns
    actuals_year = (
        pd.read_parquet(f'../actuals/cm/window=Y{prediction_year}/cm_actuals_{prediction_year}.parquet')
        .rename(columns={'outcome': 'ged_sb'})
        .reset_index(drop=False)
    )
    print(f"actuals_year: {actuals_year['month_id'].unique()}")
    # add ccode column to actuals_year and drop rows whose country_id has no ccode
    actuals_year = actuals_year.merge(country_id_to_ccode, on='country_id', how='left')
    actuals_year = actuals_year[actuals_year['ccode'].notnull()]
    actuals_month_ids = actuals_year['month_id'].unique()

    # A row at month m takes its target from month m + prediction_window, so the rows whose
    # target month is one of the buffer months (outcome NaN) are expected to have no target.
    # Derived from prediction_window (previously hard-coded as "- 11 - 3", i.e. 14).
    _gap_months = buffer_month_ids - prediction_window
    test_set_months_min = last_month_id - 11
    test_set_months_max = last_month_id
    print(f"_gap_months: expected empty months because of the gap: {_gap_months}")
    print(f"test set is from {test_set_months_min} to {test_set_months_max}")
    print(f"two month buffer months: {buffer_month_ids}")

    cm_features_year = pd.concat(
        [cm_features_year, two_month_buffer_features, actuals_year],
        ignore_index=True,
    )

    # The group-wise shift is positional, so compute it on a month_id-ordered view; the
    # result is aligned back on the (unique) index, leaving the frame's row order untouched.
    # NOTE(review): this still assumes every ccode has one row per consecutive month_id;
    # the assertion below fails if that leaves unexpected missing targets.
    cm_features_year[column_name] = (
        cm_features_year
        .sort_values('month_id', kind='stable')
        .groupby('ccode')['ged_sb']
        .shift(-prediction_window)
    )
    # drop the rows that were appended only to supply targets: actuals months ...
    cm_features_year = cm_features_year[~cm_features_year['month_id'].isin(actuals_month_ids)]
    # ... and the two buffer months
    cm_features_year = cm_features_year[~cm_features_year['month_id'].isin(buffer_month_ids)]

    month_ids_is_null = cm_features_year.loc[cm_features_year[column_name].isnull(), 'month_id'].unique()
    print("month_ids_is_null: ", month_ids_is_null)
    # Order-insensitive, shape-safe comparison: arrays of different length compare as
    # "not equal" instead of raising from the element-wise `==`.
    assert np.array_equal(np.sort(_gap_months), np.sort(month_ids_is_null)), "Unexpected missing months"

    # drop gap months
    cm_features_year = cm_features_year[~cm_features_year['month_id'].isin(_gap_months)]

    cm_features_year.to_csv(f'../data/cm_features_{version}_Y{prediction_year}.csv', index=False)

print("All done!")


Prediction year: 2018
Last month_id: 454
two_month_buffer_features: [455 456]
actuals_year: [457 458 459 460 461 462 463 464 465 466 467 468]
_gap_months: expected empty months because of the gap: [441 442]
test set is from 443 to 454
two month buffer months: [455 456]
month_ids_is_null:  [441 442]
Prediction year: 2019
Last month_id: 466
two_month_buffer_features: [467 468]
actuals_year: [469 470 471 472 473 474 475 476 477 478 479 480]
_gap_months: expected empty months because of the gap: [453 454]
test set is from 455 to 466
two month buffer months: [467 468]
month_ids_is_null:  [453 454]
Prediction year: 2020
Last month_id: 478
two_month_buffer_features: [479 480]
actuals_year: [481 482 483 484 485 486 487 488 489 490 491 492]
_gap_months: expected empty months because of the gap: [465 466]
test set is from 467 to 478
two month buffer months: [479 480]
month_ids_is_null:  [465 466]
Prediction year: 2021
Last month_id: 490
two_month_buffer_features: [491 492]
actuals_year: [493 494

Unnamed: 0,month_id,country_id,gleditsch_ward,ged_sb,ged_ns,ged_os,acled_sb,acled_sb_count,acled_os,ged_sb_tsum_24,...,region23_Northern Europe,region23_South America,region23_South-Eastern Asia,region23_Southern Africa,region23_Southern Asia,region23_Southern Europe,region23_Western Africa,region23_Western Asia,region23_Western Europe,ged_sb_14
0,121,1,110.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,121,2,115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,121,3,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,121,4,101.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,121,7,160.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65231,514,242,510.0,2.0,0.0,0.0,0.0,0.0,2.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
65232,514,243,600.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
65233,514,244,435.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
65234,514,245,625.0,21.0,275.0,0.0,14.0,2.0,38.0,59.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,364.0
