In [1]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
def MAPE(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def run_all_models(X_train, X_test, y_train, y_test, preds = False):
    # random forest
    rf_regr = RandomForestRegressor(max_depth=10, random_state=random, criterion="mae")
    rf_regr.fit(X_train, y_train)
    rf_y_pred = rf_regr.predict(X_test)
    rf_mape = MAPE(y_test, rf_y_pred)
    #print('Random Forests MAPE: {0}'.format(rf_mape))

    '''
    # adaboost 
    ada_regr = AdaBoostRegressor(random_state=random)
    ada_regr.fit(X_train, y_train)
    ada_y_pred = ada_regr.predict(X_test)
    ada_mape = MAPE(y_test, ada_y_pred)
    #print('AdaBoost MAPE: {0}'.format(MAPE(y_test, y_pred)))
    '''
    
    # xgboost
    xg_regr = XGBRegressor(max_depth=10, learning_rate=0.01, n_estimators=300, gamma=1, random_state=random)
    xg_regr.fit(X_train, y_train)
    xg_y_pred = xg_regr.predict(X_test)
    xg_mape = MAPE(y_test, xg_y_pred)
    #print('XGBoost MAPE: {0}'.format(MAPE(y_test, y_pred)))
    
    # linear regression
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    lin_y_pred = lin_reg.predict(X_test)
    lin_mape = MAPE(y_test, lin_y_pred)
    #print('Linear Regression MAPE: {0}'.format(MAPE(y_test, y_pred)))
    
    if preds: return np.array([rf_mape, xg_mape, lin_mape]), [rf_y_pred, xg_y_pred, lin_y_pred]
    
    return np.array([rf_mape, xg_mape, lin_mape])

In [4]:
_iso_splits = None
def gen_splits(fb_df, variables = [], data_used = False):
    global _iso_splits
    if data_used:
        variables.extend(['data_used_B', 'data_used_C', 'data_used_I', 'data_used_R', 'data_used_BR', 'data_used_CR', 'data_used_IR', 'data_used_CB', 'data_used_CBR'])
        
    total_columns = [col for col in fb_df.columns[2:] if '2019' not in col and '2020' not in col and ('un' in col or 'lvl' in col)]
    predictors = fb_df[variables].values if variables != []\
        else fb_df[total_columns].values
    gt = fb_df['migrant_2019'].values
    
    high_predictors = predictors[fb_df['development_lvl'] == 1]
    high_gt = gt[fb_df['development_lvl'] == 1]
    low_predictors = predictors[fb_df['development_lvl'] == 0]
    low_gt = gt[fb_df['development_lvl'] == 0]
    
    
    if _iso_splits is None:
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        _iso_splits = dict()
        _iso_splits["all"] = list(kf.split(predictors, gt))
        _iso_splits["high"] = list(kf.split(high_predictors, high_gt))
        _iso_splits["low"] = list(kf.split(low_predictors, low_gt))

    splits = dict()
    # 1. Randomly sample from all countries for training and test sets.
    #splits["random_all"] = train_test_split(predictors, gt, test_size=0.2, random_state=42)
    splits["random_all"] = [(predictors[i], predictors[j], gt[i], gt[j]) for i, j in _iso_splits["all"]]
    # 2. Train more developed, test less developed
    # splits["train_high_test_low"] = [(high_predictors, low_predictors, high_gt, low_gt)]
    # 3. Train less developed, test more developed
    # splits["train_low_test_high"] = [(low_predictors, high_predictors, low_gt, high_gt)]
    # 4. Randomly sample high for train+test
    #splits["train_test_high"] = train_test_split(high_predictors, high_gt, test_size=0.2, random_state=42)
    splits["train_test_high"] = [(high_predictors[i], high_predictors[j], high_gt[i], high_gt[j]) for i, j in _iso_splits["high"]]
    # 5. Randomly sample low for train+test
    #splits["train_test_low"] = train_test_split(low_predictors, low_gt, test_size=0.2, random_state=42)
    splits["train_test_low"] = [(low_predictors[i], low_predictors[j], low_gt[i], low_gt[j]) for i, j in _iso_splits["low"]]

    return splits

## Data Setup

In [5]:
# read in UN data
un_df = pd.read_csv('../data/UN_data_clean.csv')
# ground truth data for all of the models
y = np.array((un_df[(un_df['age_group'] == 'Total') & (un_df['sex'] == 'both sexes') & (un_df['year'] == 2019)]\
          ['migrant_pop']))

In [6]:
# read in combined fb_un_data
fb_df = pd.read_csv('../data/facebook_un_combined_2020.csv')
#fb_df_2020 = pd.read_csv('../data/facebook_un_combined_2020.csv')
predictors = fb_df.values [:, 2:176]
gt = fb_df['migrant_2019'].values

In [7]:
fb_un_age_sex = pd.read_csv('../data/FB_UN_age_sex.csv')
fb_un_age_sex = fb_un_age_sex.drop('country_name', axis=1)
fb_un_age_sex['oecd_member'] = fb_un_age_sex['oecd_member'].astype(int)

fb_un_age_sex['data_used_B'] = ['B' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_C'] = ['C' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_I'] = ['I' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_R'] = ['R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_BR'] = ['B' in s and 'R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_CR'] = ['C' in s and 'R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_IR'] = ['I' in s and 'R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_CB'] = ['C' in s and 'B' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_CBR'] = ['C' in s and 'B' in s and 'R' in s for s in fb_un_age_sex['data_used']]

fb_un_age_sex = fb_un_age_sex.drop('data_used', axis=1)

fb_un_age_sex['un_development_lvl'] = [0 if 'Less' in s else 1 for s in fb_un_age_sex['un_development_lvl']]

fb_un_age_sex.loc[fb_un_age_sex['fb_penetration'] > 1, 'fb_penetration'] = 1

fb_un_age_sex = pd.get_dummies(fb_un_age_sex)

fb_un_age_sex['fb_expats_normalized'] = fb_un_age_sex['fb_expats'] / fb_un_age_sex['fb_penetration']

In [8]:
# hyperparameters
depth = 5
random = 0

In [9]:
f = open('model_mapes.csv','w')
writer = csv.writer(f, delimiter=',')
writer.writerow(["model", "split", "rf_mape", "xgboost_mape", "linreg_mape"])

46

### Simple Autoregressive Model

In [10]:
splits = gen_splits(fb_df, ['un_expat_total_age16_2017', 'un_expat_total_age16_2015'])

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_baseline", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[28.81854125 27.16132241 97.60701627]


train_test_high: 
[14.65252326 17.08746768 22.29603912]


train_test_low: 
[25.93411819 23.24958422 64.50992431]




In [11]:
splits = gen_splits(fb_df, ['un_expat_total_age16_2017', 'un_expat_total_age16_2015'], data_used = True)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_baseline_data_used", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[ 28.87273039  27.1547004  117.68283573]


train_test_high: 
[16.00378755 17.08746768 22.59624586]


train_test_low: 
[26.72779936 22.97933473 88.63186461]




### Facebook Naive

In [12]:
splits = gen_splits(fb_df, ['total_expat'])

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["fb_naive", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[225.81663693 219.77116709 310.93996917]


train_test_high: 
[ 58.68774572  58.00751828 206.16529846]


train_test_low: 
[312.08858133 294.21916465 322.3538709 ]




In [13]:
splits = gen_splits(fb_df, ['total_expat'], True)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["fb_naive_data_used", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[219.93641173 190.88214261 616.81200138]


train_test_high: 
[ 59.0498373   57.16827958 190.7422446 ]


train_test_low: 
[301.52314819 262.82614543 374.43698138]




### Autoregressive + Facebook Expats

In [14]:
splits = gen_splits(fb_df, ['un_expat_total_age16_2017', 'un_expat_total_age16_2015', 'total_expat'])

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_plus_fb", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[23.58728849 16.45554028 91.98927695]


train_test_high: 
[15.05807372 17.12983242 22.65209668]


train_test_low: 
[23.52850681 26.09451595 40.53309467]




In [15]:
splits = gen_splits(fb_df, ['un_expat_total_age16_2017', 'un_expat_total_age16_2015', 'total_expat'], True)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_plus_fb_data_used", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[ 23.53418518  16.36774652 115.63878613]


train_test_high: 
[16.27248859 17.12983242 23.4655055 ]


train_test_low: 
[23.8418574  26.17210751 72.47093153]




### Facebook age-sex corrected

### Facebook age-sex corrected (2020)

In [16]:
norm_columns = [col for col in fb_df.columns if 'normalized' in col]
splits = gen_splits(fb_df, norm_columns)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["fb_age_sex_normalized", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[234.9389278  175.23503547 431.72965451]


train_test_high: 
[ 48.16076726  52.69560887 227.43864852]


train_test_low: 
[311.59907899 224.80760422 720.85965534]




In [17]:
norm_columns = [col for col in fb_df.columns if 'normalized' in col]
splits = gen_splits(fb_df, norm_columns, True)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["fb_age_sex_normalized_data_used", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[240.62446341 178.57804372 613.96413065]


train_test_high: 
[ 50.31006458  52.53047149 227.6403614 ]


train_test_low: 
[301.52885955 223.62200578 777.84653244]




### Facebook age-sex corrected with autoregression

### Facebook age-sex corrected with autoregression (2020)

In [18]:
norm_columns = [col for col in fb_df.columns if 'normalized' in col]
norm_columns.extend(['un_expat_total_age16_2017', 'un_expat_total_age16_2015'])
splits = gen_splits(fb_df, norm_columns)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_with_fb_normalized", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[21.88996317 21.75573349 54.84774068]


train_test_high: 
[35.75017061 33.23389708 37.01272483]


train_test_low: 
[ 26.81268978  27.35153645 193.87170796]




In [19]:
norm_columns = [col for col in fb_df.columns if 'normalized' in col]
norm_columns.extend(['un_expat_total_age16_2017', 'un_expat_total_age16_2015'])
splits = gen_splits(fb_df, norm_columns, True)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_with_fb_normalized_data_used", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[22.74021872 22.0960169  70.47186936]


train_test_high: 
[41.14886813 33.23389708 37.01255401]


train_test_low: 
[ 27.0856626   28.0380591  206.48408055]




### All Predictors (2019)

### All Predictors (2020)

In [20]:
splits = gen_splits(fb_df)

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["all_preds", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[25.27726597 24.20104498 81.37025522]


train_test_high: 
[14.8699879  27.65986029 20.6946041 ]


train_test_low: 
[ 35.56576065  38.71852609 155.72790559]




### Age-Sex

In [21]:
country_codes = list(set(fb_un_age_sex['country_code']))
high_dev_isos = list(set(fb_un_age_sex.loc[fb_un_age_sex['un_development_lvl'] == 1, 'country_code']))
low_dev_isos = list(set(fb_un_age_sex.loc[fb_un_age_sex['un_development_lvl'] == 0, 'country_code']))

In [22]:
randall_splits = [([country_codes[i] for i in train_split], [country_codes[i] for i in test_split]) for train_split, test_split in _iso_splits["all"]]
high_dev_splits = [([high_dev_isos[i] for i in train_split], [high_dev_isos[i] for i in test_split]) for train_split, test_split in _iso_splits["high"]]
low_dev_splits = [([low_dev_isos[i] for i in train_split], [low_dev_isos[i] for i in test_split]) for train_split, test_split in _iso_splits["low"]]

iso_splits = {
            "random_all": randall_splits,
            "train_test_high": high_dev_splits,
            "train_test_low": low_dev_splits
         }

In [23]:
for k,v in iso_splits.items(): print(k)

random_all
train_test_high
train_test_low


In [24]:
columns = ['age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9', 'migrant_pop_2015', 'migrant_pop_2017']

In [25]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["autoregression_subgroups", k, *subgroup_mapes])
    writer.writerow(["autoregression_subgroups_added", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [ 23.84646531  27.73974519 214.02390304]
Total:  [ 14.16542601  17.22727041 143.45972152]


train_test_high : 
Subgroups:  [18.53955913 18.48041349 85.95371386]
Total:  [12.1872286  11.15636017 25.04844746]


train_test_low : 
Subgroups:  [ 26.87464501  30.7349897  188.46762754]
Total:  [ 17.15208061  20.36235567 157.2136843 ]




In [26]:
columns = ['fb_expats_normalized', 'age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9', 'migrant_pop_2015', 'migrant_pop_2017']

In [27]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["autoregressive_with_fb_normalized_subgroups", k, *subgroup_mapes])
    writer.writerow(["autoregressive_with_fb_normalized_subgroups_added", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [ 24.96889453  26.91198274 210.54475379]
Total:  [ 15.50060815  17.03493764 139.71896829]


train_test_high : 
Subgroups:  [18.68602279 21.10938774 86.60015382]
Total:  [12.55085863 15.20704026 25.61836095]


train_test_low : 
Subgroups:  [ 29.3199535   29.17281419 188.41866479]
Total:  [ 19.00385318  19.00404275 154.49772212]




In [28]:
columns = ['fb_expats_normalized', 'age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9']

In [29]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["fb_normalized_subgroups", k, *subgroup_mapes])
    writer.writerow(["fb_normalized_subgroups_added", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [ 373.51275028  463.23934765 3160.88705568]
Total:  [ 295.75716344  367.32333029 2481.84525019]


train_test_high : 
Subgroups:  [114.54023859  98.82990594 597.64629826]
Total:  [ 93.14157633  76.00907462 375.16117015]


train_test_low : 
Subgroups:  [ 328.7852281   425.53101723 2363.54796287]
Total:  [ 264.02568337  347.29777262 2092.26336991]




In [30]:
columns = ['migrant_pop_2015', 'migrant_pop_2017', 'fb_expats_normalized',
       'data_used_B', 'data_used_C',
       'data_used_I', 'data_used_R', 'data_used_BR', 'data_used_CR',
       'data_used_IR', 'data_used_CB', 'data_used_CBR', 'age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9']


In [31]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["autoregressive_with_fb_normalized_subgroups_data_used", k, *subgroup_mapes])
    writer.writerow(["autoregressive_with_fb_normalized_subgroups_added_data_used", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [ 25.1587439   28.74550864 209.23620865]
Total:  [ 15.77712015  18.50327241 134.68252768]


train_test_high : 
Subgroups:  [18.89598242 19.09915924 89.80350553]
Total:  [12.84891579 12.10171122 28.92315976]


train_test_low : 
Subgroups:  [ 29.08310521  29.10354953 193.63471778]
Total:  [ 18.95484481  19.17816337 143.65886206]




In [32]:
columns = ['age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9', 'migrant_pop_2015', 'migrant_pop_2017',
       'data_used_B', 'data_used_C',
       'data_used_I', 'data_used_R', 'data_used_BR', 'data_used_CR',
       'data_used_IR', 'data_used_CB']

In [33]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["fb_normalized_subgroups_data_used", k, *subgroup_mapes])
    writer.writerow(["fb_normalized_subgroups_added_data_used", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [ 23.82425986  28.34425758 210.54425979]
Total:  [ 14.18777658  17.65058566 135.97747175]


train_test_high : 
Subgroups:  [18.52820561 18.61603057 89.66815219]
Total:  [12.19977856 11.30994261 28.90613709]


train_test_low : 
Subgroups:  [ 26.94155429  30.16340832 188.7684089 ]
Total:  [ 17.37240163  19.48405525 137.26570633]




In [34]:
columns = ['age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9', 'migrant_pop_2015', 'migrant_pop_2017',
       'data_used_B', 'data_used_C',
       'data_used_I', 'data_used_R', 'data_used_BR', 'data_used_CR',
       'data_used_IR', 'data_used_CB']

In [35]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["autoregression_subgroups_data_used", k, *subgroup_mapes])
    writer.writerow(["autoregression_subgroups_added_data_used", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [ 23.82425986  28.34425758 210.54425979]
Total:  [ 14.18777658  17.65058566 135.97747175]


train_test_high : 
Subgroups:  [18.52820561 18.61603057 89.66815219]
Total:  [12.19977856 11.30994261 28.90613709]


train_test_low : 
Subgroups:  [ 26.94155429  30.16340832 188.7684089 ]
Total:  [ 17.37240163  19.48405525 137.26570633]




In [36]:
f.close()