In [1]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error between two value sequences.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; each one is the denominator of its own
        relative error.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    Both inputs are converted to float arrays first, so plain lists and
    tuples are accepted and values are compared position by position.
    A zero in ``y_true`` makes its term ``inf`` or ``nan`` (numpy warns
    instead of raising), so the returned mean is not finite in that case.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [3]:
def _rf_mae_criterion():
    """Return the mean-absolute-error criterion name for the installed scikit-learn.

    scikit-learn 1.0 renamed the tree criterion "mae" to "absolute_error"
    and 1.2 removed the old spelling, so the name is picked by version.
    """
    import re

    import sklearn

    parsed = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
    # An unparseable version string is treated as a current release.
    version = tuple(int(part) for part in parsed.groups()) if parsed else (1, 0)
    return "absolute_error" if version >= (1, 0) else "mae"


def run_all_models(X_train, X_test, y_train, y_test):
    """Fit four regressors on one train/test split and score each with MAPE.

    The models are, in order: random forest, AdaBoost, XGBoost and ordinary
    linear regression. The random forest depth and the seed of the three
    seeded models come from the module-level ``depth`` and ``random``
    variables, which must be defined before this function is called.

    Parameters
    ----------
    X_train, X_test : array-like
        Predictor matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Targets matching ``X_train`` and ``X_test``.

    Returns
    -------
    numpy.ndarray
        ``[rf_mape, ada_mape, xg_mape, lin_mape]`` computed on the test set.
    """
    # Order matters: callers write these scores into fixed CSV columns.
    models = [
        RandomForestRegressor(
            max_depth=depth,
            random_state=random,
            criterion=_rf_mae_criterion(),
        ),
        AdaBoostRegressor(random_state=random),
        XGBRegressor(random_state=random),
        LinearRegression(),
    ]

    scores = []
    for model in models:
        model.fit(X_train, y_train)
        scores.append(MAPE(y_test, model.predict(X_test)))

    return np.array(scores)

In [4]:
def gen_splits(fb_df, variables=None):
    """Build the train/test partitions used by every experiment.

    Parameters
    ----------
    fb_df : pandas.DataFrame
        Must contain ``'migrant_2019'`` (the regression target) and
        ``'development_lvl'`` (1 marks the "high" group, 0 the "low" group)
        in addition to the predictor columns.
    variables : str or iterable of str, optional
        Predictor column name(s). When omitted or empty, every column from
        position 2 onward whose name does not contain ``'2019'`` is used.

    Returns
    -------
    dict
        Maps a split name to a list of ``(X_train, X_test, y_train, y_test)``
        tuples. ``"random_all"``, ``"train_test_high"`` and
        ``"train_test_low"`` hold five shuffled K-fold partitions each;
        ``"train_high_test_low"`` and ``"train_low_test_high"`` hold a single
        partition that trains on one development group and tests on the other.
    """
    # Normalize the selection so None, a bare string, a tuple or a pandas
    # Index all behave like the equivalent list of column names.
    if variables is None:
        selected = []
    elif isinstance(variables, str):
        selected = [variables]
    else:
        selected = list(variables)
    if not selected:
        selected = [col for col in fb_df.columns[2:] if '2019' not in col]

    predictors = fb_df[selected].values
    gt = fb_df['migrant_2019'].values

    is_high = (fb_df['development_lvl'] == 1).values
    is_low = (fb_df['development_lvl'] == 0).values
    high_predictors, high_gt = predictors[is_high], gt[is_high]
    low_predictors, low_gt = predictors[is_low], gt[is_low]

    # Fixed seed: the same shuffle is reproduced on every call.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    def k_fold(features, target):
        """Return one (X_train, X_test, y_train, y_test) tuple per fold."""
        return [
            (features[train], features[test], target[train], target[test])
            for train, test in kf.split(features, target)
        ]

    # Insertion order is kept stable because callers iterate over the dict.
    return {
        # 1. Randomly sample from all countries for training and test sets.
        "random_all": k_fold(predictors, gt),
        # 2. Train more developed, test less developed.
        "train_high_test_low": [(high_predictors, low_predictors, high_gt, low_gt)],
        # 3. Train less developed, test more developed.
        "train_low_test_high": [(low_predictors, high_predictors, low_gt, high_gt)],
        # 4. Randomly sample high for train+test.
        "train_test_high": k_fold(high_predictors, high_gt),
        # 5. Randomly sample low for train+test.
        "train_test_low": k_fold(low_predictors, low_gt),
    }

## Data Setup

In [5]:
# Load the cleaned UN table.
un_df = pd.read_csv('../data/UN_data_clean.csv')

# Ground truth for all of the models: the 'migrant_pop' values of the rows
# for 2019, all ages ('Total') and both sexes.
is_total_2019 = (
    (un_df['age_group'] == 'Total')
    & (un_df['sex'] == 'both sexes')
    & (un_df['year'] == 2019)
)
y = np.array(un_df[is_total_2019]['migrant_pop'])

In [6]:
# Load the combined Facebook + UN tables (one file for 2019, one for 2020).
fb_df = pd.read_csv('../data/facebook_un_combined_2019.csv')
fb_df_2020 = pd.read_csv('../data/facebook_un_combined_2020.csv')

# Target vector: the 'migrant_2019' column of the 2019 table.
gt = fb_df['migrant_2019'].values

# Predictor matrix: columns at positions 2 through 175 of the 2019 table.
fb_values = fb_df.values
predictors = fb_values[:, 2:176]

In [7]:
# Hyperparameters shared by the models in run_all_models:
#   depth  -> max_depth of the random forest
#   random -> random_state passed to the seeded regressors
depth, random = 5, 0

In [8]:
# Results file: one row per (model family, split) with the four model MAPEs.
# The handle stays open across the experiment cells and is closed by the
# final clean-up cell.
# newline='' is what the csv module requires of files handed to csv.writer;
# without it, platforms that translate "\n" write "\r\r\n" line endings.
f = open('model_mapes.csv', 'w', newline='')
writer = csv.writer(f, delimiter=',')
writer.writerow(["model", "split", "rf_mape", "adaboost_mape", "xgboost_mape", "linreg_mape"])

60

### Simple Autoregressive Model

In [9]:
# Autoregressive baseline: the only predictors are the two earlier UN columns.
splits = gen_splits(fb_df, ['un_expat_total_age16_2017', 'un_expat_total_age16_2015'])

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["autoregressive_baseline", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[ 42.02446425 854.60815739  32.44764284  97.60701627]


train_high_test_low: 
[ 187.43006888 1686.81040411   86.74464025  156.33946143]


train_low_test_high: 
[21.11290759 60.2285136  33.14075178 10.59468046]


train_test_high: 
[18.3662049  97.82886392 15.78447345 22.29603912]


train_test_low: 
[ 30.90469672 472.28408286  27.39933143  64.50992431]




### Facebook Naive

In [10]:
# Facebook naive: the single predictor is the 'total_expat' column.
splits = gen_splits(fb_df, ['total_expat'])

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["fb_naive", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[207.95859728 708.25973069 252.20157076 310.93996917]


train_high_test_low: 
[ 611.11685316 1400.85947611  684.74336801 1761.73752735]


train_low_test_high: 
[56.59768649 61.56077997 83.58140511 54.76032061]


train_test_high: 
[ 55.97299008  98.13817427  69.15245206 206.16529846]


train_test_low: 
[264.90621803 623.53828116 342.11459058 322.3538709 ]




### Autoregressive + Facebook Expats

In [11]:
# Autoregressive + Facebook: the two earlier UN columns plus 'total_expat'.
splits = gen_splits(fb_df, ['un_expat_total_age16_2017', 'un_expat_total_age16_2015', 'total_expat'])

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["autoregressive_plus_fb", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[ 49.06118056 774.16988565  19.97254989  91.98927695]


train_high_test_low: 
[ 230.39012333 1522.75187341   78.14617697  162.46622932]


train_low_test_high: 
[17.96135422 54.11590006 16.08311878  7.74661822]


train_test_high: 
[ 18.22736018 133.58336877  15.24838657  22.65209668]


train_test_low: 
[ 39.99946132 570.27093344  27.18229063  40.53309467]




### Facebook age-sex corrected

In [12]:
# Predictors: every column of the 2019 table whose name contains 'normalized'.
norm_columns = [name for name in fb_df.columns if 'normalized' in name]
splits = gen_splits(fb_df, norm_columns)

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["fb_age_sex_normalized", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[ 218.99879811 2648.21488505  216.19787864  498.55632602]


train_high_test_low: 
[1394.32611047 3225.34415569 1927.20981416 3055.61785047]


train_low_test_high: 
[50.53611554 72.06935688 80.6463588  66.68132532]


train_test_high: 
[ 48.0154907  129.25741736  49.50113586 280.8595249 ]


train_test_low: 
[285.64083237 742.97575176 233.89595711 658.06482297]




### Facebook age-sex corrected (2020)

In [13]:
# Predictors: every column of the 2020 table whose name contains 'normalized'.
norm_columns = [name for name in fb_df_2020.columns if 'normalized' in name]
splits = gen_splits(fb_df_2020, norm_columns)

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["fb_age_sex_normalized_2020", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[ 217.30635944 1707.73629931  230.12950369  431.72965451]


train_high_test_low: 
[1418.03008617 3224.06931642 1854.14464967 3444.96737348]


train_low_test_high: 
[50.92767942 90.93367548 83.83098209 66.96094387]


train_test_high: 
[ 48.7604281  139.072057    51.61337101 227.43864852]


train_test_low: 
[292.34746673 712.38578059 276.44610001 720.85965534]




### Facebook age-sex corrected with autoregression

In [14]:
# Predictors: the 'normalized' columns of the 2019 table followed by the two
# earlier UN columns.
norm_columns = [name for name in fb_df.columns if 'normalized' in name] + [
    'un_expat_total_age16_2017',
    'un_expat_total_age16_2015',
]
splits = gen_splits(fb_df, norm_columns)

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["autoregressive_with_fb_normalized", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[  48.39706425 1070.91661223   31.76776737   59.34510741]


train_high_test_low: 
[ 819.14498681 1695.50697602 1498.60469292 1185.74316316]


train_low_test_high: 
[13.40422618 85.97478079 13.61356441 14.65745128]


train_test_high: 
[ 35.1268161  157.46835003  31.98382651  39.04414446]


train_test_low: 
[ 40.95339784 662.44981791  35.35438634 173.44526699]




### Facebook age-sex corrected with autoregression (2020)

In [15]:
# Predictors: the 'normalized' columns of the 2020 table followed by the two
# earlier UN columns.
norm_columns = [name for name in fb_df_2020.columns if 'normalized' in name] + [
    'un_expat_total_age16_2017',
    'un_expat_total_age16_2015',
]
splits = gen_splits(fb_df_2020, norm_columns)

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["autoregressive_with_fb_normalized_2020", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[ 48.69446258 991.07552476  29.99699642  54.84774068]


train_high_test_low: 
[ 852.38650511 1991.22431395 1518.66333758  921.71990486]


train_low_test_high: 
[13.50376307 63.05374468 15.4051986  13.4105808 ]


train_test_high: 
[ 36.01973524 161.13485698  35.47912186  37.01272483]


train_test_low: 
[ 40.99730236 580.97283807  36.67364529 193.87170796]




### All Predictors (2019)

In [16]:
# All predictors: no column list is passed, so gen_splits falls back to its
# default column selection.
splits = gen_splits(fb_df)

for split_name, folds in splits.items():
    print(split_name + ": ")
    # Mean of the four per-model MAPEs over every fold of this split.
    fold_scores = [run_all_models(*fold) for fold in folds]
    mapes = np.mean(fold_scores, axis=0)
    writer.writerow(["all_preds", split_name] + list(mapes))
    print(mapes)
    print("\n")

random_all: 
[ 47.77605527 924.44773694  26.65698106 246.53271424]


train_high_test_low: 
[ 493.38596882 1239.38893074  718.57208496  161.13382044]


train_low_test_high: 
[21.56126208 76.77237524 16.39759357 68.19708567]


train_test_high: 
[ 27.37924137 142.63685787  35.41667761  18.85839753]


train_test_low: 
[ 39.47129843 642.37249416  37.29538285 117.72098743]




### Final Clean-up

In [17]:
# Close the results CSV handle opened earlier in the notebook; this flushes
# any rows still buffered by the file object.
f.close()