In [1]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import csv
import pandas as pd
import numpy as np

In [2]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Args:
        y_true: Array-like of observed values (list, tuple, ndarray or
            pandas Series).
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Note:
        Entries where ``y_true`` is 0 are not special-cased; they produce
        inf/nan exactly as ordinary NumPy division does.
    """
    # Coerce to ndarrays so plain Python sequences work and pandas Series are
    # compared position-by-position instead of being aligned on index labels.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def run_all_models(X_train, X_test, y_train, y_test):
    """Fit four regressors on the training data and report their test MAPE.

    The models are fitted and scored in a fixed order: random forest,
    AdaBoost, XGBoost, linear regression. One line per model is printed as
    it is scored.

    Reads the notebook-level names ``depth`` (random-forest ``max_depth``),
    ``random`` (``random_state`` for the three seeded models) and ``MAPE``;
    all of them must be defined before this function is called.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training targets.
        y_test: Test targets, passed to ``MAPE`` as the true values.

    Returns:
        Tuple ``(rf_mape, ada_mape, xg_mape, lin_mape)``.
    """
    # NOTE(review): scikit-learn 1.0 renamed criterion "mae" to
    # "absolute_error" and 1.2 dropped the old spelling; adjust this when the
    # environment is upgraded.
    models = (
        ('Random Forests',
         RandomForestRegressor(max_depth=depth, random_state=random, criterion="mae")),
        ('AdaBoost', AdaBoostRegressor(random_state=random)),
        ('XGBoost', XGBRegressor(random_state=random)),
        ('Linear Regression', LinearRegression()),
    )

    mapes = []
    for label, model in models:
        model.fit(X_train, y_train)
        # Score once and reuse the value for both the printout and the result.
        score = MAPE(y_test, model.predict(X_test))
        print('{0} MAPE: {1}'.format(label, score))
        mapes.append(score)

    return tuple(mapes)

In [4]:
def gen_splits(fb_df, variables=None):
    """Build the five train/test splits used by every experiment.

    Args:
        fb_df: DataFrame holding the predictor columns plus the
            ``'migrant_2019'`` and ``'development_lvl'`` columns.
        variables: Column names to use as predictors. ``None`` or any empty
            sequence selects every column from the third onward whose name
            does not contain ``'2019'``.

    Returns:
        Dict mapping split name to ``(X_train, X_test, y_train, y_test)``:
        ``"random_all"``, ``"train_high_test_low"``, ``"train_low_test_high"``,
        ``"train_test_high"`` and ``"train_test_low"``. The random splits hold
        out 20% of the rows with ``random_state=42``.
    """
    if variables is None or len(variables) == 0:
        # NOTE(review): if 'development_lvl' sits at position >= 2 it ends up
        # among the default predictors -- confirm that is intended.
        columns = [col for col in fb_df.columns[2:] if '2019' not in col]
    else:
        columns = variables
    predictors = fb_df[columns].values
    gt = fb_df['migrant_2019'].values

    # Build each row mask once, as a plain boolean ndarray.
    is_high = (fb_df['development_lvl'] == 1).values
    is_low = (fb_df['development_lvl'] == 0).values

    high_predictors, high_gt = predictors[is_high], gt[is_high]
    low_predictors, low_gt = predictors[is_low], gt[is_low]

    splits = dict()
    # 1. Randomly sample from all countries for training and test sets.
    splits["random_all"] = train_test_split(predictors, gt, test_size=0.2, random_state=42)
    # 2. Train more developed, test less developed
    splits["train_high_test_low"] = (high_predictors, low_predictors, high_gt, low_gt)
    # 3. Train less developed, test more developed
    splits["train_low_test_high"] = (low_predictors, high_predictors, low_gt, high_gt)
    # 4. Randomly sample high for train+test
    splits["train_test_high"] = train_test_split(high_predictors, high_gt, test_size=0.2, random_state=42)
    # 5. Randomly sample low for train+test
    splits["train_test_low"] = train_test_split(low_predictors, low_gt, test_size=0.2, random_state=42)

    return splits

## Data Setup

In [5]:
# Load the cleaned UN table.
un_df = pd.read_csv('../data/UN_data_clean.csv')
# Ground truth for all of the models: 'migrant_pop' on the rows where
# age_group is 'Total', sex is 'both sexes' and year is 2019.
y = np.array(
    un_df.loc[
        (un_df['age_group'] == 'Total')
        & (un_df['sex'] == 'both sexes')
        & (un_df['year'] == 2019),
        'migrant_pop',
    ]
)

In [6]:
# Load the combined Facebook/UN table.
fb_df = pd.read_csv('../data/facebook_un_combined.csv')
# Feature matrix: positional columns 2 through 175 of the raw value array.
predictors = fb_df.values[:, slice(2, 176)]
# Target vector: the 'migrant_2019' column.
gt = fb_df.loc[:, 'migrant_2019'].values

In [7]:
# Hyperparameters shared by the models in run_all_models:
#   depth  -> max_depth of the random forest
#   random -> random_state of the random forest, AdaBoost and XGBoost models
depth, random = 5, 0

In [8]:
# Results file for the MAPE scores; the handle stays open across the
# experiment cells and is closed by the notebook's final cell.
# newline='' is what the csv module requires of files it writes to: the
# writer emits its own '\r\n' terminators, and without it platforms that
# translate '\n' produce '\r\r\n', which shows up as blank rows.
f = open('model_mapes.csv', 'w', newline='')
writer = csv.writer(f, delimiter=',')
writer.writerow(["model", "split", "rf_mape", "adaboost_mape", "xgboost_mape", "linreg_mape"])

60

### Simple Autoregressive Model

In [9]:
# Predictors: only the two 'un_expat_total_age16_*' columns (2017 and 2015).
splits = gen_splits(
    fb_df,
    ['un_expat_total_age16_2017', 'un_expat_total_age16_2015'],
)

# Score every split and append one CSV row per split.
for split_name, split_data in splits.items():
    print("{}: ".format(split_name))
    scores = run_all_models(*split_data)
    writer.writerow(["autoregressive_baseline", split_name] + list(scores))
    print("\n")

random_all: 
Random Forests MAPE: 71.33136130546714
AdaBoost MAPE: 1019.2923295687892
XGBoost MAPE: 97.84671343298231
Linear Regression MAPE: 151.72909609399267


train_high_test_low: 
Random Forests MAPE: 187.43006888075092
AdaBoost MAPE: 1686.8104041058689
XGBoost MAPE: 86.7446402455189
Linear Regression MAPE: 156.3394614308411


train_low_test_high: 
Random Forests MAPE: 21.11290759065284
AdaBoost MAPE: 60.22851360465323
XGBoost MAPE: 33.140751780837704
Linear Regression MAPE: 10.594680463374175


train_test_high: 
Random Forests MAPE: 28.043011707889832
AdaBoost MAPE: 187.8523339156974
XGBoost MAPE: 17.84754278247097
Linear Regression MAPE: 45.730820988835234


train_test_low: 
Random Forests MAPE: 34.36779482716031
AdaBoost MAPE: 206.8315612121729
XGBoost MAPE: 36.9743836423685
Linear Regression MAPE: 38.37765286358955




### Facebook Naive

In [10]:
# Predictors: the single 'total_expat' column.
splits = gen_splits(fb_df, ['total_expat'])

# Score every split and append one CSV row per split.
for split_name, split_data in splits.items():
    print("{}: ".format(split_name))
    scores = run_all_models(*split_data)
    writer.writerow(["fb_naive", split_name] + list(scores))
    print("\n")

random_all: 
Random Forests MAPE: 348.70714684882535
AdaBoost MAPE: 888.4393378179258
XGBoost MAPE: 426.13142616259444
Linear Regression MAPE: 269.78360941092535


train_high_test_low: 
Random Forests MAPE: 611.1168531634244
AdaBoost MAPE: 1400.859476109096
XGBoost MAPE: 684.7433680143762
Linear Regression MAPE: 1761.7375273485504


train_low_test_high: 
Random Forests MAPE: 56.59768649117039
AdaBoost MAPE: 61.56077996833701
XGBoost MAPE: 83.58140511473168
Linear Regression MAPE: 54.76032061494285


train_test_high: 
Random Forests MAPE: 79.7654064330622
AdaBoost MAPE: 227.73975787227755
XGBoost MAPE: 91.80655730869547
Linear Regression MAPE: 390.1946489712339


train_test_low: 
Random Forests MAPE: 177.57912920121865
AdaBoost MAPE: 846.678573883217
XGBoost MAPE: 222.40821344474656
Linear Regression MAPE: 213.36878690131687




### Autoregressive + Facebook Expats

In [11]:
# Predictors: the two 'un_expat_total_age16_*' columns plus 'total_expat'.
splits = gen_splits(
    fb_df,
    [
        'un_expat_total_age16_2017',
        'un_expat_total_age16_2015',
        'total_expat',
    ],
)

# Score every split and append one CSV row per split.
for split_name, split_data in splits.items():
    print("{}: ".format(split_name))
    scores = run_all_models(*split_data)
    writer.writerow(["autoregressive_with_expats", split_name] + list(scores))
    print("\n")

random_all: 
Random Forests MAPE: 71.61149099901387
AdaBoost MAPE: 1083.5296833321377
XGBoost MAPE: 36.29133918555385
Linear Regression MAPE: 150.97892702706264


train_high_test_low: 
Random Forests MAPE: 230.39012332790995
AdaBoost MAPE: 1522.7518734105695
XGBoost MAPE: 78.14617697321175
Linear Regression MAPE: 162.4662293161901


train_low_test_high: 
Random Forests MAPE: 17.961354221235315
AdaBoost MAPE: 54.11590006099739
XGBoost MAPE: 16.083118778397367
Linear Regression MAPE: 7.746618219509357


train_test_high: 
Random Forests MAPE: 25.43740550267321
AdaBoost MAPE: 182.69764155170546
XGBoost MAPE: 17.831190010067804
Linear Regression MAPE: 46.55748528694383


train_test_low: 
Random Forests MAPE: 33.19926722075143
AdaBoost MAPE: 668.4759156947622
XGBoost MAPE: 28.273864858508883
Linear Regression MAPE: 17.876274819838788




### Facebook Age-Sex Corrected

In [12]:
# Predictors: every column whose name contains 'normalized'.
norm_columns = list(filter(lambda name: 'normalized' in name, fb_df.columns))
splits = gen_splits(fb_df, norm_columns)

# Score every split and append one CSV row per split.
for split_name, split_data in splits.items():
    print("{}: ".format(split_name))
    scores = run_all_models(*split_data)
    writer.writerow(["fb_age_sex_normalized", split_name] + list(scores))
    print("\n")

random_all: 
Random Forests MAPE: 279.4067444326248
AdaBoost MAPE: 2725.0959389100417
XGBoost MAPE: 213.89662587409538
Linear Regression MAPE: 647.8844754498936


train_high_test_low: 
Random Forests MAPE: 1394.3261104731714
AdaBoost MAPE: 3225.344155691324
XGBoost MAPE: 1927.2098141550537
Linear Regression MAPE: 3055.617850469793


train_low_test_high: 
Random Forests MAPE: 50.53611554446471
AdaBoost MAPE: 72.06935687630964
XGBoost MAPE: 80.64635879789276
Linear Regression MAPE: 66.68132531577696


train_test_high: 
Random Forests MAPE: 99.88840844505553
AdaBoost MAPE: 171.32892667450818
XGBoost MAPE: 85.27725322556688
Linear Regression MAPE: 333.32447587581134


train_test_low: 
Random Forests MAPE: 123.48852050946952
AdaBoost MAPE: 1460.91232394715
XGBoost MAPE: 107.12159115344586
Linear Regression MAPE: 270.04120219133284




### Facebook Age-Sex Corrected with Autoregression

In [13]:
# Predictors: every column whose name contains 'normalized', followed by the
# two 'un_expat_total_age16_*' columns.
norm_columns = (
    [name for name in fb_df.columns if 'normalized' in name]
    + ['un_expat_total_age16_2017', 'un_expat_total_age16_2015']
)
splits = gen_splits(fb_df, norm_columns)

# Score every split and append one CSV row per split.
for split_name, split_data in splits.items():
    print("{}: ".format(split_name))
    scores = run_all_models(*split_data)
    writer.writerow(["autoregressive_with_fb_normalized", split_name] + list(scores))
    print("\n")

random_all: 
Random Forests MAPE: 68.13104888760246
AdaBoost MAPE: 2247.1181197776036
XGBoost MAPE: 51.50601461700446
Linear Regression MAPE: 65.43865585829667


train_high_test_low: 
Random Forests MAPE: 819.1449868146422
AdaBoost MAPE: 1695.5069760161045
XGBoost MAPE: 1498.604692921907
Linear Regression MAPE: 1185.743163161429


train_low_test_high: 
Random Forests MAPE: 13.404226182801047
AdaBoost MAPE: 85.97478079379127
XGBoost MAPE: 13.61356440746721
Linear Regression MAPE: 14.657451279713966


train_test_high: 
Random Forests MAPE: 89.87920884844242
AdaBoost MAPE: 214.07243603627123
XGBoost MAPE: 46.23402751531899
Linear Regression MAPE: 36.824238565514875


train_test_low: 
Random Forests MAPE: 30.507313070531143
AdaBoost MAPE: 465.4453716677719
XGBoost MAPE: 16.34215071170913
Linear Regression MAPE: 84.06910930627814




### All Predictors

In [15]:
# No explicit column list: gen_splits falls back to its default predictor set.
splits = gen_splits(fb_df)

# Score every split and append one CSV row per split.
for split_name, split_data in splits.items():
    print("{}: ".format(split_name))
    scores = run_all_models(*split_data)
    writer.writerow(["all_preds", split_name] + list(scores))
    print("\n")

random_all: 
Random Forests MAPE: 59.27264836899918
AdaBoost MAPE: 1370.5303234227745
XGBoost MAPE: 38.62705864777089
Linear Regression MAPE: 246.51191350870576


train_high_test_low: 
Random Forests MAPE: 493.38596882364476
AdaBoost MAPE: 1239.3889307424538
XGBoost MAPE: 718.5720849625111
Linear Regression MAPE: 161.1338204372646


train_low_test_high: 
Random Forests MAPE: 21.56126208250336
AdaBoost MAPE: 76.77237524060945
XGBoost MAPE: 16.397593570005796
Linear Regression MAPE: 68.19708566872316


train_test_high: 
Random Forests MAPE: 49.16442097878411
AdaBoost MAPE: 345.15669391691796
XGBoost MAPE: 37.55335097663485
Linear Regression MAPE: 24.523245879752707


train_test_low: 
Random Forests MAPE: 32.59357649337822
AdaBoost MAPE: 842.3973855054021
XGBoost MAPE: 25.060368057728276
Linear Regression MAPE: 103.61917898653094




### Final Clean-up

In [16]:
# Close the 'model_mapes.csv' handle opened for the csv writer so that any
# rows still buffered are flushed to disk.
f.close()