In [1]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import csv
import pandas as pd
import numpy as np

In [2]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Each value is the denominator of its own error
        term, so a zero here produces ``inf`` or ``nan`` in the result.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    # Coerce to float arrays so plain Python lists are accepted and pandas
    # objects are compared position-by-position instead of being aligned on
    # their index labels during the subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def run_all_models(X_train, X_test, y_train, y_test):
    """Fit four regressors on one train/test split and report their test MAPE.

    The models are, in order: random forest, AdaBoost, XGBoost and ordinary
    linear regression. Each is fitted on ``(X_train, y_train)``, used to
    predict ``X_test``, and scored with ``MAPE`` against ``y_test``. One line
    per model is printed as ``'<model name> MAPE: <score>'``.

    Reads the notebook-level globals ``depth`` (random forest ``max_depth``)
    and ``random`` (``random_state`` for the three tree-based models); both
    must be defined before this function is called.

    Parameters
    ----------
    X_train, X_test : array-like
        Predictor matrices for the training and test portions.
    y_train, y_test : array-like
        Target values for the training and test portions.

    Returns
    -------
    tuple
        ``(rf_mape, ada_mape, xg_mape, lin_mape)``.
    """
    # NOTE(review): newer scikit-learn releases name this criterion
    # "absolute_error"; confirm "mae" is accepted by the installed version.
    models = [
        ('Random Forests',
         RandomForestRegressor(max_depth=depth, random_state=random, criterion="mae")),
        ('AdaBoost', AdaBoostRegressor(random_state=random)),
        ('XGBoost', XGBRegressor(random_state=random)),
        ('Linear Regression', LinearRegression()),
    ]

    scores = []
    for label, model in models:
        model.fit(X_train, y_train)
        # Score once and reuse the value for both the printout and the return.
        score = MAPE(y_test, model.predict(X_test))
        print('{0} MAPE: {1}'.format(label, score))
        scores.append(score)

    return tuple(scores)

In [4]:
def gen_splits(fb_df, variables=None):
    """Build the five train/test configurations used by the experiments.

    Parameters
    ----------
    fb_df : pandas.DataFrame
        Table that must contain the columns ``'migrant_2019'`` (the target)
        and ``'development_lvl'`` (1 = more developed, 0 = less developed).
    variables : list of str, optional
        Names of the predictor columns. When omitted or empty, the columns at
        positions 2 through 175 of ``fb_df`` are used as predictors.

    Returns
    -------
    dict
        Maps a split name to ``(X_train, X_test, y_train, y_test)``:

        * ``"random_all"``          - random 80/20 split over all rows
        * ``"train_high_test_low"`` - train on level-1 rows, test on level-0 rows
        * ``"train_low_test_high"`` - train on level-0 rows, test on level-1 rows
        * ``"train_test_high"``     - random 80/20 split within level-1 rows
        * ``"train_test_low"``      - random 80/20 split within level-0 rows

        Rows whose ``development_lvl`` is neither 0 nor 1 appear only in
        ``"random_all"``.
    """
    # A None default avoids sharing one mutable list object between calls;
    # an explicitly passed empty list keeps selecting the default column slice.
    if variables is None or len(variables) == 0:
        predictors = fb_df.values[:, 2:176]
    else:
        predictors = fb_df[variables].values
    gt = fb_df['migrant_2019'].values

    # Compute each row mask once and reuse it for predictors and targets.
    dev_lvl = fb_df['development_lvl'].values
    is_high = dev_lvl == 1
    is_low = dev_lvl == 0

    high_predictors, high_gt = predictors[is_high], gt[is_high]
    low_predictors, low_gt = predictors[is_low], gt[is_low]

    splits = dict()
    # 1. Randomly sample from all countries for training and test sets.
    splits["random_all"] = train_test_split(predictors, gt, test_size=0.2, random_state=42)
    # 2. Train more developed, test less developed
    splits["train_high_test_low"] = (high_predictors, low_predictors, high_gt, low_gt)
    # 3. Train less developed, test more developed
    splits["train_low_test_high"] = (low_predictors, high_predictors, low_gt, high_gt)
    # 4. Randomly sample high for train+test
    splits["train_test_high"] = train_test_split(high_predictors, high_gt, test_size=0.2, random_state=42)
    # 5. Randomly sample low for train+test
    splits["train_test_low"] = train_test_split(low_predictors, low_gt, test_size=0.2, random_state=42)

    return splits

## Data Setup

In [5]:
# Load the cleaned UN table.
un_df = pd.read_csv('../data/UN_data_clean.csv')

# Ground truth for all of the models: 'migrant_pop' of the rows whose
# age_group is 'Total', sex is 'both sexes' and year is 2019.
is_total_2019 = (
    (un_df['age_group'] == 'Total')
    & (un_df['sex'] == 'both sexes')
    & (un_df['year'] == 2019)
)
y = np.array(un_df[is_total_2019]['migrant_pop'])

In [6]:
# Load the combined Facebook + UN table.
fb_df = pd.read_csv('../data/facebook_un_combined.csv')

# Target vector: the 'migrant_2019' column as a NumPy array.
gt = fb_df['migrant_2019'].values
# Predictor matrix: the columns at positions 2 through 175 of the table.
predictors = fb_df.values[:, 2:176]

In [7]:
# Hyperparameters read as globals by run_all_models:
#   depth  -> max_depth of the random forest
#   random -> random_state of the random forest, AdaBoost and XGBoost models
depth, random = 5, 0

### Simple Autoregressive Model

In [8]:
# Predictors: the two UN columns 'un_total_age16_2017' and 'un_total_age16_2015'.
splits = gen_splits(fb_df, ['un_total_age16_2017', 'un_total_age16_2015'])

# Evaluate every model on every split, printing the split name as a header.
for split_name, split_data in splits.items():
    print(split_name + ": ")
    run_all_models(*split_data)
    print("\n")

random_all: 
Random Forests MAPE: 71.33136130546714
AdaBoost MAPE: 1019.2923295687892
XGBoost MAPE: 97.84671343298231
Linear Regression MAPE: 151.72909609399267


train_high_test_low: 
Random Forests MAPE: 187.43006888075092
AdaBoost MAPE: 1686.8104041058689
XGBoost MAPE: 86.7446402455189
Linear Regression MAPE: 156.3394614308411


train_low_test_high: 
Random Forests MAPE: 21.11290759065284
AdaBoost MAPE: 60.22851360465323
XGBoost MAPE: 33.140751780837704
Linear Regression MAPE: 10.594680463374175


train_test_high: 
Random Forests MAPE: 28.043011707889832
AdaBoost MAPE: 187.8523339156974
XGBoost MAPE: 17.84754278247097
Linear Regression MAPE: 45.730820988835234


train_test_low: 
Random Forests MAPE: 34.36779482716031
AdaBoost MAPE: 206.8315612121729
XGBoost MAPE: 36.9743836423685
Linear Regression MAPE: 38.37765286358955




### Facebook Naive

In [9]:
# Predictor: the single column 'total_expat'.
splits = gen_splits(fb_df, ['total_expat'])

# Evaluate every model on every split, printing the split name as a header.
for split_name, split_data in splits.items():
    print(split_name + ": ")
    run_all_models(*split_data)
    print("\n")

random_all: 
Random Forests MAPE: 348.70714684882535
AdaBoost MAPE: 888.4393378179258
XGBoost MAPE: 426.13142616259444
Linear Regression MAPE: 269.78360941092535


train_high_test_low: 
Random Forests MAPE: 611.1168531634244
AdaBoost MAPE: 1400.859476109096
XGBoost MAPE: 684.7433680143762
Linear Regression MAPE: 1761.7375273485504


train_low_test_high: 
Random Forests MAPE: 56.59768649117039
AdaBoost MAPE: 61.56077996833701
XGBoost MAPE: 83.58140511473168
Linear Regression MAPE: 54.76032061494285


train_test_high: 
Random Forests MAPE: 79.7654064330622
AdaBoost MAPE: 227.73975787227755
XGBoost MAPE: 91.80655730869547
Linear Regression MAPE: 390.1946489712339


train_test_low: 
Random Forests MAPE: 177.57912920121865
AdaBoost MAPE: 846.678573883217
XGBoost MAPE: 222.40821344474656
Linear Regression MAPE: 213.36878690131687




### Autoregressive + Facebook Expats

In [10]:
# Predictors: the two UN columns plus the 'total_expat' column.
splits = gen_splits(fb_df, ['un_total_age16_2017', 'un_total_age16_2015', 'total_expat'])

# Evaluate every model on every split, printing the split name as a header.
for split_name, split_data in splits.items():
    print(split_name + ": ")
    run_all_models(*split_data)
    print("\n")

random_all: 
Random Forests MAPE: 71.61149099901387
AdaBoost MAPE: 1083.5296833321377
XGBoost MAPE: 36.29133918555385
Linear Regression MAPE: 150.97892702706264


train_high_test_low: 
Random Forests MAPE: 230.39012332790995
AdaBoost MAPE: 1522.7518734105695
XGBoost MAPE: 78.14617697321175
Linear Regression MAPE: 162.4662293161901


train_low_test_high: 
Random Forests MAPE: 17.961354221235315
AdaBoost MAPE: 54.11590006099739
XGBoost MAPE: 16.083118778397367
Linear Regression MAPE: 7.746618219509357


train_test_high: 
Random Forests MAPE: 25.43740550267321
AdaBoost MAPE: 182.69764155170546
XGBoost MAPE: 17.831190010067804
Linear Regression MAPE: 46.55748528694383


train_test_low: 
Random Forests MAPE: 33.19926722075143
AdaBoost MAPE: 668.4759156947622
XGBoost MAPE: 28.273864858508883
Linear Regression MAPE: 17.876274819838788




### All Predictors

In [11]:
# No column list given, so gen_splits falls back to its default predictor slice.
splits = gen_splits(fb_df)

# Evaluate every model on every split, printing the split name as a header.
for split_name, split_data in splits.items():
    print(split_name + ": ")
    run_all_models(*split_data)
    print("\n")

random_all: 
Random Forests MAPE: 61.589010233630226
AdaBoost MAPE: 965.4561789541841
XGBoost MAPE: 33.92179632750849
Linear Regression MAPE: 215.80557871035748


train_high_test_low: 
Random Forests MAPE: 282.5294440392145
AdaBoost MAPE: 1797.6870697866514
XGBoost MAPE: 704.0039708009618
Linear Regression MAPE: 170.53307487428418


train_low_test_high: 
Random Forests MAPE: 26.89340114021601
AdaBoost MAPE: 68.45963773775752
XGBoost MAPE: 17.56113062595195
Linear Regression MAPE: 88.52737451226204


train_test_high: 
Random Forests MAPE: 39.753435909849365
AdaBoost MAPE: 261.4141716815693
XGBoost MAPE: 41.91173028385944
Linear Regression MAPE: 21.578171906715994


train_test_low: 
Random Forests MAPE: 34.75389751627678
AdaBoost MAPE: 556.2294707500852
XGBoost MAPE: 25.411146011180303
Linear Regression MAPE: 63.590217780048675


