In [1]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import csv
import pandas as pd
import numpy as np

In [2]:
def MAPE(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must broadcast against ``y_true``.

    Returns
    -------
    numpy floating scalar
        The error expressed as a percentage (10.0 means 10 %).

    Notes
    -----
    Both inputs are converted with ``np.asarray`` so that plain Python
    lists work and values are compared by position.
    Entries where ``y_true`` is zero are not guarded against: they divide by
    zero and yield ``inf``/``nan`` in the result.
    """
    # Coerce up front: list - list is a TypeError, and array-likes should be
    # treated uniformly as positional arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def run_all(X_train, X_test, y_train, y_test):
    """Fit four regressors on one train/test split and report each one's MAPE.

    The models are handled in a fixed order -- random forest, AdaBoost,
    XGBoost, linear regression. Each is fitted on the training split, scored
    on the test split with ``MAPE``, and a line with its score is printed.

    The module-level variables ``depth`` and ``random`` are read at call time:
    ``depth`` is the random forest's ``max_depth`` and ``random`` is the
    ``random_state`` of the forest, AdaBoost and XGBoost models.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Target values for the training and test splits.

    Returns
    -------
    tuple
        ``(rf_mape, ada_mape, xg_mape, lin_mape)`` -- the test-set MAPE of
        each model, in the order listed above.
    """
    def evaluate(estimator, template):
        # Fit on the training split, score on the held-out split, then print
        # the score through the caller-supplied message template.
        estimator.fit(X_train, y_train)
        score = MAPE(y_test, estimator.predict(X_test))
        print(template.format(score))
        return score

    # NOTE(review): criterion="mae" was renamed "absolute_error" in
    # scikit-learn 1.0 and removed in 1.2 -- confirm the pinned version.
    rf_mape = evaluate(
        RandomForestRegressor(max_depth=depth, random_state=random, criterion="mae"),
        'Random Forests MAPE: {0}',
    )
    ada_mape = evaluate(AdaBoostRegressor(random_state=random), 'AdaBoost MAPE: {0}')
    xg_mape = evaluate(XGBRegressor(random_state=random), 'XGBoost MAPE: {0}')
    lin_mape = evaluate(LinearRegression(), 'Linear Regression MAPE: {0}')

    return (rf_mape, ada_mape, xg_mape, lin_mape)

## Data Setup

In [4]:
# Load the cleaned UN table.
un_df = pd.read_csv('../data/UN_data_clean.csv')
# Ground truth: migrant_pop for the rows with age_group 'Total',
# sex 'both sexes' and year 2019.
# NOTE(review): later cells rebind `y` to `gt` before any model uses it.
y = np.array(
    un_df.loc[
        (un_df['age_group'] == 'Total')
        & (un_df['sex'] == 'both sexes')
        & (un_df['year'] == 2019),
        'migrant_pop',
    ]
)

In [5]:
# Load the combined Facebook + UN table.
fb_df = pd.read_csv('../data/facebook_un_combined.csv')
# Target vector: the migrant_2019 column.
gt = fb_df['migrant_2019'].values
# Feature matrix: columns at positions 2 through 175 of the full value array.
# NOTE(review): the 2:176 bounds are positional magic numbers -- confirm they
# still match the CSV's column layout.
predictors = fb_df.values[:, 2:176]

In [6]:
# Settings read as globals by run_all and the per-model cells:
#   depth  -> max_depth of the random forest
#   random -> random_state of the forest, AdaBoost and XGBoost models
depth, random = 5, 0

### Simple Autoregressive Model

In [7]:
# Features: the two earlier UN columns; target: gt. Hold out 20 % for testing.
X = fb_df.loc[:, [
    'un_total_age16_2017',
    'un_total_age16_2015',
]].values
y = gt
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit and score all four regressors on the split built from the two UN
# columns; the trailing ';' keeps the returned tuple out of the cell output.
run_all(*splits);

Random Forests MAPE: 71.33136130546714
AdaBoost MAPE: 1019.2923295687892
XGBoost MAPE: 97.84671343298231
Linear Regression MAPE: 151.72909609399267


### Facebook Naive

In [9]:
# Single feature: the total_expat column (kept 2-D); target: gt.
X = fb_df.loc[:, ['total_expat']].values
y = gt
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit and score all four regressors on the split built from total_expat only;
# the trailing ';' keeps the returned tuple out of the cell output.
run_all(*splits);

Random Forests MAPE: 348.70714684882535
AdaBoost MAPE: 888.4393378179258
XGBoost MAPE: 426.13142616259444
Linear Regression MAPE: 269.78360941092535


### Autoregressive + Facebook Expats

In [11]:
# Features: the two earlier UN columns plus total_expat; target: gt.
X = fb_df.loc[:, [
    'un_total_age16_2017',
    'un_total_age16_2015',
    'total_expat',
]].values
y = gt
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Fit and score all four regressors on the split built from the two UN
# columns plus total_expat; the trailing ';' keeps the returned tuple out of
# the cell output.
run_all(*splits);

Random Forests MAPE: 71.61149099901387
AdaBoost MAPE: 1083.5296833321377
XGBoost MAPE: 36.29133918555385
Linear Regression MAPE: 150.97892702706264


### All Predictors

In [13]:
# Use the full predictor matrix against gt, holding out 20 % for testing.
X, y = predictors, gt
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Random forest on the current X_train/X_test split; prints the test-set MAPE.
rf_regr = RandomForestRegressor(
    criterion="mae",
    max_depth=depth,
    random_state=random,
)
# fit() returns the estimator itself, so predict can be chained onto it.
y_pred = rf_regr.fit(X_train, y_train).predict(X_test)
print('Mean Absolute Percentage Error: {0}'.format(
    MAPE(y_test, y_pred),
))

Mean Absolute Percentage Error: 59.78972763864687


In [15]:
# AdaBoost on the current X_train/X_test split; prints the test-set MAPE.
ada_regr = AdaBoostRegressor(
    random_state=random,
)
# fit() returns the estimator itself, so predict can be chained onto it.
y_pred = ada_regr.fit(X_train, y_train).predict(X_test)
print('Mean Absolute Percentage Error: {0}'.format(
    MAPE(y_test, y_pred),
))

Mean Absolute Percentage Error: 1121.7662042983834


In [16]:
# XGBoost on the current X_train/X_test split; prints the test-set MAPE.
xg_regr = XGBRegressor(
    random_state=random,
)
# fit() returns the estimator itself, so predict can be chained onto it.
y_pred = xg_regr.fit(X_train, y_train).predict(X_test)
print('Mean Absolute Percentage Error: {0}'.format(
    MAPE(y_test, y_pred),
))

Mean Absolute Percentage Error: 36.157544020378815


In [17]:
# Ordinary least squares on the current X_train/X_test split; prints the
# test-set MAPE.
lin_reg = LinearRegression()
# fit() returns the estimator itself, so predict can be chained onto it.
y_pred = lin_reg.fit(X_train, y_train).predict(X_test)
print('Mean Absolute Percentage Error: {0}'.format(
    MAPE(y_test, y_pred),
))

Mean Absolute Percentage Error: 215.80557870632853
