In [1]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
def MAPE(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [3]:
def run_all_models(X_train, X_test, y_train, y_test, preds = False):
        
    # random forest
    rf_regr = RandomForestRegressor(max_depth=10, random_state=random, criterion="mae")
    rf_regr.fit(X_train, y_train)
    rf_y_pred = rf_regr.predict(X_test)
    rf_mape = MAPE(y_test, rf_y_pred)
    #print('Random Forests MAPE: {0}'.format(rf_mape))

    '''
    # adaboost 
    ada_regr = AdaBoostRegressor(random_state=random)
    ada_regr.fit(X_train, y_train)
    ada_y_pred = ada_regr.predict(X_test)
    ada_mape = MAPE(y_test, ada_y_pred)
    #print('AdaBoost MAPE: {0}'.format(MAPE(y_test, y_pred)))
    '''
    
    # xgboost
    xg_regr = XGBRegressor(max_depth=10, learning_rate=0.01, n_estimators=300, gamma=1, random_state=random)
    xg_regr.fit(X_train, y_train)
    xg_y_pred = xg_regr.predict(X_test)
    xg_mape = MAPE(y_test, xg_y_pred)
    #print('XGBoost MAPE: {0}'.format(MAPE(y_test, y_pred)))
    
    if X_train.shape[1] > 20:
        X_train = X_train.values
        X_train[:,22:] = np.log(X_train[:,22:])
        X_test = X_test.values
        X_test[:,22:] = np.log(X_test[:, 22:])
    else:
        X_train = np.log(X_train)
        X_test = np.log(X_test)
    y_train = np.log(y_train)
        
    # linear regression
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    lin_y_pred = np.exp(lin_reg.predict(X_test))
    lin_mape = MAPE(y_test, lin_y_pred)
    

    #print('Linear Regression MAPE: {0}'.format(MAPE(y_test, y_pred)))
    
    if preds: return np.array([rf_mape, xg_mape, lin_mape]), [rf_y_pred, xg_y_pred, lin_y_pred]
    
    return np.array([rf_mape, xg_mape, lin_mape])

In [4]:
_iso_splits = None
def gen_splits(fb_df, variables = [], data_used = False):
    global _iso_splits
    if data_used:
        variables.extend(['data_used_B', 'data_used_C', 'data_used_I', 'data_used_R', 'data_used_BR', 'data_used_CR', 'data_used_IR', 'data_used_CB', 'data_used_CBR'])
        
    total_columns = [col for col in fb_df.columns[2:] if '2019' not in col and '2020' not in col and ('un' in col or 'lvl' in col)]
    predictors = fb_df[variables].values if variables != []\
        else fb_df[total_columns].values
    gt = fb_df['migrant_pop_2019'].values
    
    high_predictors = predictors[fb_df['development_lvl'] == 1]
    high_gt = gt[fb_df['development_lvl'] == 1]
    low_predictors = predictors[fb_df['development_lvl'] == 0]
    low_gt = gt[fb_df['development_lvl'] == 0]
    
    
    if _iso_splits is None:
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        _iso_splits = dict()
        _iso_splits["all"] = list(kf.split(predictors, gt))
        _iso_splits["high"] = list(kf.split(high_predictors, high_gt))
        _iso_splits["low"] = list(kf.split(low_predictors, low_gt))

    splits = dict()
    # 1. Randomly sample from all countries for training and test sets.
    #splits["random_all"] = train_test_split(predictors, gt, test_size=0.2, random_state=42)
    splits["random_all"] = [(predictors[i], predictors[j], gt[i], gt[j]) for i, j in _iso_splits["all"]]
    # 2. Train more developed, test less developed
    # splits["train_high_test_low"] = [(high_predictors, low_predictors, high_gt, low_gt)]
    # 3. Train less developed, test more developed
    # splits["train_low_test_high"] = [(low_predictors, high_predictors, low_gt, high_gt)]
    # 4. Randomly sample high for train+test
    #splits["train_test_high"] = train_test_split(high_predictors, high_gt, test_size=0.2, random_state=42)
    splits["train_test_high"] = [(high_predictors[i], high_predictors[j], high_gt[i], high_gt[j]) for i, j in _iso_splits["high"]]
    # 5. Randomly sample low for train+test
    #splits["train_test_low"] = train_test_split(low_predictors, low_gt, test_size=0.2, random_state=42)
    splits["train_test_low"] = [(low_predictors[i], low_predictors[j], low_gt[i], low_gt[j]) for i, j in _iso_splits["low"]]

    return splits

## Data Setup

In [5]:
# read in UN data
un_df = pd.read_csv('../data/UN_data_clean.csv')
# ground truth data for all of the models
y = np.array((un_df[(un_df['age_group'] == 'Total') & (un_df['sex'] == 'both sexes') & (un_df['year'] == 2019)]\
          ['migrant_pop']))

In [6]:
# read in combined fb_un_data
#fb_df = pd.read_csv('../data/facebook_un_combined_2020.csv')
#fb_df_2020 = pd.read_csv('../data/facebook_un_combined_2020.csv')
#predictors = fb_df.values [:, 2:176]
#gt = fb_df['migrant_2019'].values

fb_df = pd.read_csv('../data/FB_UN_totals.csv')
gt = fb_df['migrant_pop_2019'].values

In [7]:
fb_un_age_sex = pd.read_csv('../data/FB_UN_age_sex.csv')
fb_un_age_sex = fb_un_age_sex.drop('country_name', axis=1)
fb_un_age_sex['oecd_member'] = fb_un_age_sex['oecd_member'].astype(int)

fb_un_age_sex['data_used_B'] = ['B' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_C'] = ['C' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_I'] = ['I' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_R'] = ['R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_BR'] = ['B' in s and 'R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_CR'] = ['C' in s and 'R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_IR'] = ['I' in s and 'R' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_CB'] = ['C' in s and 'B' in s for s in fb_un_age_sex['data_used']]
fb_un_age_sex['data_used_CBR'] = ['C' in s and 'B' in s and 'R' in s for s in fb_un_age_sex['data_used']]

fb_un_age_sex = fb_un_age_sex.drop('data_used', axis=1)

fb_un_age_sex['un_development_lvl'] = [0 if 'Less' in s else 1 for s in fb_un_age_sex['un_development_lvl']]

fb_un_age_sex.loc[fb_un_age_sex['fb_penetration'] > 1, 'fb_penetration'] = 1

fb_un_age_sex = pd.get_dummies(fb_un_age_sex)

fb_un_age_sex['fb_expats_normalized'] = fb_un_age_sex['fb_expats'] / fb_un_age_sex['fb_penetration']

In [8]:
# hyperparameters
depth = 5
random = 0

In [9]:
f = open('model_mapes.csv','w')
writer = csv.writer(f, delimiter=',')
writer.writerow(["model", "split", "rf_mape", "xgboost_mape", "linreg_mape"])

46

### Simple Autoregressive Model

In [10]:
splits = gen_splits(fb_df, ['migrant_pop_2015', 'migrant_pop_2017'])

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_baseline", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[19.90251978 15.6235684  11.29510699]


train_test_high: 
[17.07294836 23.28426186  7.3213885 ]


train_test_low: 
[25.38962105 26.96627794 11.85701237]




### Facebook Naive

In [11]:
splits = gen_splits(fb_df, ['fb_expats'])

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["fb_naive", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[247.76828159 241.88411169 149.69018252]


train_test_high: 
[63.55194138 62.71546674 50.11232441]


train_test_low: 
[326.42191893 310.6532503  152.4569364 ]




### Autoregressive + Facebook Expats

In [12]:
splits = gen_splits(fb_df, ['migrant_pop_2015', 'migrant_pop_2017', 'fb_expats'])

for (k, folds) in splits.items():
    print(k + ": ")
    mapes = mapes = np.array([0.]*3)
    for fold in folds: 
        mapes+=run_all_models(*fold)
    mapes/=len(folds)
    writer.writerow(["autoregressive_plus_fb", k, *mapes])
    print(mapes)
    print("\n")

random_all: 
[19.85613053 17.42490357 13.16156887]


train_test_high: 
[17.14987023 24.50012018  7.1113205 ]


train_test_low: 
[23.11461399 23.25380717 14.96718015]




### Facebook age-sex corrected

### Facebook age-sex corrected (2020)

### Facebook age-sex corrected with autoregression

### Facebook age-sex corrected with autoregression (2020)

### All Predictors (2019)

### All Predictors (2020)

### Age-Sex

In [13]:
country_codes = list(set(fb_un_age_sex['country_code']))
high_dev_isos = list(set(fb_un_age_sex.loc[fb_un_age_sex['un_development_lvl'] == 1, 'country_code']))
low_dev_isos = list(set(fb_un_age_sex.loc[fb_un_age_sex['un_development_lvl'] == 0, 'country_code']))

In [14]:
randall_splits = [([country_codes[i] for i in train_split], [country_codes[i] for i in test_split]) for train_split, test_split in _iso_splits["all"]]
high_dev_splits = [([high_dev_isos[i] for i in train_split], [high_dev_isos[i] for i in test_split]) for train_split, test_split in _iso_splits["high"]]
low_dev_splits = [([low_dev_isos[i] for i in train_split], [low_dev_isos[i] for i in test_split]) for train_split, test_split in _iso_splits["low"]]

iso_splits = {
            "random_all": randall_splits,
            "train_test_high": high_dev_splits,
            "train_test_low": low_dev_splits
         }

In [15]:
columns = ['age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9', 'migrant_pop_2015', 'migrant_pop_2017']

In [16]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["autoregressive_subgroups", k, *subgroup_mapes])
    writer.writerow(["autoregressive_subgroups_added", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [23.84646531 27.73974519 32.1461829 ]
Total:  [14.16542601 17.22727041 16.60578027]


train_test_high : 
Subgroups:  [18.53955913 18.48041349 27.8865388 ]
Total:  [12.1872286  11.15636017 14.44343896]


train_test_low : 
Subgroups:  [26.87464501 30.7349897  33.86436493]
Total:  [17.15208061 20.36235567 17.70338255]




In [17]:
columns = ['age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9', 'migrant_pop_2015', 'migrant_pop_2017', 'fb_expats_normalized']

In [18]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["autoregressive_with_fb_normalized_subgroups", k, *subgroup_mapes])
    writer.writerow(["autoregressive_with_fb_normalized_subgroups_added", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [24.87980197 26.99255378 19.37732491]
Total:  [15.44634558 17.14186881 11.33363937]


train_test_high : 
Subgroups:  [18.86047124 19.21923497 14.06266874]
Total:  [12.69710468 12.206536    8.56314977]


train_test_low : 
Subgroups:  [29.29580149 29.25912082 21.55026744]
Total:  [18.96779849 18.99965591 13.41814243]




In [19]:
columns = ['age_sex_group_female_age0',
       'age_sex_group_female_age1', 'age_sex_group_female_age10',
       'age_sex_group_female_age2', 'age_sex_group_female_age3',
       'age_sex_group_female_age4', 'age_sex_group_female_age5',
       'age_sex_group_female_age6', 'age_sex_group_female_age7',
       'age_sex_group_female_age8', 'age_sex_group_female_age9',
       'age_sex_group_male_age0', 'age_sex_group_male_age1',
       'age_sex_group_male_age10', 'age_sex_group_male_age2',
       'age_sex_group_male_age3', 'age_sex_group_male_age4',
       'age_sex_group_male_age5', 'age_sex_group_male_age6',
       'age_sex_group_male_age7', 'age_sex_group_male_age8',
       'age_sex_group_male_age9', 'fb_expats_normalized']

In [20]:
for k, folds in iso_splits.items():
    subgroup_mapes = mapes = np.array([0.]*3)
    total_mapes = mapes = np.array([0.]*3)
    print(k, ": ")
    for train, test in folds: 
        X_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), columns]
        X_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), columns]
        y_train = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(train), 'migrant_pop_2019']
        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'migrant_pop_2019']
        mapes, preds = run_all_models(X_train, X_test, y_train, y_test, preds=True)
        subgroup_mapes+=mapes

        y_test = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), ['migrant_pop_2019', 'country_code']]
        mapes_temp = []
        for pred in preds:
            pred = pd.DataFrame(pred, columns=['pop'])
            pred['country_code'] = fb_un_age_sex.loc[fb_un_age_sex['country_code'].isin(test), 'country_code'].values
            mapes_temp.append(MAPE(y_test.groupby('country_code').sum()['migrant_pop_2019'].values, pred.groupby('country_code').sum()['pop'].values))
        total_mapes+=mapes_temp

    subgroup_mapes = subgroup_mapes/len(folds)
    total_mapes = total_mapes/len(folds)
    writer.writerow(["fb_normalized_subgroups", k, *subgroup_mapes])
    writer.writerow(["fb_normalized_subgroups_added", k, *total_mapes])
    print("Subgroups: ", subgroup_mapes)
    print("Total: ", total_mapes)
    print("\n")

random_all : 
Subgroups:  [373.89460134 463.80449586 303.31355584]
Total:  [296.2914012  367.82325305 263.08414971]


train_test_high : 
Subgroups:  [114.29754197  99.5355317   99.05421468]
Total:  [92.88505536 76.53826418 87.24094204]


train_test_low : 
Subgroups:  [329.59642409 436.28404086 250.51275755]
Total:  [265.06120173 352.89628084 217.62965443]




In [21]:
f.close()