# 3a. Machine learning with satellite features - all countries except India

In [None]:
import json
import sys
from IPython.display import clear_output
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.decomposition import PCA
from sklearn.linear_model import RidgeCV, LinearRegression, RidgeClassifierCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, roc_auc_score
import statsmodels.api as sm
import seaborn as sns
import matplotlib.ticker as mtick
from scipy.stats import sem, ttest_ind, spearmanr
import warnings
from multiprocessing import Pool
warnings.filterwarnings('ignore')
import ray
import psutil
import os

In [None]:
def clean_plot(ax):
    """Strip the top/right frame lines from `ax` and tighten the figure layout.

    Ticks are kept only on the bottom (x) and left (y) sides of the axes.
    `plt.tight_layout()` is called at the end, which acts on the current
    pyplot figure.
    """
    # Hide the two frame lines that carry no ticks
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Keep tick marks on the remaining (bottom/left) sides only
    x_axis = ax.get_xaxis()
    x_axis.tick_bottom()
    y_axis = ax.get_yaxis()
    y_axis.tick_left()

    plt.tight_layout()

In [None]:
def simulation(args):
    """Fit one model on a train/test split and return labels, predictions and noised labels.

    `args` is a 5-tuple `(i, train, test, TARGET, regression)`:
      * `i`          - simulation index; also used as the numpy random seed.
      * `train`      - DataFrame of training rows.
      * `test`       - DataFrame of test rows.
      * `TARGET`     - name of the label column in `train` / `test`.
      * `regression` - the string 'regression' selects RidgeCV; any other value
                       selects LogisticRegressionCV.

    The predictor columns are taken from the module-level `feature_cols` list.

    Returns a DataFrame stacking the train rows followed by the test rows, with
    the columns 'y', 'yhat', 'y_noised', 'sim' and 'split'. 'y_noised' is the
    label plus Gaussian noise whose standard deviation is the square root of
    the test-set mean squared error (the same value is used for both splits).
    """
    # Set this process's niceness to 19
    psutil.Process(os.getpid()).nice(19)

    # Unpack arguments and seed the random generator per simulation
    sim_id, train, test, TARGET, regression = args
    print('Simulation %i' % sim_id, end='\r')
    np.random.seed(sim_id)
    alphas = np.logspace(-3, 3, 10)
    is_regression = regression == 'regression'

    x_train = train[feature_cols]
    x_test = test[feature_cols]
    y_train = train[TARGET]
    y_test = test[TARGET]

    # Ridge regression for continuous targets, logistic regression otherwise
    model = RidgeCV(alphas=alphas) if is_regression else LogisticRegressionCV(Cs=6)
    model.fit(x_train, y_train)

    if is_regression:
        # Flag a regularisation grid that may be too narrow
        if model.alpha_ == alphas[0]:
            print('Warning: Selected alpha is at lower extreme')
        if model.alpha_ == alphas[-1]:
            print('Warning: Selected alpha is at upper extreme')
        yhat_train = model.predict(x_train)
        yhat_test = model.predict(x_test)
    else:
        # Column 0 of predict_proba is the probability of the first class in model.classes_
        yhat_train = model.predict_proba(x_train)[:, 0]
        yhat_test = model.predict_proba(x_test)[:, 0]

    # Noise scale comes from the test-set error and is shared by both splits
    mse = np.mean(((np.array(yhat_test) - np.array(y_test))**2))
    noise_sd = np.sqrt(mse)

    # Build the per-split output frames; train is processed first so the
    # random draws happen in the same order (train noise, then test noise)
    pieces = []
    for split_name, frame, y, yhat in (('train', train, y_train, yhat_train),
                                       ('test', test, y_test, yhat_test)):
        out = frame.copy()
        out['y'] = y
        out['yhat'] = yhat
        out['y_noised'] = np.array(y) + np.random.normal(0, noise_sd, len(y))
        out['sim'] = sim_id
        out['split'] = split_name
        pieces.append(out)

    return pd.concat(pieces)[['y', 'yhat', 'y_noised', 'sim', 'split']]

In [None]:
# Driver: for every country, run the regression (poverty/wealth target) and
# classification ('rural' target) simulations for each configured split key,
# and write the stacked simulation results to CSV.

# Number of random train/test splits (and therefore simulations) per configuration
N_SIMULATIONS = 100

# NOTE(review): 'philiippines' looks like a misspelling of 'philippines', but the string is
# used to build input/output paths below - confirm the on-disk names before changing it.
for country in ['us', 'mexico', 'colombia', 'honduras', 'indonesia', 'kenya', 'nigeria', 'peru', 'philiippines']:
    
    print('Running country ' + country + '...')
    print('=========================')


    # Per-country configuration: input files, merge key, split keys, target and weight columns
    if country == 'us':
        FEATURES_FNAME = '/data/mosaiks/replication/features/mosaiks_features_by_puma_us.csv'
        LABELS_FNAME = '/data/mosaiks/replication/surveys/us/groundtruth_by_puma_2019.csv'
        MERGE_KEY = 'StatePUMA'
        SPLIT_KEYS = {'no_spatial':'StatePUMA'}
        POVERTY = 'FINCP' 
        WEIGHT = 'PWGTP'
        OUTFILE_NAME = '/data/mosaiks/replication/simulations/us/'
    
    elif country == 'mexico':
        FEATURES_FNAME = '/data/mosaiks/replication/features/mosaiks_features_by_municipality_mexico.csv'
        LABELS_FNAME = '/data/mosaiks/replication/surveys/mexico/grouped.csv'
        MERGE_KEY = 'municipality'
        SPLIT_KEYS = {'no_spatial':'municipality'}
        POVERTY = 'asset_index' 
        WEIGHT = 'weight'
        OUTFILE_NAME = '/data/mosaiks/replication/simulations/mexico/'
    
    else:
        FEATURES_FNAME = '/data/mosaiks/replication/features/dhs/mosaiks_features_by_cluster_' + country + '.csv'
        LABELS_FNAME = '/data/mosaiks/replication/surveys/dhs/' + country + '_grouped.csv'
        MERGE_KEY = 'cluster'
        SPLIT_KEYS = {'spatial':'region', 'no_spatial':'cluster'}
        POVERTY = 'wealth' 
        WEIGHT = 'weight'
        OUTFILE_NAME = '/data/mosaiks/replication/simulations/dhs/' + country + '/'
        
    for method, TARGET in [('regression', POVERTY), ('classification', 'rural')]:
        for spatial in SPLIT_KEYS:
            print('Running ' + method + ' with ' + spatial + '...')
            print('--------------------')

            # Load and merge data (labels are re-read each time because they are
            # normalised in place below)
            features = pd.read_csv(FEATURES_FNAME)
            print('Regions with features: %i' % len(features))
            labels = pd.read_csv(LABELS_FNAME)
            
            # Normalize labels: log-transform the US regression target, then z-score
            if country == 'us' and method == 'regression':
                labels[TARGET] = np.log(labels[TARGET] + 1)
                
            if method == 'regression':
                label_mean, label_std = labels[TARGET].mean(), labels[TARGET].std()
                labels[TARGET] = (labels[TARGET] - label_mean)/label_std
            
            # The US files have no single key column; build it from State + integer PUMA
            if country == 'us':
                features['StatePUMA'] = features['State'] + features['PUMA'].apply(lambda x: str(int(x)))
                labels['StatePUMA'] = labels['State'] + labels['PUMA'].apply(lambda x: str(int(x)))
                
            print('Regions with labels: %i' % len(labels))
            df = labels.merge(features, on=MERGE_KEY, how='inner')
            print('Regions with features and labels: %i' % len(df))
            feature_cols = [c for c in df.columns if 'Feature' in c]
            
            # Preprocess MOSAIKS features: drop columns with all nulls or all same value, standardize.
            # The drop list is initialised once, before the loop, so that every
            # degenerate column is collected (not just the last one inspected).
            cols_to_drop = []
            for col in feature_cols:
                col_std = df[col].std()
                if df[col].isnull().all() or col_std == 0:
                    cols_to_drop.append(col)
                else:
                    df[col] = (df[col] - df[col].mean())/col_std
            df = df.drop(cols_to_drop, axis=1)

            # Keep feature_cols in sync with df: simulation() selects train[feature_cols],
            # which would raise a KeyError for any column that was just dropped.
            dropped = set(cols_to_drop)
            feature_cols = [c for c in feature_cols if c not in dropped]
            
            SPLIT_KEY = SPLIT_KEYS[spatial]
            trains, tests = [], []
        
            for i in range(N_SIMULATIONS):
                if SPLIT_KEY == 'buffer':
                    # Pre-computed splits stored on disk (files are numbered from 1)
                    train_states = list(pd.read_csv('/data/mosaiks/splits/' + country + '/train_' + str(i+1) 
                                                    + '.csv')['region'])
                    test_states = list(pd.read_csv('/data/mosaiks/splits/' + country + '/test_' + str(i+1) 
                                                   + '.csv')['region'])
                    train = df[df[MERGE_KEY].isin(train_states)]
                    test = df[df[MERGE_KEY].isin(test_states)]

                else:
                    # Random 75/25 split over the unique values of the split key,
                    # seeded by the simulation index for reproducibility
                    states = list(df[SPLIT_KEY].unique())
                    train_states, test_states = train_test_split(states, test_size=0.25, random_state=i)
                    train = df[df[SPLIT_KEY].isin(train_states)]
                    test = df[df[SPLIT_KEY].isin(test_states)]
            
                trains.append(train)
                tests.append(test)
                
            # Save the merged unit ids (with the DataFrame index) alongside the results
            if method == 'regression':
                df[[MERGE_KEY]].to_csv(OUTFILE_NAME + 'ids.csv', index=True)
            
            results = [simulation((i, train, test, TARGET, method))
                       for i, (train, test) in enumerate(zip(trains, tests))]
            results = pd.concat(results)
            results.to_csv(OUTFILE_NAME + TARGET + '_' + spatial + '.csv')
            clear_output(wait=True)
            print('Done!', end='\r')