In [None]:
import time
import torch
import torch.nn as nn
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets
from typing import List, Union

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
import os
import sys
import contextlib

import pandas as pd

@contextlib.contextmanager
def suppress_output():
    """Silence Python-level output for the duration of a ``with`` block.

    While the context is active, ``sys.stdout`` and ``sys.stderr`` both point
    at a write handle on ``os.devnull``.  On exit (normal or via an exception)
    the previous stream objects are put back and the handle is closed.

    Only the ``sys.stdout`` / ``sys.stderr`` attributes are swapped, so
    anything that writes straight to the underlying file descriptors is not
    affected.
    """
    with open(os.devnull, "w") as sink, \
            contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink):
        yield


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure images.
%config InlineBackend.figure_format = 'retina'

# Three-colour hex palette (use sites are outside this section of the notebook).
COLORS = ['#1b9e77', '#d95f02', '#7570b3']  # teal, orange, purple

# Directory containing the input CSV files.  Defaults to the original
# location; set the HURRICANE_MIGRATION_DATA_DIR environment variable to run
# the notebook against a different copy of the data without editing this cell.
# Kept as a plain str because later cells build paths by string concatenation.
DATA_DIR = (
    os.environ.get('HURRICANE_MIGRATION_DATA_DIR')
    or '/glade/u/home/kmacmanu/d4-columbia-hurricane-migration/contributors/fcottier'
)

In [None]:
# Full path of the input CSV inside DATA_DIR.
inputDS = f'{DATA_DIR}{os.sep}df_migration_hurricane.csv'

# Load the CSV into a DataFrame and show its column labels as the cell output.
inputDF = pd.read_csv(inputDS)
inputDF.columns

In [None]:
# Add the natural log of `pop_2000` as a new column `pop_2000_ln`.
# NOTE(review): np.log yields -inf for 0 and NaN for negative inputs — confirm
# that `pop_2000` is strictly positive.
inputDF['pop_2000_ln'] = inputDF['pop_2000'].pipe(np.log)

In [None]:
# Predictor frame: the twelve feature columns used as model inputs.
Xp = inputDF[[
    'avg_gustW_peak', 'avg_sustW_max',
    'max_gustW_peak', 'max_sustW_max',
    'pop_2000_ln', 'medIncHH_2000', 'povertyL_2000', 'unempl_2000',
    'ownerL_2000', 'afroAmerL_2000', 'rural_L_2000', 'INLECZ',
]]

# Target frame: identifier columns plus the `nOutMigr_ln` target.
# Take an explicit copy: a later cell modifies T's `nOutMigr_ln` column, and
# doing that on a column-subset of inputDF triggers SettingWithCopyWarning.
T = inputDF[['GEOID', 'year', "nOutMigr_ln"]].copy()

print(f'{Xp.shape=}, {T.shape=}')

In [None]:
# Demean the target within each GEOID: subtract the group's mean `nOutMigr_ln`
# from every row of that group.
# `assign` builds a new frame instead of writing into T in place, which avoids
# SettingWithCopyWarning (T was selected out of inputDF), and the built-in
# "mean" transform replaces a per-group Python lambda.
T = T.assign(
    nOutMigr_ln=T["nOutMigr_ln"] - T.groupby("GEOID")["nOutMigr_ln"].transform("mean")
)

In [None]:
# Z-score every column of Xp: subtract the column mean, then divide by the
# column's sample standard deviation.
# NOTE(review): the statistics are computed over all rows, including the
# year >= 2018 rows that partition() later assigns to the test split — confirm
# this is intended.
Xp = Xp.sub(Xp.mean(axis=0)).div(Xp.std(axis=0))

In [None]:
# Put the unscaled identifier columns back in front of the standardized
# features so the frame can later be split by year.
# Any GEOID/year columns already present in Xp are dropped first, so running
# this cell more than once does not pile up duplicate identifier columns.
Xp = pd.concat(
    [
        inputDF[['GEOID', 'year']],
        Xp.drop(columns=['GEOID', 'year'], errors='ignore'),
    ],
    axis=1,
)
Xp

In [None]:
def partition(X: pd.DataFrame, T: pd.DataFrame, split_year: int = 2018) -> "tuple[pd.DataFrame, ...]":
    """Split features and targets into train/test sets by calendar year.

    Rows with ``year < split_year`` go to the training set and rows with
    ``year >= split_year`` go to the test set.  Rows whose ``year`` is missing
    satisfy neither comparison and therefore appear in neither split.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame; must contain ``GEOID`` and ``year`` columns.
    T : pd.DataFrame
        Target frame; must contain ``GEOID`` and ``year`` columns.
    split_year : int, default 2018
        First year that belongs to the test set.

    Returns
    -------
    tuple of six DataFrames
        ``(Xtrain, Ttrain, Xtest, Ttest, idTrain, idTest)``.  The X/T frames
        have the ``GEOID`` and ``year`` columns removed; ``idTrain`` and
        ``idTest`` hold only those two columns, taken from ``X``.  Original
        row indices are preserved.

    Raises
    ------
    KeyError
        If ``X`` or ``T`` lacks a ``GEOID`` or ``year`` column.
    """
    id_cols = ['GEOID', 'year']

    # Fail early with a clear message rather than deep inside pandas.
    for label, frame in (('X', X), ('T', T)):
        missing = [col for col in id_cols if col not in frame.columns]
        if missing:
            raise KeyError(f'{label} is missing required column(s): {missing}')

    # Each frame is masked on its own `year` column so X and T do not have to
    # share an index.
    x_is_train = X['year'] < split_year
    x_is_test = X['year'] >= split_year
    t_is_train = T['year'] < split_year
    t_is_test = T['year'] >= split_year

    Xtrain = X[x_is_train].drop(columns=id_cols)
    Xtest = X[x_is_test].drop(columns=id_cols)
    Ttrain = T[t_is_train].drop(columns=id_cols)
    Ttest = T[t_is_test].drop(columns=id_cols)

    # Keep the identifiers so rows of each split can be traced back.
    idTrain = X.loc[x_is_train, id_cols]
    idTest = X.loc[x_is_test, id_cols]

    return Xtrain, Ttrain, Xtest, Ttest, idTrain, idTest
   
# Run the year-based split on the feature and target frames, then report the
# shape of every returned piece.
(Xtrain, Ttrain,
 Xtest, Ttest,
 idTrain, idTest) = partition(Xp, T)
print(f'{Xtrain.shape=}, {Ttrain.shape=}, {Xtest.shape=}, {Ttest.shape=}, {idTrain.shape=}, {idTest.shape=}')
