In [1]:
# Griffin Davis, The University of Texas at Dallas
# (C) 2022
# Data source:
# Chetty, Raj; Friedman, John; Hendren, Nathaniel; Jones, Maggie R.; Porter, Sonya R., 2022, 
# "Replication Data for: The Opportunity Atlas: Mapping the Childhood Roots of Social Mobility", 
# https://doi.org/10.7910/DVN/NKCQM1, Harvard Dataverse, V1, UNF:6:wwWmCZy1LUqtq02qHdCKFQ== [fileUNF] 

import os
import time
import logging
from importlib import reload
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse
from tqdm.notebook import tqdm

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, KFold

import torch
from torch import nn
from torch.distributions import Normal
nnF = nn.functional

if not os.path.exists('logs'):
    os.makedirs('logs')

reload(logging) # Notebook workaround
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(threadName)s] [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("logs/mdn.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger()

# Download data
!wget -nc https://personal.utdallas.edu/~gcd/data/tract_merged.csv
ds = pd.read_csv('tract_merged.csv')

File ‘tract_merged.csv’ already there; not retrieving.



In [2]:
# Get subset of columns
cols = ['id', 'hhinc_mean2000', 'mean_commutetime2000', 'frac_coll_plus2000', 'frac_coll_plus2010', 
        'med_hhinc1990', 'med_hhinc2016', 'popdensity2000', 'poor_share2010', 'poor_share2000', 
        'poor_share1990', 'gsmn_math_g3_2013', 'traveltime15_2010', 'emp2000', 'singleparent_share1990',
        'singleparent_share2010', 'singleparent_share2000', 
        'mail_return_rate2010', 'jobs_total_5mi_2015', 'jobs_highpay_5mi_2015', 
        'popdensity2010', 'job_density_2013', 'kfr_pooled_pooled_p1', 
        'kfr_pooled_pooled_p25', 'kfr_pooled_pooled_p50', 'kfr_pooled_pooled_p75', 'kfr_pooled_pooled_p100']

excluded = ['rent_twobed2015', 'ln_wage_growth_hs_grad', 'ann_avg_job_growth_2004_2013']

full_cols = cols + excluded

# Handle null data
ds_full = ds[ds.columns[ds.columns.isin(full_cols)]]
ds = ds[ds.columns[ds.columns.isin(cols)]]
ds = ds.dropna()

In [20]:
# Shuffle split the data into training and test sets (75% / 25%)
train, test = train_test_split(ds)

train_X = train.loc[:,'hhinc_mean2000':'job_density_2013']
test_X = test.loc[:,'hhinc_mean2000':'job_density_2013']

percentiles = ['kfr_pooled_pooled_p1', 'kfr_pooled_pooled_p25', 'kfr_pooled_pooled_p50', 'kfr_pooled_pooled_p75']
train_Y = train.loc[:, percentiles[1]]
test_Y = test.loc[:, percentiles[1]]

# Reset indexes and convert Y to pd.Series
train_X.reset_index(drop=True, inplace=True)
train_Y = train_Y.reset_index(drop=True).squeeze()
test_X.reset_index(drop=True, inplace=True)
test_Y = test_Y.reset_index(drop=True).squeeze()

In [4]:
# Simple Mixture Density Network, without handling for missing input features
features = train_X.shape[1]

class Net(nn.Module):
    def __init__(self, features, hidden_dim, out_dim):
        super(Net, self).__init__()
        self.seq = nn.Sequential(
            nn.Linear(features, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, out_dim),
            nn.ReLU()
        )
    
    def forward(self, x):
        params = self.seq(x)
        mu, sigma = torch.tensor_split(params, params.shape[0], dim=0)
        
        return mu, sigma+1
    
    def loss(self, x, y):
        mu, sigma = self.forward(x)
        dist = Normal(mu, sigma)
        return -dist.log_prob(y)

In [8]:
def train_mdn(mdn, X, Y, optimizer, verbose):
    mdn.train()
    
    X.reset_index(drop=True, inplace=True)
    Y.reset_index(drop=True, inplace=True)
    
    for index, row in X.iterrows():
        x = torch.tensor(np.double(row.values))
        y = torch.tensor(np.double(Y.iloc[index]))
        
        loss = mdn.loss(x, y)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if verbose and index % 10000 == 0:
            logging.info(f"index: {index} loss: {loss}")

In [6]:
def test_mdn(mdn, X, Y, verbose):
    mdn.eval()

    X.reset_index(drop=True, inplace=True)
    Y.reset_index(drop=True, inplace=True)

    test_loss = 0
    sq_er = []
    with torch.no_grad():
        for index, row in X.iterrows():
            x = torch.tensor(np.double(row.values))
            y = torch.tensor(np.double(Y.iloc[index]))
            loss = mdn.loss(x, y)
            test_loss += loss.item()

            mu, sigma = mdn.forward(x)

            sq_er.append((mu.item() - y)**2)

    test_loss /= test_X.shape[0]
    
    if verbose:
        logging.info(f"Avg test loss: {test_loss}")
        logging.info(f"Mean squared error: {np.mean(sq_er)}")
    
    return test_loss

In [194]:
# Determine number of Sigmoid activations to use in hidden layer
# 10-fold hyperparameter cross validation using training data
kf = KFold(n_splits=10)

losses = {}

# Setup folder to save hypervalidation models
hyperPath = 'hypervalidation'
if not os.path.exists(hyperPath):
    os.makedirs(hyperPath)

# Try each option for number of Sigmoid activations
for hidden_dim in tqdm(range(3, 30)):
    # Create network with that hyperparameter
    mdn = Net(features, hidden_dim, 2).double()
    optimizer = torch.optim.Adam(mdn.parameters(), lr=0.0001)

    # Do 10-fold cross validation and store results of tests in array
    dim_loss = []
    for train_index, test_index in kf.split(train_X):
        X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
        Y_train, Y_test = train_Y.iloc[train_index], train_Y.iloc[test_index]

        train_mdn(mdn, X_train, Y_train, optimizer, False)
        dim_loss.append(test_mdn(mdn, X_test, Y_test, False))
    
    # Store average test results for this hyperparameter option
    losses[hidden_dim] = np.mean(dim_loss)

    # Save the hypervalidation model
    torch.save(mdn, f'{hyperPath}/{hidden_dim}_activ_mdn.pt')

    logging.info("Dimension = " + str(hidden_dim) + ", Loss = " + str(losses[hidden_dim]))

# Select best hyperparameter
min_loss = np.inf
min_dim = 0
for hidden_dim in losses:
    if losses[hidden_dim] < min_loss:
        min_loss = losses[hidden_dim]
        min_dim = hidden_dim

logging.info("Selected number of sigmoid activations: " + str(min_dim))

  0%|          | 0/27 [00:00<?, ?it/s]

Dimension = 3, Loss = 0.2768381263867698
Dimension = 4, Loss = 0.2789649501573936
Dimension = 5, Loss = 0.29557487171902286
Dimension = 6, Loss = 0.29557487171902286
Dimension = 7, Loss = 0.27678297429819837
Dimension = 8, Loss = 0.27677964461746984
Dimension = 9, Loss = 0.2955755801044583
Dimension = 10, Loss = 0.2767739615945039
Dimension = 11, Loss = 0.2767723332257522
Dimension = 12, Loss = 0.27678039662463905
Dimension = 13, Loss = 0.27676444063356503
Dimension = 14, Loss = 0.29557487171902286
Dimension = 15, Loss = 0.2767818423319893
Dimension = 16, Loss = 0.27676558481836744
Dimension = 17, Loss = 0.2767831932562505
Dimension = 18, Loss = 0.2767636130155072
Dimension = 19, Loss = 0.2767720795720673
Dimension = 20, Loss = 0.29557487171902286
Dimension = 21, Loss = 0.27677251045612
Dimension = 22, Loss = 0.2767821537030486
Dimension = 23, Loss = 0.29557487171902286
Dimension = 24, Loss = 0.29557487171902286
Dimension = 25, Loss = 0.27674490638982185
Dimension = 26, Loss = 0.276776

In [21]:
# Retrain with full training dataset (selected 10)
mdn = Net(features, 10, 2).double()
optimizer = torch.optim.Adam(mdn.parameters(), lr=0.0001)

train_mdn(mdn, train_X, train_Y, optimizer, True)

# Save the final model
torch.save(mdn, f'simple_mdn.pt')

2023-03-28 08:44:24,486 [MainThread] [INFO] index: 0 loss: tensor([1.0017], dtype=torch.float64, grad_fn=<NegBackward0>)
2023-03-28 08:44:30,120 [MainThread] [INFO] index: 10000 loss: tensor([1.0762], dtype=torch.float64, grad_fn=<NegBackward0>)
2023-03-28 08:44:35,730 [MainThread] [INFO] index: 20000 loss: tensor([0.9871], dtype=torch.float64, grad_fn=<NegBackward0>)
2023-03-28 08:44:41,284 [MainThread] [INFO] index: 30000 loss: tensor([1.0149], dtype=torch.float64, grad_fn=<NegBackward0>)
2023-03-28 08:44:46,940 [MainThread] [INFO] index: 40000 loss: tensor([0.9803], dtype=torch.float64, grad_fn=<NegBackward0>)
2023-03-28 08:44:52,589 [MainThread] [INFO] index: 50000 loss: tensor([1.0029], dtype=torch.float64, grad_fn=<NegBackward0>)


In [22]:
# Test with reserved data
test_mdn(mdn, test_X, test_Y, True)

2023-03-28 08:44:57,976 [MainThread] [INFO] Avg test loss: 1.0132598407689926
2023-03-28 08:44:58,013 [MainThread] [INFO] Mean squared error: 0.18864261512864228


1.0132598407689926