In [1]:
# Griffin Davis, The University of Texas at Dallas
# (C) 2022
# Data source:
# Chetty, Raj; Friedman, John; Hendren, Nathaniel; Jones, Maggie R.; Porter, Sonya R., 2022, 
# "Replication Data for: The Opportunity Atlas: Mapping the Childhood Roots of Social Mobility", 
# https://doi.org/10.7910/DVN/NKCQM1, Harvard Dataverse, V1, UNF:6:wwWmCZy1LUqtq02qHdCKFQ== [fileUNF] 

import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.distributions import Normal
nnF = nn.functional

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Download data
!wget -nc https://personal.utdallas.edu/~gcd/data/tract_merged.csv
ds = pd.read_csv('tract_merged.csv')

File ‘tract_merged.csv’ already there; not retrieving.



In [2]:
# Get subset of columns
cols = ['id', 'hhinc_mean2000', 'mean_commutetime2000', 'frac_coll_plus2000', 'frac_coll_plus2010', 
        'med_hhinc1990', 'med_hhinc2016', 'popdensity2000', 'poor_share2010', 'poor_share2000', 
        'poor_share1990', 'gsmn_math_g3_2013', 'traveltime15_2010', 'emp2000', 'singleparent_share1990',
        'singleparent_share2010', 'singleparent_share2000', 
        'mail_return_rate2010', 'jobs_total_5mi_2015', 'jobs_highpay_5mi_2015', 
        'popdensity2010', 'job_density_2013', 'kfr_pooled_pooled_p1', 
        'kfr_pooled_pooled_p25', 'kfr_pooled_pooled_p50', 'kfr_pooled_pooled_p75', 'kfr_pooled_pooled_p100']

exluded = ['rent_twobed2015', 'ln_wage_growth_hs_grad', 'ann_avg_job_growth_2004_2013']

ds_full = ds

# Handle null data
ds = ds[ds.columns[ds.columns.isin(cols)]]
ds = ds.dropna()

In [3]:
# Shuffle split the data into training and test sets (75% / 25%)
train, test = train_test_split(ds)

train_X = train.loc[:,'hhinc_mean2000':'job_density_2013']
test_X = test.loc[:,'hhinc_mean2000':'job_density_2013']

percentiles = ['kfr_pooled_pooled_p25', 'kfr_pooled_pooled_p50', 'kfr_pooled_pooled_p75', 'kfr_pooled_pooled_p100']
train_Y = train.loc[:, percentiles[0]]
test_Y = test.loc[:, percentiles[0]]

In [21]:
# Simple Mixture Density Network, without handling for missing input features
features = train_X.shape[1]

class Net(nn.Module):
    def __init__(self, features, hidden_dim, out_dim):
        super(Net, self).__init__()
        self.seq = nn.Sequential(
            nn.Linear(features, hidden_dim),
            nn.Sigmoid(),
            nn.Linear(hidden_dim, out_dim),
            nn.ReLU()
        )
    
    def forward(self, x):
        params = self.seq(x)
        mu, sigma = torch.tensor_split(params, params.shape[0], dim=0)
        
        return mu, nnF.elu(sigma)+1+1e-7
    
    def loss(self, x, y):
        mu, sigma = self.forward(x)
        dist = Normal(mu, sigma)
        return -dist.log_prob(y)
    
mdn = Net(features, 12, 2).double()
optimizer = torch.optim.SGD(mdn.parameters(), lr=1e-3)

print(mdn)

Net(
  (seq): Sequential(
    (0): Linear(in_features=21, out_features=12, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=12, out_features=2, bias=True)
    (3): ReLU()
  )
)


In [22]:
def train_mdn(mdn, train_X, train_Y, optimizer):
    mdn.train()
    
    train_X = train_X.reset_index().iloc[:, 1:]
    train_Y = train_Y.reset_index().iloc[:, 1:].squeeze()

    for index, row in train_X.iterrows():
        x = torch.tensor(np.double(row.values))
        y = torch.tensor(np.double(train_Y.iloc[index]))
        
        loss = mdn.loss(x, y)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if index % 10000 == 0:
            print(f"index: {index} loss: {loss}")

train_mdn(mdn, train_X, train_Y, optimizer)

index: 0 loss: tensor([0.9963], dtype=torch.float64, grad_fn=<NegBackward0>)
index: 10000 loss: tensor([0.9208], dtype=torch.float64, grad_fn=<NegBackward0>)
index: 20000 loss: tensor([0.9231], dtype=torch.float64, grad_fn=<NegBackward0>)
index: 30000 loss: tensor([0.9211], dtype=torch.float64, grad_fn=<NegBackward0>)
index: 40000 loss: tensor([0.9201], dtype=torch.float64, grad_fn=<NegBackward0>)
index: 50000 loss: tensor([0.9190], dtype=torch.float64, grad_fn=<NegBackward0>)


In [23]:
def test_mdn(mdn, test_X, test_Y):
    mdn.eval()
    
    test_X = test_X.reset_index().iloc[:, 1:]
    test_Y = test_Y.reset_index().iloc[:, 1:].squeeze()

    test_loss = 0
    sq_er = []
    with torch.no_grad():
        for index, row in test_X.iterrows():
            x = torch.tensor(np.double(row.values))
            y = torch.tensor(np.double(test_Y.iloc[index]))
            loss = mdn.loss(x, y)
            test_loss += loss.item()

            mu, sigma = mdn.forward(x)

            sq_er.append((mu.item() - y)**2)

    test_loss /= test_X.shape[0]
    
    print(f"Avg test loss: {test_loss}")
    print(f"Mean squared error: {np.mean(sq_er)}")

test_mdn(mdn, test_X, test_Y)

Avg test loss: 0.9213506576693299
Mean squared error: 0.004824049894130483
