In [31]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from scipy.interpolate import interp1d #pre written interpolation function
from statsmodels.base.model import GenericLikelihoodModel

In [32]:
#TODO:

# add state as parameters
# figure out max x using function
#does state start at 0 or at 1?

# parametrize cost function correctly
# what is the correct measure of distance
# fix class to extend generic likelihood model
# 
 
# dropping data
# 

The code below references the following two sources.

John Rust's website:
https://editorialexpress.com/jrust/nfxp.html

Victor Aguirregabiria and Pedro Mira's website:
http://individual.utoronto.ca/vaguirre/wpapers/program_code_survey_joe_2008.html

In [33]:
#fix the bus .dat because Aguirregabiria and Mira hate everyone
data = np.fromfile('bus1234.dat')
data = data.reshape(len(data)/6,6)
data = pd.DataFrame(data,columns=['id','group','year','month','replace','miles'])

#save to .csv so other people don't need to be confused
data.to_csv("bus1234.csv")

#switch to date time for ease 
data['date'] = pd.to_datetime(data[['year', 'month']].assign(Day=1))
data = data[['id','group','date','replace','miles']]

#lag date
date_lag = data.copy()
date_lag['date'] = date_lag['date'] - pd.DateOffset(months=1)
data = data.merge(date_lag, how='left', on=['id','group','date'] , suffixes=('','_next'))
data = data.dropna()

In [63]:
#constants
BETA = .97
GAMMA = .5772 #euler's constant

#size of step in discretization
STEP = 1000

id                              162
group                        530875
date            1985-04-01 00:00:00
replace                           1
miles                        388254
replace_next                      1
miles_next                   388254
dtype: object


In [62]:
def miles_pdf(x_obs, x_next, i_obs):
    """non-parametric estimation of mileage pdf.
    in other words, the probability of jumping dx states
    next period
    
    this corresponds to pdfdx in AM's code"""
    
    #figure out max number of steps
    dx = (1-i_obs)*(x_next - x_obs) + i_obs*x_obs 
    discrete = (dx/STEP).astype(int)
    
    #make a dataframe with the discrete 'jumps' in mileage
    frame = np.array([discrete, i_obs]).transpose()
    frame = pd.DataFrame(frame, columns=('dx','i'))

    #non-parametrically group jumps to make pdf
    pdfdx = frame.groupby('dx')
    pdfdx = pdfdx.count()/pdfdx.count().sum()
    return  np.array(pdfdx)


MILES_PDF = miles_pdf(data['miles'], data['miles_next'], data['replace'])

In [100]:
def transition_1(x_obs , x_next, i_obs):
    """calculate transitions probabilities,
    non-parametrically
    
    this corresponds to fmat1 and fmat2 in AM's code"""
    
    #transitions when i=1
    num_states = (x_obs.max()/STEP).astype(int) + 1
    pdfdx = miles_pdf(x_obs, x_next, i_obs).transpose()
    
    #zero probability of transitioning to large states
    zeros = np.zeros( (num_states,num_states-pdfdx.shape[1]) )
    
    #transitioning to first state and 'jumping' dx states
    fmat1 = np.tile(pdfdx,(num_states,1))
    fmat1 = np.concatenate( (fmat1, zeros), axis=1 )

    return fmat1

FMAT1 = transition_1(data['miles'],data['miles_next'],data['replace'])

In [142]:
def transition_0(x_obs , x_next, i_obs):
    """calculate transitions probabilities,
    non-parametrically
    
    this corresponds to fmat1 and fmat2 in AM's code"""
    
    
    num_states = (x_obs.max()/STEP).astype(int) + 1
    pdfdx = miles_pdf(x_obs, x_next, i_obs).transpose()
    
    #initialize fmat array, transitions when i=0
    end_zeros = np.zeros((1,num_states -pdfdx.shape[1]))
    fmat0 = np.concatenate( (pdfdx,end_zeros), axis=1 )

    for row in range(1,num_states):
        
        #this corresponds to colz i think
        cutoff = (num_states - row - pdfdx.shape[1] )
        
        #case 1 far enough from the 'end' of the matrix
        if cutoff >= 0:
            start_zeros = np.zeros((1,row))
            end_zeros = np.zeros((1,num_states -pdfdx.shape[1] - row))
            fmat_new = np.concatenate( (start_zeros,pdfdx,end_zeros), axis=1 )
            fmat0 = np.concatenate((fmat0,fmat_new))
       
        #case 2, too far from the end and need to adjust probs
        else:
            pdf_adj = pdfdx[:,0:cutoff]
            pdf_adj = pdf_adj/pdf_adj.sum(axis=1)
            
            start_zeros = np.zeros((1,row))
            fmat_new = np.concatenate( (start_zeros,pdf_adj), axis=1 )
            fmat0 = np.concatenate((fmat0,fmat_new))


    return fmat0

FMAT0 = transition_0(data['miles'],data['miles_next'],data['replace'])

In [None]:
def hm_initial_pr(a_obs, i_obs):
    """initial the probability of view a given state
    following AM
    
    seems like it just involves multinomial logit"""
    

In [28]:
def hm_value(theta1, cost, pr_obs, pr_trans):
    """calculate value function using hotz miller approach"""
    
    #set up matrices, transition is deterministic
    trans0, trans1 = pr_trans
    
    x = np.arange(1,STATES+1).reshape(STATES,1)*(MAX_X/STATES)
    

    #calculate value function for all state
    pr_tile = np.tile( pr_obs.reshape(STATES,1), (1,STATES))
    
    denom = (np.identity(STATES) - BETA*(1-pr_tile)*trans0 - BETA*trans1*pr_tile)
    
    numer = ( (1-pr_obs)*(theta1*x  + GAMMA - np.log(1-pr_obs)) + 
                 pr_obs*(cost+ GAMMA - np.log(pr_obs) ) )
    
    value = np.linalg.inv(denom).dot(numer)
    return value



hm_value(-1e-5, -3, PR_OBS, PR_TRANS)

array([[-26.23284597],
       [-41.33685875],
       [-50.35503324],
       [-55.90815079],
       [-64.53219932]])

In [33]:
def hm_prob(theta1, cost, pr_obs, pr_trans):
    """calculate kappa using value function"""
    
    value = hm_value(theta1, cost, pr_obs, pr_trans)
    trans0, trans1 = pr_trans
    x = np.arange(1,STATES+1).reshape(STATES,1)*(MAX_X/STATES)
    
    delta1 = np.exp( cost + BETA*trans1.dot(value))
    delta0 = np.exp( x*theta1 + BETA*trans0.dot(value) )
    
    return delta1/(delta1+delta0)

hm_prob(-1e-5, -3, PR_OBS, PR_TRANS)

array([[0.16389381],
       [0.99999867],
       [1.        ],
       [1.        ],
       [1.        ]])

In [39]:
from scipy.optimize import minimize


class HotzMiller():
    """class for estimating the values of R and theta
    using the Hotz Miller routine and the helper functions
    above"""
    
    def __init__(self, x, x_next, i):
        
        #transitions
        self.pr_obs = hm_initial_pr(x,i)
        self.trans =  hm_transitions(0,x,x_next,i), hm_transitions(1,x,x_next,i)

        #data
        self.x = x
        self.x_next = x
        self.i = i
        
        #parameters
        self.theta1 = 0
        self.R = 0
        
        
    def likelihood(self, params): 
        theta1, R = params
        
        # Input our data into the model
        interval = int(MAX_X/STATES) + 1
        i = self.i
        x = (self.x/interval).astype(int)
        
        #set up hm state pr
        prob = hm_prob(theta1, R, self.pr_obs, self.trans).transpose()[0]
        prob = interp1d(np.arange(1,STATES+1), prob,fill_value="extrapolate")
        
        log_likelihood = (1-i)*np.log(1-prob(x)) + i*np.log(prob(x))
        
        return -log_likelihood.sum()
    
    
    def fit(self):
        result = minimize(self.likelihood, [-1e-5,-3], method = 'Nelder-Mead', options={'disp': False})
        self.theta1, self.R = result.x

        
model_hm = HotzMiller(data_safe['miles'],data_safe['miles_next'], data_safe['replace'])
model_hm.fit()


print '\n theta_1:%s, R:%s'%(round(model_hm.theta1,4) , round(model_hm.R,4))


 theta_1:-0.0, R:-5.5657


