In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from scipy.interpolate import interp1d #pre written interpolation function
from statsmodels.base.model import GenericLikelihoodModel

The code below references the following two sources:

John Rust's website
https://editorialexpress.com/jrust/nfxp.html

Victor Aguirregabiria and Pedro Mira's website
http://individual.utoronto.ca/vaguirre/wpapers/program_code_survey_joe_2008.html

In [105]:
#constants
BETA = .97
MAX_X = data['miles'].max()
MIN_X = data['miles'].min()
STATES = 5

In [106]:
#fix the bus .dat because Aguirregabiria and Mira hate everyone
data = np.fromfile('bus1234.dat')
data = data.reshape(len(data)/6,6)
data = pd.DataFrame(data,columns=['id','group','year','month','replace','miles'])

#save to .csv so other people don't need to be confused
data.to_csv("bus1234.csv")

In [107]:
#replace the separate year/month columns with a single month-start timestamp
date_parts = data[['year', 'month']].assign(Day=1)
data['date'] = pd.to_datetime(date_parts)
data = data[['id','group','date','replace','miles']]

#shift a copy back one month so that, after the merge, each row also carries
#the following month's values for the same bus (suffix '_next')
date_lag = data.assign(date=data['date'] - pd.DateOffset(months=1))
data = data.merge(date_lag,
                  how='left',
                  on=['id','group','date'],
                  suffixes=('','_next'))

In [108]:
def hm_initial_pr(x_obs, i_obs, max_x=None, states=None):
    """Estimate the mean of ``i_obs`` within each discretised state of ``x_obs``.

    ``x_obs`` is binned into integer states of equal width
    ``int(max_x / states) + 1`` and the average of ``i_obs`` is taken per bin
    (for a 0/1 indicator this is the observed choice frequency).

    Parameters
    ----------
    x_obs : pandas Series or numpy array of the continuous state variable.
    i_obs : array-like of the same length, the observed choice.
    max_x : optional upper bound used to size the bins; defaults to the
        module-level ``MAX_X``.
    states : optional number of bins; defaults to the module-level ``STATES``.

    Returns
    -------
    numpy array of shape (k, 1), one row per state that actually occurs in
    ``x_obs``, ordered by state. States with no observations are absent, so
    k can be smaller than ``states``.
    """
    #fall back to the notebook-level constants only when no override is given
    n_states = STATES if states is None else states
    upper = MAX_X if max_x is None else max_x

    interval = int(upper / n_states) + 1
    discrete = (x_obs / interval).astype(int)

    #np.asarray drops any pandas index so the two columns pair up by position
    frame = pd.DataFrame({'x': np.asarray(discrete), 'i': np.asarray(i_obs)})

    #per-state mean of the indicator (equivalent to sum / count)
    return np.array(frame.groupby('x').mean())


#observed replacement frequency in each mileage state
result1 = hm_initial_pr(data['miles'], data['replace'])
#function-call form prints identically on Python 2 and also runs on Python 3
print(result1)

[[0.        ]
 [0.00296108]
 [0.01472393]
 [0.0290404 ]
 [0.03225806]]


In [109]:
#drop every row that contains a NaN, e.g. rows for which the left merge
#found no matching next-month record
data_safe = data.dropna(axis=0, how='any')

def hm_transitions(i, x_obs , x_next, i_obs, max_x=None, states=None):
    """Estimate state-transition frequencies non-parametrically for choice ``i``.

    ``x_obs`` and ``x_next`` are binned into integer states of equal width
    ``int(max_x / states) + 1``. Using only observations whose choice equals
    ``i``, entry [r, c] is the share of observations in state r that are in
    state c next period.

    Parameters
    ----------
    i : choice value selecting which observations to use.
    x_obs, x_next : pandas Series or numpy arrays with the current and
        next-period continuous state.
    i_obs : array-like with the observed choice for each observation.
    max_x : optional upper bound used to size the bins; defaults to the
        module-level ``MAX_X``.
    states : optional number of states; defaults to the module-level ``STATES``.

    Returns
    -------
    numpy array of shape (states, states).
    """
    #fall back to the notebook-level constants only when no override is given
    n_states = STATES if states is None else states
    upper = MAX_X if max_x is None else max_x

    interval = int(upper / n_states) + 1
    #np.asarray drops any pandas index so the arrays pair up by position
    cur = np.asarray((x_obs / interval).astype(int))
    nxt = np.asarray((x_next / interval).astype(int))

    #transition for this value of i
    chosen = np.asarray(i_obs) == i
    cur, nxt = cur[chosen], nxt[chosen]

    #pairs that fall outside the states x states grid do not enter the matrix,
    #but still count towards the total of their (in-grid) origin state
    cur_ok = (cur >= 0) & (cur < n_states)
    pair_ok = cur_ok & (nxt >= 0) & (nxt < n_states)

    row_totals = np.bincount(cur[cur_ok], minlength=n_states).astype(float)
    pair_counts = np.zeros((n_states, n_states))
    #np.add.at accumulates correctly when the same (r, c) pair repeats
    np.add.at(pair_counts, (cur[pair_ok], nxt[pair_ok]), 1)

    #states never observed keep an all-zero row instead of dividing by zero
    safe_totals = np.where(row_totals > 0, row_totals, 1.0).reshape(n_states, 1)
    trans = pair_counts / safe_totals

    #fill in missing probability mass
    #NOTE(review): `1 - row_mass` has shape (states,) and therefore broadcasts
    #across columns: entry [r, c] is raised to the missing mass of row c, not
    #of row r. When row c has no observations this sets the whole column c to
    #1. The arithmetic is kept as originally written; confirm the intended
    #rule for unobserved states before relying on such rows.
    row_mass = trans.sum(axis=1)
    trans = np.maximum(trans, 1 - row_mass)

    return trans


#transition matrix estimated from the observations with replace == 0
hm_transitions(
    0,
    x_obs=data_safe['miles'],
    x_next=data_safe['miles_next'],
    i_obs=data_safe['replace']
)

array([[0.95945946, 0.04054054, 0.        , 0.        , 0.        ],
       [0.        , 0.96289905, 0.03710095, 0.        , 0.        ],
       [0.        , 0.        , 0.96802508, 0.03197492, 0.        ],
       [0.        , 0.        , 0.        , 0.98537234, 0.01462766],
       [0.        , 0.        , 0.        , 0.        , 1.        ]])

In [46]:
def hm_value(a_max, theta1, cost, pr_obs):
    """Calculate the value function using the Hotz-Miller approach.

    Solves the linear system ``denom . value = numer`` where ``denom`` is the
    identity minus BETA times the choice-probability-weighted transition
    matrix and ``numer`` is the expected per-period payoff under ``pr_obs``.

    Parameters
    ----------
    a_max : number of states; states are labelled 1..a_max.
    theta1 : coefficient on the state in the payoff of choice 0.
    cost : payoff constant of choice 1.
    pr_obs : array-like with ``a_max`` entries, the probability of choice 1
        in each state. Entries equal to 0 or 1 are allowed.

    Returns
    -------
    numpy array of shape (a_max, 1) with the value of each state.

    Notes
    -----
    Reads ``BETA`` and ``GAMMA`` from module scope. ``GAMMA`` is not defined
    in the visible part of the notebook (presumably Euler's constant -
    confirm).
    """
    #function-scope import keeps this cell self-contained
    from scipy.special import xlogy

    #set up matrices, transition is deterministic
    #NOTE(review): this expects a one-argument hm_transitions returning the
    #pair (trans0, trans1); the hm_transitions defined in this notebook takes
    #(i, x_obs, x_next, i_obs) and returns one matrix - confirm which
    #definition is meant to be in scope here.
    trans0, trans1 = hm_transitions(a_max)
    a = np.arange(1,a_max+1).reshape(a_max,1)

    #force a column so that row s of every product below is weighted by the
    #probability of state s (a 1-D input used to broadcast into a matrix)
    pr = np.asarray(pr_obs, dtype=float).reshape(a_max,1)

    #calculate value function for all states
    denom = np.identity(a_max) - BETA*(1-pr)*trans0 - BETA*pr*trans1

    #xlogy(p, p) is p*log(p) with the limit 0 at p == 0, so a state in which
    #one choice is never observed no longer turns into 0*inf = nan
    numer = ( (1-pr)*(theta1*a + GAMMA) - xlogy(1-pr, 1-pr) +
              pr*(cost + GAMMA) - xlogy(pr, pr) )

    #solve the system directly rather than forming the explicit inverse
    value = np.linalg.solve(denom, numer)
    return value


def hm_prob(a_max, theta1, cost, pr_obs):
    """Calculate the logit probability of choice 1 implied by the value function.

    Builds the choice-specific values
    ``v1 = cost + BETA * trans1 . value`` and
    ``v0 = a * theta1 + BETA * trans0 . value`` and returns
    ``exp(v1) / (exp(v1) + exp(v0))``.

    Parameters
    ----------
    a_max : number of states; states are labelled 1..a_max.
    theta1 : coefficient on the state in the payoff of choice 0.
    cost : payoff constant of choice 1.
    pr_obs : probability of choice 1 in each state, passed on to ``hm_value``.

    Returns
    -------
    numpy array with one probability per state, shaped like
    ``trans1.dot(value)``.
    """
    #function-scope import keeps this cell self-contained
    from scipy.special import expit

    value = hm_value(a_max, theta1, cost, pr_obs)
    #NOTE(review): this expects a one-argument hm_transitions returning the
    #pair (trans0, trans1); the hm_transitions defined in this notebook takes
    #(i, x_obs, x_next, i_obs) and returns one matrix - confirm which
    #definition is meant to be in scope here.
    trans0,trans1 = hm_transitions(a_max)
    a = np.arange(1,a_max+1).reshape(a_max,1)

    v1 = cost + BETA*trans1.dot(value)
    v0 = a*theta1 + BETA*trans0.dot(value)

    #expit(v1 - v0) equals exp(v1)/(exp(v1)+exp(v0)) but cannot overflow to
    #inf/inf = nan when the values are large
    return expit(v1 - v0)