In [14]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from scipy.interpolate import interp1d #pre written interpolation function
from statsmodels.base.model import GenericLikelihoodModel

In [None]:
#TODO:

# add state as parameters
# figure out max x using function
#does state start at 0 or at 1?

# parametrize cost function correctly
# what is the correct measure of distance
# fix class to extend generic likelihood model
# 
 
# dropping data
# 

The code below references the following two sources.

John Rust's website:
https://editorialexpress.com/jrust/nfxp.html

Victor Aguirregabiria and Pedro Mira's website:
http://individual.utoronto.ca/vaguirre/wpapers/program_code_survey_joe_2008.html

In [15]:
#fix the bus .dat because Aguirregabiria and Mira hate everyone
data = np.fromfile('bus1234.dat')
data = data.reshape(len(data)/6,6)
data = pd.DataFrame(data,columns=['id','group','year','month','replace','miles'])

#save to .csv so other people don't need to be confused
data.to_csv("bus1234.csv")

In [16]:
#constants
BETA = .97
GAMMA = .5772 #euler's constant

#discretization
MAX_X = data['miles'].max()
MIN_X = data['miles'].min()
STATES = 5
print MAX_X

388254.0


In [17]:
#switch to date time for ease 
data['date'] = pd.to_datetime(data[['year', 'month']].assign(Day=1))
data = data[['id','group','date','replace','miles']]

#lag date
date_lag = data.copy()
date_lag['date'] = date_lag['date'] - pd.DateOffset(months=1)
data = data.merge(date_lag, how='left', on=['id','group','date'] , suffixes=('','_next'))

In [18]:
def hm_initial_pr(x_obs, i_obs):
    """calculate state pr"""
    
    interval = int(MAX_X/STATES) + 1
    
    discrete = (x_obs/interval).astype(int)
    
    df = np.array([discrete, i_obs]).transpose()
    df = pd.DataFrame(df, columns=('x','i'))
    pr_obs = df.groupby('x')
    pr_obs = pr_obs.sum()/(1.*pr_obs.count())

    return  np.maximum(.001, np.array(pr_obs))

PR_OBS = hm_initial_pr(data['miles'],data['replace'])

In [19]:
#dropna values from lag
data_safe = data.dropna()

def hm_transitions(i, x_obs , x_next, i_obs):
    """calculate transitions probabilities,
    non-parametrically"""
    
    interval = int(MAX_X/STATES) + 1
    discrete = (x_obs/interval).astype(int)
    discrete_next = (x_next/interval).astype(int)
    
    #set up dataframe with columns
    df = np.array([discrete, discrete_next, i_obs ]).transpose()
    df = pd.DataFrame(df, columns=('x','x_next','i'))
    
    #transition for this value of i,
    df = df.loc[df['i'] == i]
    pr_obs = df.groupby(['x','x_next']).count()/df[['i','x']].groupby('x').count()
    
    #set up transitions matrix
    trans = np.arange(STATES)
    trans = np.transpose([np.repeat(trans, STATES), 
                          np.tile(trans, STATES)] ) #cartesian product of states
    trans = pd.DataFrame(trans,columns=('x','x_next'))
    
    #merge with transition pr, 0 if nothing observed
    trans = trans.merge(pr_obs, how = 'left', on=['x','x_next'])
    trans = np.array(trans.fillna(0)['i']).reshape(STATES,STATES)
    
    #prevent column from being 0
    bad_cols = trans.sum(axis=1)
    trans[:,] = np.maximum(trans,1-bad_cols)
    
    return trans

TRANS0 = hm_transitions(0,data_safe['miles'],data_safe['miles_next'],data_safe['replace'])
TRANS1 = hm_transitions(1,data_safe['miles'],data_safe['miles_next'],data_safe['replace'])
PR_TRANS = TRANS0, TRANS1

In [28]:
def hm_value(theta1, cost, pr_obs, pr_trans):
    """calculate value function using hotz miller approach"""
    
    #set up matrices, transition is deterministic
    trans0, trans1 = pr_trans
    
    x = np.arange(1,STATES+1).reshape(STATES,1)*(MAX_X/STATES)
    

    #calculate value function for all state
    pr_tile = np.tile( pr_obs.reshape(STATES,1), (1,STATES))
    
    denom = (np.identity(STATES) - BETA*(1-pr_tile)*trans0 - BETA*trans1*pr_tile)
    
    numer = ( (1-pr_obs)*(theta1*x  + GAMMA - np.log(1-pr_obs)) + 
                 pr_obs*(cost+ GAMMA - np.log(pr_obs) ) )
    
    value = np.linalg.inv(denom).dot(numer)
    return value



hm_value(-1e-5, -3, PR_OBS, PR_TRANS)

array([[-26.23284597],
       [-41.33685875],
       [-50.35503324],
       [-55.90815079],
       [-64.53219932]])

In [33]:
def hm_prob(theta1, cost, pr_obs, pr_trans):
    """calculate kappa using value function"""
    
    value = hm_value(theta1, cost, pr_obs, pr_trans)
    trans0, trans1 = pr_trans
    x = np.arange(1,STATES+1).reshape(STATES,1)*(MAX_X/STATES)
    
    delta1 = np.exp( cost + BETA*trans1.dot(value))
    delta0 = np.exp( x*theta1 + BETA*trans0.dot(value) )
    
    return delta1/(delta1+delta0)

hm_prob(-1e-5, -3, PR_OBS, PR_TRANS)

array([[0.16389381],
       [0.99999867],
       [1.        ],
       [1.        ],
       [1.        ]])

In [39]:
from scipy.optimize import minimize


class HotzMiller():
    """class for estimating the values of R and theta
    using the Hotz Miller routine and the helper functions
    above"""
    
    def __init__(self, x, x_next, i):
        
        #transitions
        self.pr_obs = hm_initial_pr(x,i)
        self.trans =  hm_transitions(0,x,x_next,i), hm_transitions(1,x,x_next,i)

        #data
        self.x = x
        self.x_next = x
        self.i = i
        
        #parameters
        self.theta1 = 0
        self.R = 0
        
        
    def likelihood(self, params): 
        theta1, R = params
        
        # Input our data into the model
        interval = int(MAX_X/STATES) + 1
        i = self.i
        x = (self.x/interval).astype(int)
        
        #set up hm state pr
        prob = hm_prob(theta1, R, self.pr_obs, self.trans).transpose()[0]
        prob = interp1d(np.arange(1,STATES+1), prob,fill_value="extrapolate")
        
        log_likelihood = (1-i)*np.log(1-prob(x)) + i*np.log(prob(x))
        
        return -log_likelihood.sum()
    
    
    def fit(self):
        result = minimize(self.likelihood, [-1e-5,-3], method = 'Nelder-Mead', options={'disp': False})
        self.theta1, self.R = result.x

        
model_hm = HotzMiller(data_safe['miles'],data_safe['miles_next'], data_safe['replace'])
model_hm.fit()


print '\n theta_1:%s, R:%s'%(round(model_hm.theta1,4) , round(model_hm.R,4))


 theta_1:-0.0, R:-5.5657


