In [11]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from scipy.interpolate import interp1d #pre written interpolation function
from statsmodels.base.model import GenericLikelihoodModel
from scipy import stats #for kernel function

In [12]:
#TODO:

#debug value function
#debug kappa function

#paramterize cost function (for different specs)

#implement class using statsmodles
#iterate AM style

The code below references the following two sources.

John Rust's website:
https://editorialexpress.com/jrust/nfxp.html

Victor Aguirregabiria and Pedro Mira's website:
http://individual.utoronto.ca/vaguirre/wpapers/program_code_survey_joe_2008.html

Victor Aguirregabiria and Pedro Mira's 2002 paper
https://www.jstor.org/stable/3082006


In [13]:
#fix the bus .dat because Aguirregabiria and Mira hate everyone
data = np.fromfile('bus1234.dat')
data = data.reshape(len(data)/6,6)
data = pd.DataFrame(data,columns=['id','group','year','month','replace','miles'])

#save to .csv so other people don't need to be confused
data.to_csv("bus1234.csv")

#divide by 1e6
data['miles'] = data['miles']/1e6

#switch to date time for ease 
data['date'] = pd.to_datetime(data[['year', 'month']].assign(Day=1))
data = data[['id','group','date','replace','miles']]

#lag date
date_lag = data.copy()
date_lag['date'] = date_lag['date'] - pd.DateOffset(months=1)
data = data.merge(date_lag, how='left', on=['id','group','date'] , suffixes=('','_next'))
data = data.dropna()

print data.max()
print data.min()

id                              162
group                        530875
date            1985-04-01 00:00:00
replace                           1
miles                      0.388254
replace_next                      1
miles_next                 0.388254
dtype: object
id                               59
group                            50
date            1975-09-01 00:00:00
replace                           0
miles                             0
replace_next                      0
miles_next                        0
dtype: object


In [14]:
#constants
BETA = .9999
GAMMA = .5772 #euler's constant

#size of step in discretization
STEP = .002
#STEP = 3000

In [15]:
def miles_pdf(i_obs, x_obs, x_next):
    """estimation of mileage pdf following AM using the
    kernel function
    
    this corresponds to pdfdx in AM's code"""
    
    #figure out max number of steps
    dx = (1-i_obs)*(x_next - x_obs) + i_obs*x_obs 
    
    #number of states
    dx_states = np.arange(dx.min(),dx.max(), STEP)
    
    #use kernel groups to make pdf
    kernel1 = stats.gaussian_kde(dx, bw_method='silverman')
    pdfdx = kernel1(dx_states)
    
    return np.array([pdfdx/pdfdx.sum()]).transpose()


MILES_PDF = miles_pdf(data['replace'], data['miles'], data['miles_next'])

In [16]:
def transition_1(i_obs, x_obs , x_next):
    """calculate transitions probabilities,
    non-parametrically
    
    this corresponds to fmat1 and fmat2 in AM's code"""
    
    #transitions when i=1
    num_states = (x_obs.max()/STEP).astype(int) + 1
    states = np.arange(x_obs.min(),x_obs.max(), STEP)
    
    pdfdx = miles_pdf(i_obs, x_obs, x_next).transpose()
    
    #zero probability of transitioning to large states
    zeros = np.zeros( (num_states,num_states-pdfdx.shape[1]) )
    
    #transitioning to first state and 'jumping' dx states
    fmat1 = np.tile(pdfdx,(num_states,1))
    fmat1 = np.concatenate( (fmat1, zeros), axis=1 )

    return fmat1

FMAT1 = transition_1(data['replace'], data['miles'],data['miles_next'])

print FMAT1[0,:]

[1.62127782e-01 2.21302727e-01 2.31470978e-01 1.85223431e-01
 1.13219329e-01 5.28207788e-02 1.88290137e-02 5.15758947e-03
 1.10225602e-03 1.89787398e-04 2.76998247e-05 3.59413878e-06
 4.15830294e-07 4.09233654e-08 3.23375602e-09 1.97043181e-10
 9.05597434e-12 3.10503488e-13 7.89978370e-15 1.48722963e-16
 2.06863332e-18 2.12381910e-20 1.60841516e-22 8.98084275e-25
 3.69581828e-27 1.12058391e-29 2.50267709e-32 4.11617438e-35
 4.98453948e-38 4.44346777e-41 2.91551979e-44 1.40781223e-47
 5.00209199e-51 1.55224741e-54 9.65925165e-52 2.80209223e-48
 5.97180575e-45 9.35009776e-42 1.07551914e-38 9.08903247e-36
 5.64317008e-33 2.57422476e-30 8.62788390e-28 2.12480968e-25
 3.84526747e-23 5.11410877e-21 4.99936610e-19 3.59295836e-17
 1.89892921e-15 7.38350730e-14 2.11331014e-12 4.45614727e-11
 6.93013233e-10 7.96157923e-09 6.77191564e-08 4.27843178e-07
 2.01732283e-06 7.14923537e-06 1.92493050e-05 4.00234381e-05
 6.57732620e-05 8.79000501e-05 9.79692815e-05 9.18766758e-05
 7.15460122e-05 4.505611

In [17]:
def transition_0(i_obs, x_obs , x_next):
    """calculate transitions probabilities,
    non-parametrically
    
    this corresponds to fmat1 and fmat2 in AM's code"""
    
    
    num_states = (x_obs.max()/STEP).astype(int) + 1
    pdfdx = miles_pdf(i_obs, x_obs, x_next).transpose()
    
    #initialize fmat array, transitions when i=0
    end_zeros = np.zeros((1,num_states -pdfdx.shape[1]))
    fmat0 = np.concatenate( (pdfdx,end_zeros), axis=1 )

    for row in range(1,num_states):
        
        #this corresponds to colz i think
        cutoff = (num_states - row - pdfdx.shape[1] )
        
        #case 1 far enough from the 'end' of the matrix
        if cutoff >= 0:
            start_zeros = np.zeros((1,row))
            end_zeros = np.zeros((1,num_states -pdfdx.shape[1] - row))
            fmat_new = np.concatenate( (start_zeros,pdfdx,end_zeros), axis=1 )
            fmat0 = np.concatenate((fmat0,fmat_new))
       
        #case 2, too far from the end and need to adjust probs
        else:
            pdf_adj = pdfdx[:,0:cutoff]
            pdf_adj = pdf_adj/pdf_adj.sum(axis=1)
            
            start_zeros = np.zeros((1,row))
            fmat_new = np.concatenate( (start_zeros,pdf_adj), axis=1 )
            fmat0 = np.concatenate((fmat0,fmat_new))


    return fmat0

FMAT0 = transition_0(data['replace'],data['miles'],data['miles_next'])

PR_TRANS = FMAT0, FMAT1

In [18]:
def initial_pr(i_obs, x_obs, d=0):
    """initial the probability of view a given state following AM.
    Seems like it just involves logit to predict
    
    Third arguement involves display"""
    
    X = np.array([x_obs, x_obs**2, x_obs**3]).transpose()
    X = sm.add_constant(X)
    
    model = sm.Logit(i_obs,X)
    fit = model.fit(disp=d)
    if d: print fit.summary()
    
    states = np.arange(x_obs.min(),x_obs.max(), STEP)
    
    states = np.array([states, states**2, states**3]).transpose()
    states = sm.add_constant(states)
    
    return fit.predict(states)

PR_OBS = initial_pr(data['replace'], data['miles'], d=1)

Optimization terminated successfully.
         Current function value: 0.036201
         Iterations 23
                           Logit Regression Results                           
Dep. Variable:                replace   No. Observations:                 8156
Model:                          Logit   Df Residuals:                     8152
Method:                           MLE   Df Model:                            3
Date:                Mon, 14 Jan 2019   Pseudo R-squ.:                  0.1671
Time:                        15:52:16   Log-Likelihood:                -295.26
converged:                       True   LL-Null:                       -354.51
                                        LLR p-value:                 1.623e-25
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -17.3136      4.188     -4.134      0.000     -25.522      -9.105
x1           149.3089     56

In [30]:
def hm_value(theta1, cost, i_obs, x_obs, pr_obs, pr_trans):
    """calculate value function using hotz miller approach"""
    
    #set up matrices, transition is deterministic
    trans0, trans1 = pr_trans
    
    #should probably make these class parameters
    num_states = ( x_obs.max()/STEP).astype(int) + 1
    x = np.arange(x_obs.min(),x_obs.max(), STEP)
    
    
    #calculate value function for all state
    pr_tile = np.tile( pr_obs.reshape(num_states,1), (1,num_states))
    
    denom = (np.identity(num_states) - BETA*(1-pr_tile)*trans0 - BETA*pr_tile*trans1)
    
    numer = ( (1-pr_obs)*(theta1*x  + GAMMA - np.log(1-pr_obs)) + 
                 pr_obs*(cost+ GAMMA - np.log(pr_obs) ) )
    
    print denom
    
    value = np.linalg.inv(denom).dot(numer)
    return value


VALUE = hm_value(-.6, -10, data['replace'], data['miles'], PR_OBS, PR_TRANS)

[[ 8.37888431e-01 -2.21280597e-01 -2.31447831e-01 ... -1.55161037e-05
  -2.56255805e-05 -3.10916355e-05]
 [-6.59684132e-09  8.37883387e-01 -2.21287478e-01 ... -6.90321518e-06
  -1.55165866e-05 -2.56263776e-05]
 [-8.83402198e-09 -1.20583477e-08  8.37879231e-01 ... -2.26809102e-06
  -6.90339277e-06 -1.55169848e-05]
 ...
 [-1.86718325e-02 -2.54868561e-02 -2.66579070e-02 ...  7.66725502e-01
  -3.18417762e-01 -3.33048676e-01]
 [-1.98116175e-02 -2.70426507e-02 -2.82851861e-02 ... -1.89621946e-06
   6.28873684e-01 -5.06583067e-01]
 [-2.10480782e-02 -2.87304067e-02 -3.00504898e-02 ... -2.01456421e-06
  -3.32714824e-06  1.29919969e-01]]


In [25]:
print VALUE

[-19753.49803921 -19754.40027222 -19755.30053168 -19756.19890491
 -19757.09490437 -19757.98793653 -19758.87763289 -19759.76385649
 -19760.64657375 -19761.52573619 -19762.40126343 -19763.27325013
 -19764.14229009 -19765.00944347 -19765.87561446 -19766.74089605
 -19767.60458537 -19768.46576547 -19769.32379104 -19770.17836339
 -19771.0293852  -19771.87683137 -19772.72069434 -19773.56097371
 -19774.39767369 -19775.23077639 -19776.06020664 -19776.88591416
 -19777.70815058 -19778.52765295 -19779.34534647 -19780.16180715
 -19780.97707156 -19781.79078205 -19782.60232693 -19783.41099389
 -19784.21619154 -19785.01756523 -19785.81494602 -19786.60825348
 -19787.39742686 -19788.18236801 -19788.96292723 -19789.7390611
 -19790.51114645 -19791.28007507 -19792.04684756 -19792.81205109
 -19793.57574484 -19794.33762472 -19795.09709475 -19795.85329131
 -19796.60530169 -19797.35250398 -19798.09470327 -19798.83202698
 -19799.56481063 -19800.29345752 -19801.01817243 -19801.73877453
 -19802.45478634 -19803.16

In [21]:
def hm_prob(theta1, cost, i_obs, x_obs, pr_obs, pr_trans):
    """calculate kappa using value function"""
    
    value = hm_value(theta1, cost, i_obs, x_obs, pr_obs, pr_trans)
    trans0, trans1 = pr_trans
    
    #should probably make these class parameters
    num_states = ( x_obs.max()/STEP).astype(int) + 1
    x = np.arange(x_obs.min(),x_obs.max(), STEP)
    
    delta1 = np.exp( cost + BETA*trans1.dot(value))
    delta0 = np.exp( x*theta1 + BETA*trans0.dot(value) )
    
    return delta1/(delta1+delta0)


print hm_prob(-.6, -10, data['replace'], data['miles'], PR_OBS, PR_TRANS)

[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


  


In [23]:
from scipy.optimize import minimize

class HotzMiller():
    """class for estimating the values of R and theta
    using the Hotz Miller routine and the helper functions
    above"""
    
    def __init__(self, i, x, x_next):
        
        #transitions
        self.pr_obs = initial_pr(i, x)
        self.trans =  transition_1(i,x,x_next), transition_1(i,x,x_next)
        
        #should probably make these class parameters
        self.num_states = ( x.max()/STEP).astype(int) + 1
        self.states = np.arange(x.min(),x.max(), STEP)
        
        #data
        self.x = x
        self.x_next = x
        self.i = i
        
        #parameters
        self.theta1 = 0
        self.R = 0
        
        
    def likelihood(self, params): 
        theta1, R = params
        
        # Input our data into the model
        i = self.i
        x = (self.x/STEP).astype(int)
        
        #set up hm state pr
        prob = hm_prob(theta1, R, self.i, self.x, self.pr_obs, self.trans).transpose()
        prob = interp1d(self.states, prob, fill_value="extrapolate")
        
        log_likelihood = (1-i)*np.log(1-prob(x)) + i*np.log(prob(x))
        
        return -log_likelihood.sum()
    
    
    def fit(self):
        result = minimize(self.likelihood, [-.6,-10], method = 'Nelder-Mead', options={'disp': False})
        self.theta1, self.R = result.x

        
model_hm = HotzMiller(data['replace'], data['miles'],data['miles_next'])
model_hm.fit()


print '\n theta_1:%s, R:%s'%(round(model_hm.theta1,4) , round(model_hm.R,4))

  



 theta_1:-60.0, R:-100.0
