In [2]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from scipy.interpolate import interp1d #pre written interpolation function
from statsmodels.base.model import GenericLikelihoodModel
from scipy import stats

In [3]:
#TODO:

# add state as parameters
# figure out max x using function
#does state start at 0 or at 1?

# parametrize cost function correctly
# what is the correct measure of distance
# fix class to extend generic likelihood model
# 
 
# dropping data
# 

The code below references the following two sources.

John Rust's website:
https://editorialexpress.com/jrust/nfxp.html

Victor Aguirregabiria and Pedro Mira's website:
http://individual.utoronto.ca/vaguirre/wpapers/program_code_survey_joe_2008.html

Victor Aguirregabiria and Pedro Mira's 2004 paper
https://www.jstor.org/stable/3082006


In [30]:
#fix the bus .dat because Aguirregabiria and Mira hate everyone
data = np.fromfile('bus1234.dat')
data = data.reshape(len(data)/6,6)
data = pd.DataFrame(data,columns=['id','group','year','month','replace','miles'])

#save to .csv so other people don't need to be confused
data.to_csv("bus1234.csv")

#switch to date time for ease 
data['date'] = pd.to_datetime(data[['year', 'month']].assign(Day=1))
data = data[['id','group','date','replace','miles']]

#lag date
date_lag = data.copy()
date_lag['date'] = date_lag['date'] - pd.DateOffset(months=1)
data = data.merge(date_lag, how='left', on=['id','group','date'] , suffixes=('','_next'))
data = data.dropna()

In [31]:
#constants
BETA = .97
GAMMA = .5772 #euler's constant

#size of step in discretization
STEP = 1000

In [36]:
def miles_pdf(i_obs, x_obs, x_next):
    """non-parametric estimation of mileage pdf.
    in other words, the probability of jumping dx states
    next period
    
    this corresponds to pdfdx in AM's code"""
    
    #figure out max number of steps
    dx = (1-i_obs)*(x_next - x_obs) + i_obs*x_obs 
    discrete = (dx/STEP).astype(int)
    
    #make a dataframe with the discrete 'jumps' in mileage
    frame = np.array([discrete, i_obs]).transpose()
    frame = pd.DataFrame(frame, columns=('dx','i'))

    #non-parametrically group jumps to make pdf
    pdfdx = frame.groupby('dx')
    pdfdx = pdfdx.count()/pdfdx.count().sum()
    return  np.array(pdfdx)


MILES_PDF = miles_pdf(data['replace'], data['miles'], data['miles_next'])

In [40]:
def transition_1(i_obs, x_obs , x_next):
    """calculate transitions probabilities,
    non-parametrically
    
    this corresponds to fmat1 and fmat2 in AM's code"""
    
    #transitions when i=1
    num_states = (x_obs.max()/STEP).astype(int) + 1
    pdfdx = miles_pdf(i_obs, x_obs, x_next).transpose()
    
    #zero probability of transitioning to large states
    zeros = np.zeros( (num_states,num_states-pdfdx.shape[1]) )
    
    #transitioning to first state and 'jumping' dx states
    fmat1 = np.tile(pdfdx,(num_states,1))
    fmat1 = np.concatenate( (fmat1, zeros), axis=1 )

    return fmat1

FMAT1 = transition_1(data['replace'], data['miles'],data['miles_next'])

In [48]:
def transition_0(i_obs, x_obs , x_next):
    """calculate transitions probabilities,
    non-parametrically
    
    this corresponds to fmat1 and fmat2 in AM's code"""
    
    
    num_states = (x_obs.max()/STEP).astype(int) + 1
    pdfdx = miles_pdf(i_obs, x_obs, x_next).transpose()
    
    #initialize fmat array, transitions when i=0
    end_zeros = np.zeros((1,num_states -pdfdx.shape[1]))
    fmat0 = np.concatenate( (pdfdx,end_zeros), axis=1 )

    for row in range(1,num_states):
        
        #this corresponds to colz i think
        cutoff = (num_states - row - pdfdx.shape[1] )
        
        #case 1 far enough from the 'end' of the matrix
        if cutoff >= 0:
            start_zeros = np.zeros((1,row))
            end_zeros = np.zeros((1,num_states -pdfdx.shape[1] - row))
            fmat_new = np.concatenate( (start_zeros,pdfdx,end_zeros), axis=1 )
            fmat0 = np.concatenate((fmat0,fmat_new))
       
        #case 2, too far from the end and need to adjust probs
        else:
            pdf_adj = pdfdx[:,0:cutoff]
            pdf_adj = pdf_adj/pdf_adj.sum(axis=1)
            
            start_zeros = np.zeros((1,row))
            fmat_new = np.concatenate( (start_zeros,pdf_adj), axis=1 )
            fmat0 = np.concatenate((fmat0,fmat_new))


    return fmat0

FMAT0 = transition_0(data['replace'],data['miles'],data['miles_next'])

PR_TRANS = FMAT0, FMAT1

In [49]:
def np_initial_pr(i_obs, x_obs):
    """truly non-parametric
    initial the probability of view a given state
    May delete later"""
    
    num_states = ( x_obs.max()/STEP).astype(int) + 1
    discrete = (x_obs/STEP).astype(int)
    
    df = np.array([discrete, i_obs]).transpose()
    df = pd.DataFrame(df, columns=('x','i'))
    pr_obs = df.groupby('x')
    pr_obs = pr_obs.sum()/(1.*pr_obs.count())

    return  np.array(pr_obs)

PR_OBS = np_initial_pr(data['replace'], data['miles'])

In [82]:
def initial_pr(i_obs, x_obs):
    """initial the probability of view a given state following AM.
    Seems like it just involves logit to predict"""
    
    X = np.array([x_obs, x_obs**2, x_obs**3]).transpose()
    X = sm.add_constant(X)
    
    model = sm.Logit(i_obs,X)
    fit = model.fit()
    #print fit.summary()
    
    states = np.arange(x_obs.min(),x_obs.max(), STEP)
    
    states = np.array([states, states**2, states**3]).transpose()
    states = sm.add_constant(states)
    
    return fit.predict(states)

PR_OBS = initial_pr(data['replace'], data['miles'])

Optimization terminated successfully.
         Current function value: 0.036201
         Iterations 14


In [80]:
def hm_value(theta1, cost, i_obs, x_obs, pr_obs, pr_trans):
    """calculate value function using hotz miller approach"""
    
    #set up matrices, transition is deterministic
    trans0, trans1 = pr_trans
    
    #should probably make these class parameters
    num_states = ( x_obs.max()/STEP).astype(int) + 1
    x = np.arange(x_obs.min(),x_obs.max(), STEP)

    #calculate value function for all state
    pr_tile = np.tile( pr_obs.reshape(num_states,1), (1,num_states))
    
    denom = (np.identity(num_states) - BETA*(1-pr_tile)*trans0 - BETA*trans1*pr_tile)
    
    numer = ( (1-pr_obs)*(theta1*x  + GAMMA - np.log(1-pr_obs)) + 
                 pr_obs*(cost+ GAMMA - np.log(pr_obs) ) )
    
    value = np.linalg.inv(denom).dot(numer)
    return value



#hm_value(-.000058, -.001047, data['replace'], data['miles'], PR_OBS, PR_TRANS)

In [None]:
-2494563
-3360878

In [85]:
def hm_prob(theta1, cost, i_obs, x_obs, pr_obs, pr_trans):
    """calculate kappa using value function"""
    
    value = hm_value(theta1, cost, i_obs, x_obs, pr_obs, pr_trans)
    trans0, trans1 = pr_trans
    
    #should probably make these class parameters
    num_states = ( x_obs.max()/STEP).astype(int) + 1
    x = np.arange(x_obs.min(),x_obs.max(), STEP)
    
    delta1 = np.exp( cost + BETA*trans1.dot(value))
    delta0 = np.exp( x*theta1 + BETA*trans0.dot(value) )
    
    return delta1/(delta1+delta0)


hm_prob(-.000058, -.001047, data['replace'], data['miles'], PR_OBS, PR_TRANS).shape

(389,)

In [86]:
from scipy.optimize import minimize

class HotzMiller():
    """class for estimating the values of R and theta
    using the Hotz Miller routine and the helper functions
    above"""
    
    def __init__(self, i, x, x_next):
        
        #transitions
        self.pr_obs = initial_pr(i, x)
        self.trans =  transition_1(i,x,x_next), transition_1(i,x,x_next)
        
        #should probably make these class parameters
        self.num_states = ( x.max()/STEP).astype(int) + 1
        self.states = np.arange(x.min(),x.max(), STEP)
        
        #data
        self.x = x
        self.x_next = x
        self.i = i
        
        #parameters
        self.theta1 = 0
        self.R = 0
        
        
    def likelihood(self, params): 
        theta1, R = params
        
        # Input our data into the model
        i = self.i
        x = (self.x/STEP).astype(int)
        
        #set up hm state pr
        prob = hm_prob(theta1, R, self.i, self.x, self.pr_obs, self.trans).transpose()
        prob = interp1d(self.states, prob, fill_value="extrapolate")
        
        log_likelihood = (1-i)*np.log(1-prob(x)) + i*np.log(prob(x))
        
        return -log_likelihood.sum()
    
    
    def fit(self):
        result = minimize(self.likelihood, [-1e-5,-3], method = 'Nelder-Mead', options={'disp': False})
        self.theta1, self.R = result.x

        
model_hm = HotzMiller(data['replace'], data['miles'],data['miles_next'])
model_hm.fit()


print '\n theta_1:%s, R:%s'%(round(model_hm.theta1,4) , round(model_hm.R,4))

Optimization terminated successfully.
         Current function value: 0.036201
         Iterations 14


  



 theta_1:-0.0077, R:-10.3026
