In [1]:
import pandas as pd
import math
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import fsolve
import statsmodels.api as sm
from scipy.stats import norm
from statsmodels.sandbox.regression.gmm import GMM

In [2]:
def comp_outside_good(data,name):
    #pre-processing to calculate outside good shares
    shares = data[['Market_ID',name]].copy()

    group_shares = shares.groupby('Market_ID').sum()
    group_shares['Outside Good Share'] = 1 - group_shares[name]

    data = pd.merge(data,group_shares[['Outside Good Share']], 
                right_index=True, left_on = 'Market_ID')
    return data

data = pd.read_csv('data.csv')
data = comp_outside_good(data,'Inside Good Share')

In [29]:
def setup_data(data):
    """simplify setting up data correctly"""
    #set up x and y
    y = data[['Inside Good Share','Outside Good Share']]
    x =  data[['Network Score','Satisfaction Score','PPO','Premium']]
    return x,y


def setup_hausman(data):
    #calculate hausmann insturments
    price = data['Premium']
    
    mkt_dum = pd.get_dummies(data['Market_ID'],prefix='mkt',drop_first=True)
    plan_dum = pd.get_dummies(data['Plan_ID'],prefix='plan',drop_first=True)
    exog = np.array( data[['Network Score','Satisfaction Score', 'PPO']])
    
    #calc avg price in other markets
    hausman_mod1 = sm.OLS(price, sm.add_constant(plan_dum))
    hausman_fit1 = hausman_mod1.fit()
    hausman_instr1 = hausman_fit1.fittedvalues
    hausman_instr1 = np.array([hausman_instr1]).transpose()
    
    hausman_mod2 = sm.OLS(price, sm.add_constant(mkt_dum))
    hausman_fit2 = hausman_mod2.fit()
    hausman_instr2 = hausman_fit2.fittedvalues
    hausman_instr2 = np.array([hausman_instr2]).transpose()
    
    #concat hausman instr with exog variables
    instr = np.concatenate( (exog, hausman_instr1, hausman_instr2), axis =1 )
    
    return instr

x,y =  setup_data(data)
z = setup_hausman(data)

[[0.86       0.8        0.         2.3401279  2.57971287]
 [0.88       0.865      0.         2.36602798 2.57971287]
 [0.91       0.875      1.         2.59650666 2.57971287]
 ...
 [0.9        0.85       1.         2.59559199 2.49469234]
 [0.92       0.925      0.         2.41005585 2.49469234]
 [0.875      0.91       1.         2.60064909 2.49469234]]


In [4]:
#set up useful global variables 
NMKTS = data['Market_ID'].nunique()
NPLANS = data['Plan_ID'].nunique()
NOBS = data['Plan_ID'].count()
NSIM = 20

theta1 = np.array([3.5,2,1,-2])
theta2 = np.array([.15,.1,.5]) # initialize theta2 for testing purposes
delta = np.ones(NOBS)*(-2)

#print global variables
print NMKTS,NPLANS,NOBS

600 16 3300


In [5]:
#set up random draws v
v = np.random.normal(size=(NSIM,3)) #same simulation for all markets
#np.savetxt("simulations.csv", v, delimiter=",")
#use same simulations each time
v = np.genfromtxt('simulations.csv', delimiter=',')
v = np.tile(v.reshape(NSIM,3,1) , (1,1,NOBS))

# Part 1 - Estimation

## Calculating $\delta_{jt}$, $\xi_{jt}$

In [6]:
def cal_sim_s(data, v, delta, theta2):
    """calculate market share for each simulated consumer"""
    
    #copy x and delta for simulations using tiling
    x =  np.array(data.copy()[['Network Score','Satisfaction Score','PPO']]).transpose()
    x  = np.tile(x,(NSIM,1,1))
    theta2 = np.tile( np.array([theta2]).transpose()  ,(NSIM,1,3300))
    delta  = np.tile( delta  ,(NSIM,1))
    
    #add to calcualte market shares
    sim_exp = pd.DataFrame( np.exp(delta + (theta2*v*x).sum(axis=1)).transpose() ) 
    
    #sum up between markets
    sim_exp['mkt_id'] = data['Market_ID']
    sum_exp = sim_exp.groupby('mkt_id').sum() 
    sum_exp = pd.merge(data.copy()[['Market_ID']], sum_exp, 
                       right_index=True, left_on = 'Market_ID')
    
    #format so I can broadcast
    sim_exp = np.array(sim_exp).transpose()[:-1]
    sum_exp = np.array(sum_exp).transpose()[1:] + 1
    
    return sim_exp/sum_exp


def cal_s(data, v, delta, theta2):
    """Calculate market share
    Calculates individual choice probability first, then take sum"""
    
    shares = (1./NSIM)*cal_sim_s(data, v, delta, theta2).sum(axis=0)
    return shares


s = cal_s(data, v, delta, theta2)

In [14]:
def cal_delta(data, v, theta2, error = 1e-3, maxiter = 500):
    """Calculate mean utility via contraction mapping"""

    niter = 0
    
    #initialize loop parameters
    delta = np.zeros(NOBS)
    s = cal_s(data, v, delta, theta2)
    diff = np.log(data['Inside Good Share']) - np.log(s)
    
    
    while ((abs(diff).max() > 1e-6) #this is easier to converge
           and (abs(diff).mean() > error) 
           and niter < maxiter):
        
        s = cal_s(data, v, delta, theta2)
        diff = np.log(data['Inside Good Share']) - np.log(s)

        if np.isnan(diff).sum():
            raise Exception('nan in diffs')
            
        delta += diff
        niter += 1

    return delta


delta = cal_delta(data, v, theta2)
print delta.shape
print delta.mean()

(3300,)
0.0537547330113


In [24]:
#initialize theta1 i.e. alpha and beta
def cal_xi(data, delta, theta1):
    """Calculate xi with F.O.C"""
    x,y =  setup_data(data)
    
    xi = np.matmul(np.array(x),theta1)
    
    xi = delta - np.matmul(np.array(x),theta1)
    
    return  xi

xi = cal_xi(data, delta,  theta1)
print xi.max(), xi.min(), xi.mean()

-4.4521914495928865 -6.374717985796533 -5.17496647911


## Calculating  $\theta_1,\theta_2$

where $\theta_1 = (\alpha, \beta)$

I only solve GMM over $\theta_2$, the non-linear parameters

In [25]:
def cal_theta1(data, delta, z, T):
    """  calculate theta 1 using FOCs (X1'Z T Z'X )^-1 X1'Z T Z' delta """
    
    #set up variables
    x,y =  setup_data(data)
    X,Z = np.array(x), np.array(z)
    
    #build up to main equation
    XtZ = X.transpose().dot(Z)
    ZtX = Z.transpose().dot(X)
        
    first_exp = np.linalg.inv( XtZ.dot(T).dot(ZtX))
    second_exp = XtZ.dot(T).dot(Z.transpose()).dot(delta)
    theta1 = first_exp.dot(second_exp)
    return theta1

T =  np.linalg.inv( z.transpose().dot(z) )
print cal_theta1(data, delta, z, T)

[ 3.47270033  2.06911418  0.74765472 -2.06193438]


In [26]:
def gmm_objective(theta2_init, data,  v, z,  T):
    """w' z phi z' w, of theta2"""
    #set up variables
    x,y =  setup_data(data)
    X,Z = np.array(x), np.array(z)
    
    #do calculations
    delta = cal_delta(data, v, theta2_init)
    print   delta.min(),  delta.max(),  delta.mean()

    theta1 = cal_theta1(data, delta, z, T)
    xi = cal_xi(data, delta, theta1)
    
    print xi.min(),  xi.max(),  xi.mean()
    
    xitZ = xi.transpose().dot(Z)
    Ztxi = Z.transpose().dot(xi)
    
    print xitZ
    print Ztxi
    print xitZ.dot(T).dot(Ztxi)
    
    return xitZ.dot(T).dot(Ztxi)
    

def calc_theta2(data, v, T, theta2_init):
    """calculate theta2 using scipy"""
    z= setup_hausman(data)
    theta2 = minimize(gmm_objective, theta2_init, args=(data,  v, z, T), method='Nelder-Mead',
                      options={'maxiter':10, 'disp': True, 'xatol':1e-3, 'fatol':1})
    #theta2 = minimize(gmm_objective, theta2_init, args=(data, v,  T), method='BFGS',
    #                  options={'maxiter':100, 'disp': True})
    return abs(theta2.x)



#run some tests
#theta2_init = np.array([1,.5,2])
#theta2_test = calc_theta2( data, v, T, theta2_init)
# print theta2_test


theta2_init = np.array([1,1,1])
gmm_objective(theta2_init, data,  v, z,  T)

10.455671339528664 23.376142580102762 18.8577714415
4.937593130071156 18.42140026285697 13.5763471447
[ 39051.20222722  37758.43088955  23087.17764858 110680.10681694]
[ 39051.20222722  37758.43088955  23087.17764858 110680.10681694]
609283.5421247376


609283.5421247376

In [27]:
theta2_init = np.array([1,1,1])

def calc_theta(data, v, theta2_init):
    
    #initialize theta
    x,y =  setup_data(data)
    z = setup_hausman(data)
    
    X,Z = np.array(x), np.array(z)
    theta2 = theta2_init 
    
    #on first step, use consistent approximation of T
    T =  np.linalg.inv( z.transpose().dot(z) )
    for i in range(2):  
       
        #on second use T using estimated xi
        if i==1:
            xi = cal_xi(data, delta, theta1)
            xi =np.array([xi]).transpose()
            T =  np.linalg.inv( Z.transpose().dot(xi).dot(xi.transpose()).dot(Z) )
        
        theta2 = calc_theta2(data, v, T, theta2)
        delta = cal_delta(data, v, theta2)
        theta1 = cal_theta1(data, delta, T)
        
    return theta1, theta2


theta = calc_theta(data, v, theta2_init)

print theta

-0.8537285064161964 1.1270680905440538 0.581544414486
-5.782728714085071 -4.030809545184476 -4.50849910224
[-12969.96763397 -12540.60307023  -7667.8803669  -36759.87732177]
[-12969.96763397 -12540.60307023  -7667.8803669  -36759.87732177]
67209.15182664497
-0.8298112620899851 1.148687898470888 0.608185481656
-5.760099842864588 -3.9950891946469778 -4.46997777478
[-12859.15707396 -12433.46084071  -7602.3688605  -36445.81465738]
[-12859.15707396 -12433.46084071  -7602.3688605  -36445.81465738]
66065.63787175258
-0.8307429683557706 1.1547561166681986 0.608459649169
-5.76605056296218 -4.008888047554314 -4.47850006741
[-12883.67147777 -12457.16370693  -7616.86184314 -36515.29412736]
[-12883.67147777 -12457.16370693  -7616.86184314 -36515.29412736]
66317.77002342064
-0.854491890637719 1.1427292762379668 0.577596377555
-5.79189571098172 -4.04877825057725 -4.52985012302
[-13031.38686848 -12599.98905042  -7704.19158646 -36933.95358699]
[-13031.38686848 -12599.98905042  -7704.19158646 -36933.9535

TypeError: cal_theta1() takes exactly 4 arguments (3 given)

## Calculate Standard Errors