In [1]:
import pandas as pd
import math
import numpy as np
from scipy.optimize import minimize
from scipy.optimize import fsolve
import statsmodels.api as sm
from scipy.stats import norm
from statsmodels.sandbox.regression.gmm import GMM

# Part 0 - Data setup

In [2]:
def comp_outside_good(data,name):
    """pre-processing to calculate outside good shares"""
    shares = data[['Market_ID',name]].copy()
    group_shares = shares.groupby('Market_ID').sum()
    group_shares['Outside Good Share'] = 1 - group_shares[name]
    data = pd.merge(data,group_shares[['Outside Good Share']], 
                right_index=True, left_on = 'Market_ID')
    return data


data = pd.read_csv('data.csv')
data = comp_outside_good(data,'Inside Good Share')

In [11]:
def setup_data(data):
    """simplify setting up data correctly"""
    #set up x and y
    y = data[['Inside Good Share','Outside Good Share']]
    x =  data[['Network Score','Satisfaction Score','PPO','Premium']]
    return x,y


def setup_hausman(data):
    #calculate hausmann insturments
    price = data['Premium']
    mkt_dum = pd.get_dummies(data['Market_ID'],prefix='mkt',drop_first=True)
    plan_dum = pd.get_dummies(data['Plan_ID'],prefix='plan',drop_first=True)
    exog = np.array( data[['Network Score','Satisfaction Score', 'PPO']])
    
    #calc avg price in other markets
    hausman_mod1 = sm.OLS(price, sm.add_constant(plan_dum))
    hausman_fit1 = hausman_mod1.fit()
    hausman_instr1 = hausman_fit1.fittedvalues
    hausman_instr1 = np.array([hausman_instr1]).transpose()
    
    #no of competitors
    BLP_instr = data[['Market_ID','Plan_ID']].groupby('Market_ID').count()
    BLP_instr =  pd.merge(data[['Market_ID','Plan_ID']],
                        BLP_instr[['Plan_ID']], right_index=True, left_on = 'Market_ID')
    BLP_instr = np.array([BLP_instr['Plan_ID_y']]).transpose()
    
    #average characteristics among competititors
    BLP_instr2 = data[['Market_ID','Network Score','Satisfaction Score']].groupby('Market_ID').mean()
    BLP_instr2 =  pd.merge(data[['Market_ID']],
                        BLP_instr2[['Network Score','Satisfaction Score']], right_index=True, left_on = 'Market_ID')
    BLP_instr2 = (np.array(BLP_instr2)[:,1:]*BLP_instr - exog[:,:-1])/(BLP_instr-1)
    
    #concat hausman instr with exog variables
    instr = np.concatenate( (exog, hausman_instr1, BLP_instr2), axis =1 )
    
    return instr


#pre process for testing
x,y =  setup_data(data)
z = setup_hausman(data)
T =  np.linalg.inv( z.transpose().dot(z) ) #set up initial weight matrix

In [12]:
#set up useful global variables 
NMKTS = data['Market_ID'].nunique()
NPLANS = data['Plan_ID'].nunique()
NOBS = data['Plan_ID'].count()
NSIM = 50

theta1 = np.array([4,1.5,.7,-1.5])
theta2 = np.array([2,2,1]) # initialize theta2 for testing purposes
delta = np.ones(NOBS)*(-2)

#print global variables
print NMKTS,NPLANS,NOBS

600 16 3300


In [13]:
#set up random draws v
v = np.random.normal(size=(NSIM,3)) #same simulation for all markets
np.savetxt("simulations.csv", v, delimiter=",")

#use same simulations each time
v = np.genfromtxt('simulations.csv', delimiter=',')
v = np.tile(v.reshape(NSIM,3,1) , (1,1,NOBS))

# Part 1 - Estimation

## Calculating $\delta_{jt}$, $\xi_{jt}$

The first part of the estimation involves calculating the mean utility with the BLP inversion and the mean unobservable

In [14]:
def cal_sim_s(data, v, delta, theta2):
    """calculate market share for each simulated consumer"""
    
    #copy x and delta for simulations using tiling
    x =  np.array(data.copy()[['Network Score','Satisfaction Score','PPO']]).transpose()
    x  = np.tile(x,(NSIM,1,1))
    theta2 = np.tile( np.array([theta2]).transpose()  ,(NSIM,1,3300))
    delta  = np.tile( delta  ,(NSIM,1))
    
    #add to calcualte market shares
    sim_exp = pd.DataFrame( np.exp(delta + (theta2*v*x).sum(axis=1)).transpose() ) 
    
    #sum up between markets
    sim_exp['mkt_id'] = data['Market_ID']
    sum_exp = sim_exp.groupby('mkt_id').sum() 
    sum_exp = pd.merge(data.copy()[['Market_ID']], sum_exp, 
                       right_index=True, left_on = 'Market_ID')
    
    #format so I can broadcast
    sim_exp = np.array(sim_exp).transpose()[:-1]
    sum_exp = np.array(sum_exp).transpose()[1:] + 1
    
    return sim_exp/sum_exp


def cal_s(data, v, delta, theta2):
    """Calculate market share
    Calculates individual choice probability first, then take sum"""
    
    shares = (1./NSIM)*cal_sim_s(data, v, delta, theta2).sum(axis=0)
    return shares

In [86]:
def cal_delta(data, v, theta2, error = 1e-3, maxiter = 500):
    """Calculate mean utility via contraction mapping"""

    niter = 0
    
    #initialize loop parameters
    delta = np.zeros(NOBS)
    s = cal_s(data, v, delta, theta2)
    diff = np.log(data['Inside Good Share']) - np.log(s)
    
    
    while ((abs(diff).max() > 1e-6) #this is easier to converge
           and (abs(diff).mean() > error) 
           and niter < maxiter):
        
        s = cal_s(data, v, delta, theta2)
        diff = np.log(data['Inside Good Share']) - np.log(s)

        if np.isnan(diff).sum():
            raise Exception('nan in diffs')
            
        delta += diff
        niter += 1

    return delta

In [17]:
def cal_xi(data, delta, theta1):
    """Calculate xi with F.O.C"""
    x,y =  setup_data(data)
    xi = np.matmul(np.array(x),theta1)
    xi = delta - np.matmul(np.array(x),theta1)
    return  xi

## Calculating  $\theta_1,\theta_2$

Here $\theta_1 = (\alpha, \beta)$

I only solve GMM over $\theta_2$, the non-linear parameters. $\theta_1$ is calculated as a function of $\delta$ using the formula from Nevo 2000

$$\hat{\theta_1} = (X'Z V^{-1} Z'X)^{-1} X'Z V^{-1} Z' \delta(\hat{\theta}_2) $$

In [18]:
def cal_theta1(data, delta, z, T):
    """  calculate theta_1 using FOCs 
    
    (X1'Z T Z'X )^-1 X1'Z T Z' delta """
    
    #set up variables
    x,y =  setup_data(data)
    X,Z = np.array(x), np.array(z)
    
    #build up to main equation
    XtZ = X.transpose().dot(Z)
    ZtX = Z.transpose().dot(X)
        
    first_exp = np.linalg.inv( XtZ.dot(T).dot(ZtX))
    second_exp = XtZ.dot(T).dot(Z.transpose()).dot(delta)
    theta1 = first_exp.dot(second_exp)
    return theta1

In [21]:
def gmm_objective(theta2_init, data,  v, z,  T):
    """calculate the GMM objective and minimize it to find theta_2
    
    I use the formula from Nevo 2000: w' z phi-1 z' w, of theta2"""
    
    #set up variables
    x,y =  setup_data(data)
    X,Z = np.array(x), np.array(z)
    
    #do calculations
    delta = cal_delta(data, v, theta2_init)
    theta1 = cal_theta1(data, delta, z, T)
    xi = cal_xi(data, delta, theta1)
        
    xitZ = xi.transpose().dot(Z)
    Ztxi = Z.transpose().dot(xi)
    return xitZ.dot(T).dot(Ztxi)
    

def calc_theta2(data, v, z, T, theta2_init,NM=True):
    """calculate theta2 using scipy"""
    if NM:
        theta2 = minimize(gmm_objective, theta2_init, args=(data,  v, z, T), method='Nelder-Mead',
                      options={'maxiter':100, 'disp': True})
    else:
        theta2 = minimize(gmm_objective, theta2_init, args=(data,  v, z, T), method='BFGS',
                      options={'maxiter':100, 'disp': True})
    return abs(theta2.x)


theta2_init = np.array([2,2,1])
theta2_test = calc_theta2(data, v, z, T, theta2_init)
print theta2_test

Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 59
         Function evaluations: 106
[2.35030545 1.09668885 1.0389929 ]


In [22]:
theta2_init = np.array([2,2,1])

def calc_theta(data, v, theta2_init, stages=2):
    """put everything together to calculate theta1 and theta2"""
    #initialize theta
    x,y =  setup_data(data)
    z = setup_hausman(data)
    
    X,Z = np.array(x), np.array(z)
    theta2 = theta2_init 
    
    #on first step, use consistent approximation of T
    T =  np.linalg.inv( z.transpose().dot(z) )
    for i in range(stages):  
       
        #on second use T using estimated xi
        if i==1:
            xi = cal_xi(data, delta, theta1)
            xi =np.array([xi]).transpose()
            T =  np.linalg.inv( Z.transpose().dot(xi).dot(xi.transpose()).dot(Z) )
        
        theta2 = calc_theta2(data, v, z, T, theta2)
        delta = cal_delta(data, v, theta2)
        theta1 = cal_theta1(data, delta, z, T)
        
    return theta1, theta2

theta = calc_theta(data, v, theta2_init, stages=2)

Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 59
         Function evaluations: 106


In [23]:
print '------------------------------------------------------------------'
print 'Mean Coefficients \n------------------------------------------------------------------'
labels1 = np.array(['Network Score','Satisfaction Score','PPO','Premium'])
print pd.DataFrame([labels1, theta[0]])
print '------------------------------------------------------------------'

print 'Coefficients Variance'
print '------------------------------------------------------------------'
print pd.DataFrame([labels1, theta[1]])
print '------------------------------------------------------------------'

------------------------------------------------------------------
Mean Coefficients 
------------------------------------------------------------------
               0                   1         2        3
0  Network Score  Satisfaction Score       PPO  Premium
1        3.79293             1.95861  0.757348 -1.47437
------------------------------------------------------------------
Coefficients Variance
------------------------------------------------------------------
               0                   1        2        3
0  Network Score  Satisfaction Score      PPO  Premium
1        2.47703             1.15401  1.00765     None
------------------------------------------------------------------


In [31]:
#save xi and write to array for counterfactuals
theta1_est, theta2_est = theta

delta_est = cal_delta(data, v, theta2_est)
xi_est = cal_xi(data, delta_est, theta1_est)

np.savetxt("xi.csv", xi_est, delimiter=",")

print theta

(array([ 3.79292846,  1.95860809,  0.75734848, -1.47436869]), array([2.47702906, 1.15401209, 1.00765258]))


## Calculate Standard Errors

In [32]:
xi_est = np.genfromtxt('xi.csv', delimiter=',')

In [180]:
def calc_s_delta(data, v, theta, xi):
    """calculate derivate wrt delta"""
    theta1,theta2 = theta
    
    delta = cal_delta(data, v, theta2)
    
    sim_shares = cal_sim_s(data, v, delta, theta2)
    cross_deriv = np.zeros((NOBS,NOBS))
    for sim_share in sim_shares:
        sim_share = sim_share.reshape((NOBS,1))
        cross_deriv = cross_deriv + sim_share.dot(sim_share.transpose())
        
    own = np.identity(NOBS)
    cross = (1 - own)
    own_deriv  = -(1-sim_shares) * sim_shares
    own_deriv = own_deriv.sum(axis=0)
    
    sim_deriv = 1./(NSIM) * (cross_deriv*cross +own_deriv*own )
    return sim_deriv
  

    return np.ones((NOBS,NOBS))


def calc_s_sigma_k(k, data, v, theta, xi):
    """calculate derivate wrt delta
    
    
    return type should by 3300 x 3
    """
    theta1,theta2 = theta
    delta = cal_delta(data, v, theta2)
    sim_shares = cal_sim_s(data, v, delta, theta2)
    x,y =  setup_data(data)
    vk = v[:,k,:]
    xk = np.array(x)[:,k]
    
    deriv_k = np.zeros(NOBS)
    for s in range(NSIM): #THIS IS A BUG PLEASE FIX IT
        
        #sum up between markets
        inner = pd.DataFrame( (xk*sim_shares[s]).transpose() )
        inner['mkt_id'] = data['Market_ID']
        inner = inner.groupby('mkt_id').sum()
        inner = pd.merge(data.copy()[['Market_ID']], inner, 
            right_index=True, left_on = 'Market_ID')
        inner = np.array(inner[0])
        outer = vk[s]*sim_shares[s]*(xk - inner)

        deriv_k = deriv_k + outer
    
    s_sigma  = 1./NSIM * deriv_k
    return s_sigma 



def calc_s_sigma( data, v, theta, xi):
    s_sigma = []
    for k in range(3):
        s_sigma_k = calc_s_sigma_k(k, data, v, theta, xi)
        s_sigma.append( s_sigma_k )
    s_sigma = np.array( s_sigma )
    return s_sigma.transpose()



def gradient(data, v, theta, xi):
    """function for analytically computing gradient
    following Nevo's appendix"""
    
    xi =np.array([xi]).transpose()
    x,y =  setup_data(data)
    z = setup_hausman(data)
    
    #set up weight matrices
    X,Z = np.array(x), np.array(z)
    V = np.linalg.inv( Z.transpose().dot(xi).dot(xi.transpose()).dot(Z) )
    
    
    #set up derivative matrix
    s_delta = calc_s_delta(data, v, theta, xi)
    s_sigma = calc_s_sigma(data, v, theta, xi)
    D_theta2 = s_delta.dot(s_sigma)
    D_theta1 = X # do not include prices as this is endogenous?
    
    D = np.concatenate( (D_theta1,D_theta2), axis=1)
    
    return (2*(D.transpose()).dot(z)).transpose()
    

gamma = gradient(data, v, theta, xi_est)

print gamma

[[  5021.1258       4852.1624       2835.98        14187.78264965
   -6720.05475434  -1731.98368468   -741.09270424]
 [  4852.1624       4699.4479       2761.          13718.10290336
   -6516.68203394  -1679.88138559   -718.87113135]
 [  2835.98         2761.           3264.           8387.8559376
   -4518.38608104  -1202.2355337    -533.02213833]
 [ 14187.78264966  13718.10290336   8387.8559376   40211.44573277
  -19100.3816671   -4931.36843788  -2113.94108161]
 [  5010.9612281    4843.1040044    2846.08735714  14180.91738933
   -6676.36153238  -1721.1326148    -736.43287084]
 [  4843.1040044    4680.01280929   2748.45621429  13704.74400926
   -6448.20050992  -1662.23238569   -711.2147752 ]]


In [181]:
print gamma.shape

(6, 7)


Below I have calculated standard errors using the formula $$(\Gamma' A \Gamma)^{-1}(\Gamma' A V A \Gamma)^{-1} (\Gamma' A \Gamma)^{-1}$$

Where $\Gamma$ is a numeric approximation of the gradient $A$ is the initial weighting matrix and $V$ is the covaraince matrix (also the optimal weight matrix)

In [182]:
def cal_standard_errors(theta, xi, data,  v):
    #set up variables
    xi =np.array([xi]).transpose()
    x,y =  setup_data(data)
    z = setup_hausman(data)
    
    #set up weight matrices
    X,Z = np.array(x), np.array(z)
    V = Z.transpose().dot(xi).dot(xi.transpose()).dot(Z)
    A =  z.transpose().dot(z)
    #G = gradient(theta, data,  v, z, h=1e-6)
    G = gamma
    GAG_inv =  np.linalg.inv( G.transpose().dot(A).dot(G) )
    GAVAG = G.transpose().dot(A).dot(V).dot(A).dot(G)
    return GAG_inv.dot(GAVAG).dot(GAG_inv)

se = cal_standard_errors(theta, xi_est, data,  v)/NOBS

se1 = np.diagonal(se)[:4]
se2 = np.diagonal(se)[4:]

Below we can see the standard errors calculated using the formula. The are high from, rounding error from calculating the gradient numerically

In [183]:
print '------------------------------------------------------------------'
print 'Mean Coefficients (Standard Error) \n------------------------------------------------------------------'
labels1 = np.array(['Network Score','Satisfaction Score','PPO','Premium'])
print pd.DataFrame([labels1, se1])
print '------------------------------------------------------------------'

print 'Coefficients Variance (Standard Error)'
print '------------------------------------------------------------------'
print pd.DataFrame([labels1,se2])
print '------------------------------------------------------------------'

------------------------------------------------------------------
Mean Coefficients (Standard Error) 
------------------------------------------------------------------
               0                   1        2        3
0  Network Score  Satisfaction Score      PPO  Premium
1         34.769             195.378  12.0389 -3.51987
------------------------------------------------------------------
Coefficients Variance (Standard Error)
------------------------------------------------------------------
               0                   1       2        3
0  Network Score  Satisfaction Score     PPO  Premium
1       -7604.69             2739.24  452893     None
------------------------------------------------------------------
