In [18]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS

In [None]:
# Read the final dataset. The relative path points one directory above the
# notebook's working directory — presumably the notebook is run from src/; TODO confirm.
dataset_path = '../output/data/final_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,rank,app,developer,country,app_id,price,ndownloads,lowerbound,upperbound,averagescore,...,inapppurchases,estimateddownloads,date,newgroup,iap,highest_rank,lowest_rank,multiplier,new_est,nest
0,476,KUNI Cam,GinnyPix,Sweden,com.ginnypix.kuni,4.49,"50,000+",50000,100000,4.5,...,True,50000,9/21/19,OUTSIDE,1,12,498,102.88066,52263.375,OUTSIDE
1,452,Link2SD Plus (New),Bulent Akpinar,Spain,com.buak.link2sdplus,2.35,"100,000+",100000,500000,3.3,...,False,282840,9/21/19,OUTSIDE,0,2,493,814.66394,133401.22,OUTSIDE
2,291,Pushy,medienwerkstatt,Germany,de.fk.android.pushy,1.49,"10,000+",10000,50000,4.4,...,False,25040,9/21/19,OUTSIDE,0,11,494,82.815735,26811.594,OUTSIDE
3,397,EasyMSR,DEFTUN TECH,United States,com.gbtf.msrx6pro,19.99,"10,000+",10000,50000,3.5,...,False,10000,9/21/19,OUTSIDE,0,9,500,81.466393,18391.039,OUTSIDE
4,154,GTA: Liberty City Stories,Rockstar Games,Australia,com.rockstargames.gtalcs,6.99,"100,000+",100000,500000,4.3,...,False,253380,9/21/19,OUTSIDE,0,3,489,823.04529,375720.16,OUTSIDE


# Part I Demand Estimation
1.1 Berry Logit

Compute market share

In [None]:
# Market size per country: sum of `new_est` over every row of that country
# (rows tagged OUTSIDE are still present at this point and are included).
df['total_market'] = df.groupby('country')['new_est'].transform('sum')
df['s_j'] = df['new_est'].div(df['total_market'])

# Outside-good share: `new_est` summed over rows whose nest is OUTSIDE
# (case-insensitive), per country, then mapped back onto every row.
is_outside = df['nest'].str.upper() == 'OUTSIDE'
outside_by_country = df.loc[is_outside].groupby('country')['new_est'].sum()
df['Q_out'] = df['country'].map(outside_by_country)
df['s_0'] = df['Q_out'].div(df['total_market'])

# Keep only the inside goods for estimation.
# NOTE: this rebinds `df`, so re-running the cell without reloading the data
# finds no OUTSIDE rows and leaves Q_out / s_0 as NaN.
df = df[df['nest'].str.upper() != 'OUTSIDE'].copy()

Estimate the Berry equation without instruments for price

In [17]:
# Dependent variable of the Berry logit: log(s_j) - log(s_0).
# Shares are floored at `epsilon` so the logarithm never sees a value below it.
epsilon = 1e-10
log_inside_share = np.log(df['s_j'].clip(lower=epsilon))
log_outside_share = np.log(df['s_0'].clip(lower=epsilon))
df['y'] = log_inside_share - log_outside_share

formula = 'y ~ price + averagescore + iap'

# Pooled OLS with heteroskedasticity-robust (HC1) standard errors.
ols = smf.ols(formula=formula, data=df).fit(cov_type='HC1')
print(ols.summary())

# Same specification with country fixed effects added.
fe_formula = formula + " + C(country)"
fe = smf.ols(fe_formula, data=df).fit(cov_type="HC1")
print(fe.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.060
Method:                 Least Squares   F-statistic:                     104.2
Date:                Sat, 04 Oct 2025   Prob (F-statistic):           3.31e-65
Time:                        19:33:21   Log-Likelihood:                -9838.1
No. Observations:                4504   AIC:                         1.968e+04
Df Residuals:                    4500   BIC:                         1.971e+04
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -7.3341      0.317    -23.144   

Estimate Berry equation with price instruments

In [33]:
# Instrument: log of the number of (non-null) app_id entries in the app's genre.
# The count is floored at 1 so the log is never taken of zero.
genre_app_count = df.groupby('appgenre')['app_id'].transform('count').astype(float)
df["Z_numapps"] = np.log(genre_app_count.clip(lower=1))

# Drop rows with missing or infinite values in any variable used by the model.
cols = ["y", "price", "averagescore", "iap", "country", "Z_numapps"]
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=cols)
df["iap"] = df["iap"].astype(int)
df["country"] = df["country"].astype(str)

# Country fixed effects as dummies (first level dropped) next to the exogenous controls.
fe = pd.get_dummies(df["country"], prefix="cty", drop_first=True)
X = pd.concat([df[["averagescore", "iap"]], fe], axis=1)
exog = pd.DataFrame({"const": 1}, index=df.index).join(X)

# 2SLS: price is endogenous and instrumented by Z_numapps.
# NOTE(review): standard errors are clustered on country, so there are only as
# many clusters as countries — confirm that is enough for clustered inference.
iv_model = IV2SLS(
    dependent=df["y"],
    exog=exog,
    endog=df[["price"]],
    instruments=df[["Z_numapps"]],
)
iv_res = iv_model.fit(cov_type="clustered", clusters=df["country"])

print("\n=== IV (2SLS) with country FE, clustered SE by country ===")
print(iv_res.summary)


# First stage: regress price on the instrument, the controls and country
# fixed effects, with standard errors clustered by country.
first_stage_formula = "price ~ Z_numapps + averagescore + iap + C(country)"
fs_mod = smf.ols(first_stage_formula, data=df).fit(
    cov_type="cluster",
    cov_kwds={"groups": df["country"]},
)

print("\n=== First Stage OLS (clustered by country) ===")
print(fs_mod.summary())

# With a single excluded instrument the robust F statistic is the squared robust t statistic.
t_iv = fs_mod.tvalues["Z_numapps"]
F_iv = float(t_iv ** 2)
print(f"\n[First-stage] robust t(Z_numapps) = {t_iv:.2f}  →  robust F = {F_iv:.2f}")

# Partial R² of the instrument: residualise both price and Z_numapps on the
# controls + country FE, then square the correlation of the two residual series.
controls_fe = "averagescore + iap + C(country)"
price_on_controls = smf.ols("price ~ " + controls_fe, data=df).fit()
instrument_on_controls = smf.ols("Z_numapps ~ " + controls_fe, data=df).fit()
res_p = price_on_controls.resid
res_z = instrument_on_controls.resid
partial_R2 = np.corrcoef(res_p, res_z)[0, 1] ** 2
print(f"[First-stage] partial R² (given controls+FE): {partial_R2:.4f}")




=== IV (2SLS) with country FE, clustered SE by country ===
                          IV-2SLS Estimation Summary                          
Dep. Variable:                      y   R-squared:                      0.0717
Estimator:                    IV-2SLS   Adj. R-squared:                 0.0689
No. Observations:                4504   F-statistic:                 6.965e+18
Date:                Sat, Oct 04 2025   P-value (F-stat)                0.0000
Time:                        20:09:48   Distribution:                 chi2(14)
Cov. Estimator:             clustered                                         
                                                                              
                                 Parameter Estimates                                  
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
const                 -7.3605     0.7644    -9.



# Alternative approach: market-level instruments (market = country × genre)

In [34]:
# A market is a (country, appgenre) pair.
market = ['country', 'appgenre']

# (a) Number of rivals: apps in the same market, excluding the app itself.
g = df.groupby(market)['app_id'].transform('count')
df['Z_rivals'] = g.sub(1).clip(lower=0).astype(float)

# Log of the rival count, set to 0 where there are no rivals.
# NOTE: log(1) is also 0, so markets with zero and with one rival share the same value.
positive_rivals = df['Z_rivals'].where(df['Z_rivals'] != 0)
df['Z_logrivals'] = np.log(positive_rivals).fillna(0.0)

# (b) rivals' average characteristics within market (BLP-style)
def loo_mean(s):
    """Return, for each element of ``s``, the mean of the *other* elements.

    Intended for ``groupby(...).transform(loo_mean)``: every row receives the
    average of its group's values with its own value left out.

    Missing values are ignored consistently in both the numerator and the
    denominator: a NaN never counts as a rival, and a row whose own value is
    NaN still receives the mean of the observed values of the others.

    Parameters
    ----------
    s : pandas.Series
        Values of one group.

    Returns
    -------
    pandas.Series
        Leave-one-out means, indexed like ``s``. When a row has no observed
        rival (singleton group, or all other values missing) the result is 0.0.
    """
    observed = s.notna()
    # Rivals with a usable value: all observed members, minus the row itself
    # when its own value is observed.
    n_rivals = observed.sum() - observed.astype(int)
    # `s.sum()` skips NaN, so subtract the own value only when it is observed.
    rivals_total = s.sum() - s.fillna(0)
    # Flooring the divisor at 1 turns the no-rival case into 0 / 1 = 0.
    return rivals_total / np.maximum(n_rivals, 1)

grp = df.groupby(market)

# Rivals' average characteristics within each market; any NaN left by the
# transform is replaced with 0.
rival_mean_columns = {
    'Z_rivals_avgscore': 'averagescore',
    'Z_rivals_iapshare': 'iap',
}
for target_col, source_col in rival_mean_columns.items():
    df[target_col] = grp[source_col].transform(loo_mean).fillna(0.0)

# Cluster identifier at the market level: "<country>|<appgenre>".
country_label = df['country'].astype(str)
genre_label = df['appgenre'].astype(str)
df['cluster_market'] = country_label.str.cat(genre_label, sep='|')


In [39]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS, IVLIML

# ===== SETTINGS =====
# Excluded instruments, exogenous controls (with country FE) and the cluster key.
instr = ['Z_logrivals', 'Z_rivals_avgscore', 'Z_rivals_iapshare']
controls_fe = 'averagescore + iap + C(country)'
cluster_key = df['cluster_market']   # market = country|genre

# ===== 2SLS =====
# linearmodels formula syntax: the bracketed part is [ endogenous ~ instruments ].
instr_terms = ' + '.join(instr)
iv_formula = f"y ~ 1 + {controls_fe} + [ price ~ {instr_terms} ]"
iv = IV2SLS.from_formula(iv_formula, data=df).fit(
    cov_type="clustered",
    clusters=cluster_key,
)

print("\n=== IV (2SLS) with country FE, clustered by market ===")
print(iv.summary)

# ===== FIRST STAGE (with FE) via statsmodels OLS =====
# Regress price on the excluded instruments plus controls and country FE,
# with standard errors clustered at the market level.
fs_formula = f"price ~ {' + '.join(instr)} + {controls_fe}"
fs = smf.ols(fs_formula, data=df).fit(cov_type="cluster", cov_kwds={"groups": cluster_key})

print("\n=== First Stage OLS (clustered by market) ===")
print(fs.summary())

# --- Joint Wald test of the instruments (cluster-robust covariance) ---
# H0: R @ beta = 0, where R selects only the instrument coefficients.
params = fs.params
cov = fs.cov_params()
pidx = params.index
L = len(instr)
R = np.zeros((L, len(pidx)))
for i, nm in enumerate(instr):
    R[i, pidx.get_loc(nm)] = 1.0

r = R @ params.values            # R @ beta
V = R @ cov.values @ R.T         # R @ Var(beta) @ R'
# Wald chi2 = r' V^{-1} r; solve the linear system rather than forming the
# explicit inverse, which is numerically more stable. F = chi2 / L.
chi2_stat = float(r @ np.linalg.solve(V, r))
F_stat = chi2_stat / L

# p-value from the chi2(L) upper tail. `sf` is used instead of `1 - cdf`,
# which rounds to exactly 0 once the statistic is large. Only a missing SciPy
# falls back to NaN; any other error is allowed to surface.
try:
    from scipy.stats import chi2
except ImportError:
    pval = np.nan
else:
    pval = float(chi2.sf(chi2_stat, df=L))

print(f"\n[First-stage] Wald chi2({L}) = {chi2_stat:.2f},  F = {F_stat:.2f},  p = {pval:.4g}")

# --- Partial R² of the instruments (given controls + FE), via FWL ---
# Step 1: residualise price on controls + FE.
res_p = smf.ols(f"price ~ {controls_fe}", data=df).fit().resid

# Step 2: residualise each instrument on the same controls + FE.
instrument_residuals = {}
for z_name in instr:
    instrument_residuals[z_name] = smf.ols(f"{z_name} ~ {controls_fe}", data=df).fit().resid
res_Z = pd.DataFrame(instrument_residuals, index=df.index)

# Step 3: regress the price residuals on the instrument residuals; the R² of
# this regression is the partial R².
res_mod = sm.OLS(res_p, sm.add_constant(res_Z)).fit()
partial_R2 = res_mod.rsquared
print(f"[First-stage] partial R² (given controls+FE): {partial_R2:.4f}")

# ===== LIML =====
# Same specification as the 2SLS model above, estimated by LIML.
# NOTE(review): the printed label calls this "weak-IV-robust"; confirm that
# wording is intended, since it is a statement about the estimator rather than
# about the clustered standard errors reported with it.
liml = IVLIML.from_formula(
    f"y ~ 1 + {controls_fe} + [ price ~ {' + '.join(instr)} ]",
    data=df
).fit(cov_type="clustered", clusters=cluster_key)

print("\n=== LIML (weak-IV-robust), clustered by market ===")
print(liml.summary)

# ===== Wu–Hausman (endogeneity of price) =====
# Unlike `summary`, `wu_hausman` is a method on linearmodels IV results: it has
# to be called, otherwise the bound-method repr is printed instead of the test.
# NOTE(review): consider whether a cluster-robust alternative (e.g. the
# `wooldridge_regression` property) suits the clustered fit better — confirm.
print("\nWu–Hausman test for endogeneity of price:")
print(iv.wu_hausman())



=== IV (2SLS) with country FE, clustered by market ===
                          IV-2SLS Estimation Summary                          
Dep. Variable:                      y   R-squared:                     -0.2516
Estimator:                    IV-2SLS   Adj. R-squared:                -0.2555
No. Observations:                4504   F-statistic:                    170.96
Date:                Sat, Oct 04 2025   P-value (F-stat)                0.0000
Time:                        20:21:59   Distribution:                 chi2(14)
Cov. Estimator:             clustered                                         
                                                                              
                                      Parameter Estimates                                       
                              Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------------------------
Intercept            