# Explore the effects of model misspecification on Oliveira et al. 2016
Sources: 
- [Paper]()
- [Dataset]()

In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import os

# Replicate the analyses

## Dataset: Stackoverflow

## Summary of replication attempt:
- There is a missing value in the dataset for Russia (pr_editions).
- 

In [34]:
# Load the StackOverflow data, one row per country
df_so_raw = pd.read_csv("data/stackoverflow.csv")

# Filter out any countries with missing data:
# "...we consider only the countries for which data for all explanatory
# variables is available (n =51 for StackOverflow and n =45 for Superuser)."
required_columns = [
    "pr_questions", "pr_answers", "pr_comments", "pr_editions",
    "IDV", "GNI", "Internet", "English", "WVS",
]
df_so_complete = df_so_raw.dropna(axis="index", how="any", subset=required_columns)
assert len(df_so_complete) == 51, f"expected 51 countries, got {len(df_so_complete)}"

# BUG FOUND: Russian Federation data for pr_editions is missing
# Work around: Remove Russian Federation
# NOTE(review): the row survives the dropna above (51 rows are still asserted),
# so the missing value is apparently not encoded as NaN -- confirm the encoding.
# `.copy()` makes df_so an independent frame, so the column assignment below
# does not raise pandas' SettingWithCopyWarning.
df_so = df_so_complete.loc[df_so_complete["country.iso"] != "RUS"].copy()
assert len(df_so) == 50, f"expected 50 countries, got {len(df_so)}"

# Force pr_editions to float so it can be used as a regression outcome
df_so["pr_editions"] = df_so["pr_editions"].astype("float")

# Show the filtered dataset (use df_so.dtypes to inspect the column types)
df_so


Unnamed: 0,country.iso,country.name,num_u,num_contrib,gini,pr_total,particip_lower,particip_upper,var,pr_questions,pr_answers,pr_comments,pr_editions,IDV,GNI,Internet,English,WVS
0,GTM,Guatemala,215,8112,0.948123,0.665116,0.599622,0.724814,0.125192,0.446512,0.516279,0.172093,0.055814,6.0,3140.0,12.3,45.77,-0.17
4,COL,Colombia,1224,17713,0.875322,0.658497,0.631471,0.68453,0.053059,0.496732,0.477941,0.171569,0.059641,13.0,6990.0,40.4,48.54,0.6
6,IDN,Indonesia,3585,35597,0.904504,0.543375,0.527032,0.559626,0.032594,0.421478,0.32106,0.110739,0.035983,14.0,3420.0,12.3,52.74,-0.8
9,PER,Peru,534,42535,0.95549,0.640449,0.598879,0.680014,0.081135,0.468165,0.477528,0.228464,0.093633,16.0,5880.0,36.0,51.46,0.03
12,KOR,"Korea, Republic of",1784,24887,0.929445,0.533632,0.510436,0.556684,0.046249,0.406951,0.319507,0.106502,0.04148,18.0,22670.0,83.8,53.62,-1.37
13,SLV,El Salvador,133,2511,0.8786,0.631579,0.54698,0.708791,0.161811,0.496241,0.473684,0.135338,0.052632,19.0,3580.0,18.9,43.46,0.53
14,CHN,China,13401,145338,0.925213,0.511753,0.503288,0.520211,0.016924,0.363107,0.35572,0.107828,0.029774,20.0,5680.0,38.3,50.15,-1.16
16,VNM,Viet Nam,2319,33684,0.917303,0.558862,0.538572,0.578956,0.040384,0.421302,0.370418,0.128504,0.037516,20.0,1400.0,35.1,51.57,-0.26
18,THA,Thailand,1152,21466,0.904932,0.555556,0.526724,0.584018,0.057294,0.41059,0.388889,0.170139,0.05816,20.0,5210.0,23.7,47.79,0.01
21,CHL,Chile,1075,26163,0.914023,0.664186,0.635414,0.691789,0.056375,0.484651,0.483721,0.210233,0.076279,23.0,14280.0,52.3,48.75,0.0


In [40]:
# Variable specification shared by all models below

# Dependent variables
dvs = ["pr_questions", "pr_answers", "pr_comments", "pr_editions"]

# Baseline model IVs (included in every model)
baseline_ivs = ["GNI", "Internet", "English"]

# Model 1 IVs: baseline + IDV (Individualism/Collectivism)
m1_ivs = [*baseline_ivs, "IDV"]

# Model 2 IVs: baseline + WVS (Survival/Self-expression)
m2_ivs = [*baseline_ivs, "WVS"]
print(m2_ivs)

['GNI', 'Internet', 'English', 'WVS']


In [37]:
# Baseline models: each dependent variable regressed on the baseline IVs

# One formula string per dependent variable, e.g. "dv ~ iv1+iv2+iv3"
baseline_formulas = [f"{dv} ~ {'+'.join(baseline_ivs)}" for dv in dvs]
assert all(isinstance(formula, str) for formula in baseline_formulas)

# Build an OLS model per formula, keyed by the formula string.
# The models are stored unfitted; .fit() is called when the summary is printed.
baseline_results = {
    formula: smf.ols(formula, data=df_so) for formula in baseline_formulas
}

# Fit every model and print its summary table
print("All baseline models:")
for formula, ols_model in baseline_results.items():
    print(formula)
    print(ols_model.fit().summary())
    print("\n")

print("+++++ END BASELINE +++++")

All baseline models:
pr_questions ~ GNI+Internet+English
                            OLS Regression Results                            
Dep. Variable:           pr_questions   R-squared:                       0.516
Model:                            OLS   Adj. R-squared:                  0.485
Method:                 Least Squares   F-statistic:                     16.37
Date:                Mon, 25 Oct 2021   Prob (F-statistic):           2.23e-07
Time:                        10:32:28   Log-Likelihood:                 77.431
No. Observations:                  50   AIC:                            -146.9
Df Residuals:                      46   BIC:                            -139.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [38]:
# M1 models: each dependent variable regressed on the Model 1 IVs (m1_ivs)

# One formula string per dependent variable, e.g. "dv ~ iv1+iv2+iv3+iv4"
m1_formulas = [f"{dv} ~ {'+'.join(m1_ivs)}" for dv in dvs]
assert all(isinstance(formula, str) for formula in m1_formulas)

# Build an OLS model per formula, keyed by the formula string.
# The models are stored unfitted; .fit() is called when the summary is printed.
m1_results = {
    formula: smf.ols(formula, data=df_so) for formula in m1_formulas
}

# Fit every model and print its summary table
print("All M1 models:")
for formula, ols_model in m1_results.items():
    print(formula)
    print(ols_model.fit().summary())
    print("\n")

print("+++++ END M1 +++++")

All M1 models:
pr_questions ~ GNI+Internet+English+IDV
                            OLS Regression Results                            
Dep. Variable:           pr_questions   R-squared:                       0.547
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     13.58
Date:                Mon, 25 Oct 2021   Prob (F-statistic):           2.43e-07
Time:                        10:33:26   Log-Likelihood:                 79.065
No. Observations:                  50   AIC:                            -148.1
Df Residuals:                      45   BIC:                            -138.6
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------

In [41]:
# M2 models: each dependent variable regressed on the Model 2 IVs (m2_ivs)

# One formula string per dependent variable, e.g. "dv ~ iv1+iv2+iv3+iv4"
m2_formulas = [f"{dv} ~ {'+'.join(m2_ivs)}" for dv in dvs]
assert all(isinstance(formula, str) for formula in m2_formulas)

# Build an OLS model per formula, keyed by the formula string.
# The models are stored unfitted; .fit() is called when the summary is printed.
m2_results = {
    formula: smf.ols(formula, data=df_so) for formula in m2_formulas
}

# Fit every model and print its summary table
print("All M2 models:")
for formula, ols_model in m2_results.items():
    print(formula)
    print(ols_model.fit().summary())
    print("\n")

print("+++++ END M2 +++++")

All M2 models:
pr_questions ~ GNI+Internet+English+WVS
                            OLS Regression Results                            
Dep. Variable:           pr_questions   R-squared:                       0.527
Model:                            OLS   Adj. R-squared:                  0.485
Method:                 Least Squares   F-statistic:                     12.53
Date:                Mon, 25 Oct 2021   Prob (F-statistic):           6.25e-07
Time:                        10:47:06   Log-Likelihood:                 77.979
No. Observations:                  50   AIC:                            -146.0
Df Residuals:                      45   BIC:                            -136.4
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------