In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
# Load the dataset from data3.csv (path is relative to the working directory).
# Later cells read its dem_ind, country, year and log_gdppc columns.
data = pd.read_csv("data3.csv")

In [3]:
# Describe the dem_ind column, asking for the 10th, 25th, 50th, 75th and 90th
# percentiles in addition to count/mean/std/min/max.
dem_ind_stats = data['dem_ind'].describe(
    percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]
)

# Pull each statistic out of the describe() result by its row label.
dem_ind_min, dem_ind_max = dem_ind_stats.loc['min'], dem_ind_stats.loc['max']
dem_ind_mean, dem_ind_std = dem_ind_stats.loc['mean'], dem_ind_stats.loc['std']
dem_ind_10th, dem_ind_25th, dem_ind_50th, dem_ind_75th, dem_ind_90th = (
    dem_ind_stats.loc[label] for label in ('10%', '25%', '50%', '75%', '90%')
)

# Show the full summary followed by every extracted value.
(
    dem_ind_stats,
    dem_ind_min,
    dem_ind_max,
    dem_ind_mean,
    dem_ind_std,
    dem_ind_10th,
    dem_ind_25th,
    dem_ind_50th,
    dem_ind_75th,
    dem_ind_90th,
)

(count    1266.000000
 mean        0.499073
 std         0.371337
 min         0.000000
 10%         0.000000
 25%         0.166667
 50%         0.500000
 75%         0.833333
 90%         1.000000
 max         1.000000
 Name: dem_ind, dtype: float64,
 0.0,
 1.0,
 0.49907319871879935,
 0.371336742550142,
 0.0,
 0.166666701,
 0.5,
 0.833333313,
 1.0)

In [6]:
# Row masks: every United States row, and the United States rows for year 2000.
is_us = data['country'] == 'United States'
is_us_2000 = is_us & (data['year'] == 2000)

# dem_ind for the US in 2000 (as a NumPy array) and the mean of dem_ind over
# all US rows in the data.
us_2000_dem_ind = data.loc[is_us_2000, 'dem_ind'].values
us_avg_dem_ind = data.loc[is_us, 'dem_ind'].mean()

us_avg_dem_ind

0.9855555561111111

In [7]:
# Show the United States dem_ind value(s) for year 2000 computed in the previous cell.
us_2000_dem_ind

array([1.])

In [8]:
# Row masks: every Libya row, and the Libya rows for year 2000.
is_libya = data['country'] == 'Libya'
is_libya_2000 = is_libya & (data['year'] == 2000)

# dem_ind for Libya in 2000 (as a NumPy array) and the mean of dem_ind over
# all Libya rows in the data.
libya_2000_dem_ind = data.loc[is_libya_2000, 'dem_ind'].values
libya_avg_dem_ind = data.loc[is_libya, 'dem_ind'].mean()

(libya_2000_dem_ind, libya_avg_dem_ind)

(array([0.]), 0.10925926755555555)

In [9]:
# Mean of dem_ind for each country, indexed by country name.
country_avg_dem_ind = data.groupby('country')['dem_ind'].mean()

# Bands of the per-country mean: above 0.95, below 0.10, and within [0.3, 0.7].
above_0_95 = country_avg_dem_ind > 0.95
below_0_10 = country_avg_dem_ind < 0.10
within_0_3_0_7 = country_avg_dem_ind.between(0.3, 0.7)  # closed on both ends

# Keep the first five country names in each band, in the groupby's key order.
countries_greater_0_95 = country_avg_dem_ind.loc[above_0_95].index[:5].tolist()
countries_less_0_10 = country_avg_dem_ind.loc[below_0_10].index[:5].tolist()
countries_between_0_3_and_0_7 = (
    country_avg_dem_ind.loc[within_0_3_0_7].index[:5].tolist()
)

(countries_greater_0_95, countries_less_0_10, countries_between_0_3_and_0_7)

(['Australia', 'Austria', 'Barbados', 'Belgium', 'Belize'],
 ['Afghanistan', 'Angola', 'Brunei', 'Burundi', 'China'],
 ['Antigua', 'Argentina', 'Armenia', 'Bangladesh', 'Bolivia'])

In [10]:
# Keep only the rows where the outcome, the regressor and the cluster label
# are all present.
regression_columns = ['dem_ind', 'log_gdppc', 'country']
regression_data = data.loc[:, regression_columns].dropna()

# OLS of dem_ind on log_gdppc plus an intercept, with standard errors
# clustered on the country column.
clustered_outcome = regression_data['dem_ind']
clustered_design = sm.add_constant(regression_data['log_gdppc'])
cluster_options = {'groups': regression_data['country']}
model = sm.OLS(clustered_outcome, clustered_design).fit(
    cov_type='cluster',
    cov_kwds=cluster_options,
)

# Slope on log_gdppc and its p-value under the clustered covariance.
coef = model.params['log_gdppc']
p_value = model.pvalues['log_gdppc']

(model.summary(), coef, p_value)

(<class 'statsmodels.iolib.summary.Summary'>
 """
                             OLS Regression Results                            
 Dep. Variable:                dem_ind   R-squared:                       0.438
 Model:                            OLS   Adj. R-squared:                  0.438
 Method:                 Least Squares   F-statistic:                     396.4
 Date:                Fri, 14 Jun 2024   Prob (F-statistic):           7.94e-44
 Time:                        14:43:04   Log-Likelihood:                -110.72
 No. Observations:                 958   AIC:                             225.4
 Df Residuals:                     956   BIC:                             235.2
 Df Model:                           1                                         
 Covariance Type:              cluster                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
 ---------------------------------------------------------------------

In [11]:
# Refit the dem_ind ~ log_gdppc regression without passing a cov_type, so the
# standard error comes from statsmodels' default covariance estimate.
plain_outcome = regression_data['dem_ind']
plain_design = sm.add_constant(regression_data['log_gdppc'])
model_no_cluster = sm.OLS(plain_outcome, plain_design).fit()

# Slope on log_gdppc and its (non-clustered) standard error.
coef_no_cluster = model_no_cluster.params['log_gdppc']
std_err_no_cluster = model_no_cluster.bse['log_gdppc']

(coef_no_cluster, std_err_no_cluster)

(0.23567310887708887, 0.008625848444172586)