#  제9장

In [1]:
%matplotlib inline
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
import os
from ipypublish import nb_setup

In [2]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col

In [3]:
# Load the education2020.csv dataset (path is relative to the current working directory).
df = pd.read_csv("education2020.csv")

In [4]:
# Keep only the rows where income, asset, education_year and age are all
# strictly positive (rows with a missing value in any of them are dropped too,
# because NaN > 0 evaluates to False).
df1 = df.loc[df[['income', 'asset', 'education_year', 'age']].gt(0).all(axis=1)]

### Model 1 : y : income, x: education_year
### Model 2 : y : income, x: education_year, age, age2
### Model 3 : y : income, x: education_year, age, age2, sex
### Model 4 : y : income, x: education_year, age, age2, sex, job_code
### Model 5 : y : income, x: education_year, age, age2, sex, job_code, marriage_code

In [5]:
# Work on an independent copy so that the column assignments made on df2
# leave df1 unchanged.
df2=df1.copy()

In [6]:
# Quadratic age term, stored as 'age2' for use as a regressor.
df2['age2'] = df2['age'].pow(2)

In [7]:
# Store the sex code as an integer.
# NOTE(review): presumably needed so the dummy columns derived from it are
# named 's1'/'s2' rather than 's1.0'/'s2.0' — confirm against the raw dtype.
df2['sex'] = df2['sex'].astype(int)

In [8]:
# Expose the 'job' column under the name 'job_code'; the values are copied unchanged.
df2['job_code'] = df2['job']

In [9]:
# One indicator column per marriage code, named 'm<code>' (m1, m2, ...).
# dtype=int forces 0/1 integers: pandas >= 2.0 returns bool dummies by default,
# and a design matrix that mixes bool with numeric columns is converted to an
# object array, which statsmodels OLS refuses to fit.
marriage_dummies = pd.get_dummies(df2['marriage'], dtype=int).rename(columns=lambda x: 'm'+str(x))

In [10]:
# Display the marriage indicator columns to check the generated names and values.
marriage_dummies

Unnamed: 0,m1,m2,m3,m4
0,0,1,0,0
1,0,1,0,0
2,0,0,1,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
18059,0,1,0,0
18060,0,1,0,0
18061,0,1,0,0
18062,0,0,0,1


In [11]:
# One indicator column per sex code, named 's<code>'.
# dtype=int keeps the dummies as 0/1 integers (pandas >= 2.0 would otherwise
# return bool columns, which break the numeric design matrix passed to OLS).
sex_dummies = pd.get_dummies(df2['sex'], dtype=int).rename(columns=lambda x: 's'+str(x))

In [12]:
# One indicator column per job code, named 'd<code>'.
# dtype=int keeps the dummies as 0/1 integers (pandas >= 2.0 would otherwise
# return bool columns, which break the numeric design matrix passed to OLS).
job_dummies = pd.get_dummies(df2['job_code'], dtype=int).rename(columns=lambda x: 'd'+str(x))

In [13]:
# Attach the sex, job and marriage indicator columns to df2 side by side
# (rows are matched on the index).
df3 = pd.concat(
    [df2, sex_dummies, job_dummies, marriage_dummies],
    axis='columns',
)

In [14]:
# Intercept column for the OLS design matrices.
df3['const'] = 1

# Regressor lists for the five nested specifications; each model extends the
# previous one, so the lists are built incrementally instead of being retyped.
# Only s2, d2..dA and m2..m4 are included: one category per dummy set is left
# out as the reference group so the dummies are not collinear with 'const'.
X1 = ['const', 'education_year']
X2 = X1 + ['age', 'age2']
X3 = X2 + ['s2']
X4 = X3 + ['d2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'dA']
X5 = X4 + ['m2', 'm3', 'm4']

# Estimate an OLS regression of income on each set of variables.
# astype(float) guarantees a purely numeric design matrix even when the dummy
# columns are stored as bool (pandas >= 2.0 default), which would otherwise
# produce an object array that statsmodels rejects.
# missing='drop' discards observations with a NaN in income or any regressor.
reg1, reg2, reg3, reg4, reg5 = [
    sm.OLS(df3['income'], df3[cols].astype(float), missing='drop').fit()
    for cols in (X1, X2, X3, X4, X5)
]

In [15]:
# Print the full statsmodels summary for reg3 (regressors: const, education_year, age, age2, s2).
print(reg3.summary())

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.203
Model:                            OLS   Adj. R-squared:                  0.203
Method:                 Least Squares   F-statistic:                     1090.
Date:                Mon, 13 Feb 2023   Prob (F-statistic):               0.00
Time:                        16:10:32   Log-Likelihood:            -1.7031e+05
No. Observations:               17135   AIC:                         3.406e+05
Df Residuals:                   17130   BIC:                         3.407e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const          -1.184e+04    568.382    -20.

In [16]:
# Extra rows reported for every model: R-squared rounded to two decimals and
# the number of observations as an integer.
info_dict = {
    'R-squared': lambda res: f"{res.rsquared:.2f}",
    'No. observations': lambda res: f"{int(res.nobs):d}",
}

# Side-by-side comparison of the five fitted models, with significance stars
# and the regressors listed in the order they enter the specifications.
results_table = summary_col(
    results=[reg1, reg2, reg3, reg4, reg5],
    float_format='%0.2f',
    stars=True,
    model_names=['Model 1', 'Model 2', 'Model 3', 'Model 4', 'Model 5'],
    info_dict=info_dict,
    regressor_order=[
        'const', 'education_year', 'age', 'age2', 's2',
        'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'dA',
        'm2', 'm3', 'm4',
    ],
)
results_table.add_title('Table 1 - OLS Regressions')

print(results_table)

                           Table 1 - OLS Regressions
                   Model 1     Model 2      Model 3      Model 4      Model 5   
--------------------------------------------------------------------------------
const            -1809.22*** -14003.25*** -11842.16*** -12886.04*** -12679.13***
                 (142.12)    (565.06)     (568.38)     (560.27)     (552.29)    
education_year   607.63***   572.07***    509.87***    379.69***    350.49***   
                 (11.03)     (12.84)      (13.05)      (14.09)      (13.86)     
age                          483.86***    451.48***    473.47***    411.92***   
                             (18.86)      (18.70)      (18.61)      (19.11)     
age2                         -4.33***     -4.07***     -4.01***     -3.72***    
                             (0.16)       (0.16)       (0.16)       (0.17)      
s2                                        -1916.66***  -1962.54***  -622.95***  
                                          (94.19)      (