In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

### Housing Price Problem

In [2]:
# Load the housing data and cast the three indicator columns to int in a
# single chained step; all other columns keep the dtype read_sas gave them.
utown = pd.read_sas('utown.sas7bdat').astype(
    {'utown': int, 'pool': int, 'fplace': int}
)

In [3]:
# Preview the first five rows to check the loaded columns and the int cast.
print(utown.head(n=5))

     price   sqft  age  utown  pool  fplace
0  205.452  23.46  6.0      0     0       1
1  185.328  20.03  5.0      0     0       1
2  248.422  27.77  6.0      0     0       0
3  154.690  20.17  1.0      0     0       0
4  221.801  26.45  0.0      0     0       1


In [4]:
# Baseline specification: price regressed on sqft only (no indicator terms).
mdl_utown_dummy1 = smf.ols(formula='price~sqft', data=utown).fit()
print(mdl_utown_dummy1.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.354
Model:                            OLS   Adj. R-squared:                  0.353
Method:                 Least Squares   F-statistic:                     546.0
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           1.13e-96
Time:                        14:38:28   Log-Likelihood:                -4942.5
No. Observations:                1000   AIC:                             9889.
Df Residuals:                     998   BIC:                             9899.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     30.9203      9.337      3.312      0.0

In [5]:
# Intercept-indicator specification: the utown dummy enters additively, so it
# shifts the intercept while the sqft slope is shared by both groups.
mdl_utown_dummy2 = smf.ols(formula='price~utown+sqft', data=utown).fit()
print(mdl_utown_dummy2.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.865
Method:                 Least Squares   F-statistic:                     3192.
Date:                Tue, 16 Feb 2021   Prob (F-statistic):               0.00
Time:                        14:38:28   Log-Likelihood:                -4159.7
No. Observations:                1000   AIC:                             8325.
Df Residuals:                     997   BIC:                             8340.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.6809      4.290      1.324      0.1

In [6]:
# Slope-indicator regressor: equals sqft where utown == 1 and 0 otherwise.
utown['sqft_utown'] = utown['utown']*utown['sqft']

# Slope-indicator specification: common intercept, sqft slope may differ by utown.
mdl_utown_dummy3 = smf.ols(formula='price~sqft+sqft_utown', data=utown).fit()
print(mdl_utown_dummy3.summary())


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.865
Method:                 Least Squares   F-statistic:                     3205.
Date:                Tue, 16 Feb 2021   Prob (F-statistic):               0.00
Time:                        14:38:28   Log-Likelihood:                -4157.9
No. Observations:                1000   AIC:                             8322.
Df Residuals:                     997   BIC:                             8337.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     38.1775      4.264      8.953      0.0

In [7]:
# Intercept- and slope-indicator specification: utown shifts both the intercept
# (utown) and the sqft slope (sqft_utown, built in the previous cell).
mdl_utown_dummy4 = smf.ols(formula='price~utown+sqft+sqft_utown', data=utown).fit()
print(mdl_utown_dummy4.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.867
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     2162.
Date:                Tue, 16 Feb 2021   Prob (F-statistic):               0.00
Time:                        14:38:28   Log-Likelihood:                -4152.5
No. Observations:                1000   AIC:                             8313.
Df Residuals:                     996   BIC:                             8333.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.0625      6.239      3.696      0.0

In [8]:
# Full housing model: the intercept/slope indicators for utown plus the
# remaining columns of the frame (age, pool, fplace) as additional regressors.
mdl_utown = smf.ols(
    formula='price~utown+sqft+sqft_utown+age+pool+fplace',
    data=utown,
).fit()
print(mdl_utown.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.871
Model:                            OLS   Adj. R-squared:                  0.870
Method:                 Least Squares   F-statistic:                     1113.
Date:                Tue, 16 Feb 2021   Prob (F-statistic):               0.00
Time:                        14:38:28   Log-Likelihood:                -4138.4
No. Observations:                1000   AIC:                             8291.
Df Residuals:                     993   BIC:                             8325.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     24.5000      6.192      3.957      0.0

### Wage Problem concerning Gender, Race, Location 

In [9]:
# Load the wage data; the format is stated explicitly instead of being
# inferred from the file extension.
cps4_small = pd.read_sas('cps4_small.sas7bdat', format='sas7bdat')

In [10]:
# Hand-built interaction column; not used by the formula below, but referenced
# by name in the formulas of later cells.
cps4_small['black_female'] = cps4_small['female']*cps4_small['black']

# In the formula language `black*female` expands to black + female + black:female,
# so this model already contains both main effects and their interaction.
mdl_wage1 = smf.ols(formula='wage~educ+black*female', data=cps4_small).fit()
print(mdl_wage1.summary())

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.206
Method:                 Least Squares   F-statistic:                     65.67
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           2.53e-49
Time:                        14:38:28   Log-Likelihood:                -3853.5
No. Observations:                1000   AIC:                             7717.
Df Residuals:                     995   BIC:                             7741.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -5.2812      1.900     -2.779   

### Note:

- When we use `black*female` as a regressor in the formula, we actually add three explanatory variables: `black`, `female`, and their interaction `black:female` (= `black`$\times$`female`). When we have a lot of variables, it might be easier to generate the interaction as a `black_female` variable by hand and list each term explicitly.

In [11]:
# Regional specification: educ plus the south, midwest and west indicators.
mdl_wage2 = smf.ols(formula='wage~educ+south+midwest+west', data=cps4_small).fit()
print(mdl_wage2.summary())

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.186
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     56.84
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           3.19e-43
Time:                        14:38:28   Log-Likelihood:                -3867.7
No. Observations:                1000   AIC:                             7745.
Df Residuals:                     995   BIC:                             7770.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -6.1767      2.051     -3.012      0.0

In [12]:
# Combined specification: the race/gender terms (with the hand-built
# black_female interaction) together with the three regional indicators.
mdl_wage3 = smf.ols(
    formula='wage~educ+black+female+black_female+south+midwest+west',
    data=cps4_small,
).fit()
print(mdl_wage3.summary())


                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.219
Model:                            OLS   Adj. R-squared:                  0.213
Method:                 Least Squares   F-statistic:                     39.71
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           2.32e-49
Time:                        14:38:28   Log-Likelihood:                -3847.1
No. Observations:                1000   AIC:                             7710.
Df Residuals:                     992   BIC:                             7749.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -4.8062      2.029     -2.369   

In [13]:
# Interact every regressor of the base wage equation with the south indicator.
# Columns are created in the order educ_south, black_south, female_south,
# black_female_south; black_female already holds black*female.
for col in ('educ', 'black', 'female', 'black_female'):
    cps4_small[f'{col}_south'] = cps4_small[col]*cps4_small['south']

# Fully interacted model: each base coefficient (and the intercept, via south)
# gets its own south-specific shift.
mdl_wage4 = smf.ols(
    formula='wage~educ+black+female+black_female+south+educ_south+black_south+female_south+black_female_south',
    data=cps4_small,
).fit()

print(mdl_wage4.summary())


                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.210
Model:                            OLS   Adj. R-squared:                  0.203
Method:                 Least Squares   F-statistic:                     29.26
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           2.00e-45
Time:                        14:38:28   Log-Likelihood:                -3852.6
No. Observations:                1000   AIC:                             7725.
Df Residuals:                     990   BIC:                             7774.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             -6.6056      2

In [14]:
# Sub-sample of observations with south == 0.
cps4_small_nonsouth = cps4_small[cps4_small['south']==0]
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
cps4_small_nonsouth.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 704 entries, 1 to 998
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   wage                704 non-null    float64
 1   educ                704 non-null    float64
 2   exper               704 non-null    float64
 3   hrswk               704 non-null    float64
 4   married             704 non-null    float64
 5   female              704 non-null    float64
 6   metro               704 non-null    float64
 7   midwest             704 non-null    float64
 8   south               704 non-null    float64
 9   west                704 non-null    float64
 10  black               704 non-null    float64
 11  asian               704 non-null    float64
 12  black_female        704 non-null    float64
 13  educ_south          704 non-null    float64
 14  black_south         704 non-null    float64
 15  female_south        704 non-null    float64
 16  black_fe

In [15]:
# Base wage equation estimated on the south == 0 sub-sample only.
mdl_wage5 = smf.ols(
    formula='wage~educ+black+female+black_female',
    data=cps4_small_nonsouth,
).fit()
print(mdl_wage5.summary())

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.221
Model:                            OLS   Adj. R-squared:                  0.217
Method:                 Least Squares   F-statistic:                     49.72
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           7.79e-37
Time:                        14:38:28   Log-Likelihood:                -2702.8
No. Observations:                 704   AIC:                             5416.
Df Residuals:                     699   BIC:                             5438.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -6.6056      2.302     -2.869   

In [16]:
# Sub-sample of observations with south == 1.
cps4_small_south = cps4_small[cps4_small['south']==1]
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
cps4_small_south.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 296 entries, 0 to 999
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   wage                296 non-null    float64
 1   educ                296 non-null    float64
 2   exper               296 non-null    float64
 3   hrswk               296 non-null    float64
 4   married             296 non-null    float64
 5   female              296 non-null    float64
 6   metro               296 non-null    float64
 7   midwest             296 non-null    float64
 8   south               296 non-null    float64
 9   west                296 non-null    float64
 10  black               296 non-null    float64
 11  asian               296 non-null    float64
 12  black_female        296 non-null    float64
 13  educ_south          296 non-null    float64
 14  black_south         296 non-null    float64
 15  female_south        296 non-null    float64
 16  black_fe

In [17]:
# Base wage equation estimated on the south == 1 sub-sample only.
mdl_wage6 = smf.ols(
    formula='wage~educ+black+female+black_female',
    data=cps4_small_south,
).fit()
print(mdl_wage6.summary())

                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.184
Model:                            OLS   Adj. R-squared:                  0.173
Method:                 Least Squares   F-statistic:                     16.43
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           3.79e-12
Time:                        14:38:28   Log-Likelihood:                -1149.4
No. Observations:                 296   AIC:                             2309.
Df Residuals:                     291   BIC:                             2327.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -2.6617      3.420     -0.778   

In [18]:
# Base wage equation on the full (pooled) sample, using the hand-built
# black_female column in place of the `black*female` formula shorthand.
mdl_wage7 = smf.ols(
    formula='wage~educ+black+female+black_female',
    data=cps4_small,
).fit()
print(mdl_wage7.summary())


                            OLS Regression Results                            
Dep. Variable:                   wage   R-squared:                       0.209
Model:                            OLS   Adj. R-squared:                  0.206
Method:                 Least Squares   F-statistic:                     65.67
Date:                Tue, 16 Feb 2021   Prob (F-statistic):           2.53e-49
Time:                        14:38:28   Log-Likelihood:                -3853.5
No. Observations:                1000   AIC:                             7717.
Df Residuals:                     995   BIC:                             7741.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -5.2812      1.900     -2.779   