In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
from statsmodels.formula.api import ols   #导入模块
import statsmodels.stats.outliers_influence as smo
import xlrd    # 为了excel文件导入成功必须的包

In [2]:
df = pd.read_excel('NPGR.xls')
#df

In [3]:
#进行OLS分析
model = ols('常住人口自然增长率~商品房平均销售价格+常住人口城镇化率+单位人口医疗卫生机构床位数+人均GDP+工业增加值',data = df).fit()
#model = ols('常住人口自然增长率~np.log(常住人口城镇化率)+np.log(单位人口医疗卫生机构床位数)',data = df).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:              常住人口自然增长率   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.719
Method:                 Least Squares   F-statistic:                     10.20
Date:                Tue, 28 Dec 2021   Prob (F-statistic):           0.000383
Time:                        22:22:52   Log-Likelihood:                -17.160
No. Observations:                  19   AIC:                             46.32
Df Residuals:                      13   BIC:                             51.99
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -6.3611     10.327     -0.616

  "anyway, n=%i" % int(n))


In [4]:
#对人均GDP和工业增加值取对数，再进行OLS分析
model1 = ols('常住人口自然增长率~商品房平均销售价格+常住人口城镇化率+单位人口医疗卫生机构床位数+np.log(人均GDP)+np.log(工业增加值)',data = df).fit()
#model1 = ols('常住人口自然增长率~np.log(常住人口城镇化率)+np.log(单位人口医疗卫生机构床位数)',data = df).fit()
model1_summary = model1.summary()
print(model1_summary)

                            OLS Regression Results                            
Dep. Variable:              常住人口自然增长率   R-squared:                       0.764
Model:                            OLS   Adj. R-squared:                  0.673
Method:                 Least Squares   F-statistic:                     8.419
Date:                Tue, 28 Dec 2021   Prob (F-statistic):           0.000964
Time:                        22:22:52   Log-Likelihood:                -18.586
No. Observations:                  19   AIC:                             49.17
Df Residuals:                      13   BIC:                             54.84
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       -20.3664     16.525     -1.232

In [5]:
#计算相关系数矩阵
df2 = df
df2['人均GDP'] = np.log(df['人均GDP'])
df2['工业增加值'] = np.log(df['工业增加值'])
df2.corr()

Unnamed: 0,指标,常住人口自然增长率,商品房平均销售价格,常住人口城镇化率,单位人口医疗卫生机构床位数,人均GDP,工业增加值
指标,1.0,-0.494113,0.985343,0.999627,0.984232,0.98729,0.970229
常住人口自然增长率,-0.494113,1.0,-0.587776,-0.478779,-0.495191,-0.417248,-0.388095
商品房平均销售价格,0.985343,-0.587776,1.0,0.984174,0.987634,0.950828,0.923765
常住人口城镇化率,0.999627,-0.478779,0.984174,1.0,0.986044,0.98689,0.969231
单位人口医疗卫生机构床位数,0.984232,-0.495191,0.987634,0.986044,1.0,0.949661,0.918664
人均GDP,0.98729,-0.417248,0.950828,0.98689,0.949661,1.0,0.995926
工业增加值,0.970229,-0.388095,0.923765,0.969231,0.918664,0.995926,1.0


In [6]:
# 计算方差扩大因子
df3 = df2
df3 = df3.drop(['指标','常住人口自然增长率'], axis=1)  # 删除多余变量，包括因变量
df3['const'] = 1  # 添加常数项
x = np.array(df3)
vif_list = [smo.variance_inflation_factor(x, i) for i in range(x.shape[1])]
df_vif = pd.DataFrame({'variable': list(df3.columns), 'vif': vif_list})
df_vif = df_vif[~(df_vif['variable'] == 'const')]   # 删除常数项
print(df_vif)

        variable          vif
0      商品房平均销售价格    67.914886
1       常住人口城镇化率   900.559149
2  单位人口医疗卫生机构床位数   241.703541
3          人均GDP  3146.529508
4          工业增加值  1651.020245


In [7]:
#white检验异方差性
name = ["Lagrange multiplier statistic", "p-value", "f-value", "f p-value"]
test = sms.het_white(model1.resid, model1.model.exog)
lzip(name, test)

[('Lagrange multiplier statistic', 19.0),
 ('p-value', 0.39182348254493976),
 ('f-value', nan),
 ('f p-value', nan)]

In [8]:
#解决多重共线性
model2 = ols('常住人口自然增长率~商品房平均销售价格+常住人口城镇化率+单位人口医疗卫生机构床位数',data = df2).fit()
model2_summary = model2.summary()
print(model2_summary)

                            OLS Regression Results                            
Dep. Variable:              常住人口自然增长率   R-squared:                       0.753
Model:                            OLS   Adj. R-squared:                  0.703
Method:                 Least Squares   F-statistic:                     15.22
Date:                Tue, 28 Dec 2021   Prob (F-statistic):           8.07e-05
Time:                        22:22:52   Log-Likelihood:                -19.034
No. Observations:                  19   AIC:                             46.07
Df Residuals:                      15   BIC:                             49.85
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -8.0321      4.564     -1.760

  "anyway, n=%i" % int(n))


In [9]:
# 计算方差扩大因子
df4 = df2
df4 = df4.drop(['指标','常住人口自然增长率','人均GDP','工业增加值','常住人口城镇化率'], axis=1)  # 删除多余变量，包括因变量
#df4['常住人口城镇化率'] = np.log(df4['常住人口城镇化率'])
df4['商品房平均销售价格'] = np.log(df4['商品房平均销售价格'])
#df4['单位人口医疗卫生机构床位数'] = np.log(df4['单位人口医疗卫生机构床位数'])
df4['const'] = 1  # 添加常数项
x = np.array(df4)
vif_list = [smo.variance_inflation_factor(x, i) for i in range(x.shape[1])]
df_vif = pd.DataFrame({'variable': list(df4.columns), 'vif': vif_list})
df_vif = df_vif[~(df_vif['variable'] == 'const')]   # 删除常数项
print(df_vif)

        variable        vif
0      商品房平均销售价格  14.496854
1  单位人口医疗卫生机构床位数  14.496854
