In [2]:
import pandas as pd
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [3]:
# Read bike.csv into `df`; the following cells slice columns out of this frame.
df = pd.read_csv("bike.csv")
# Preview the first two rows.
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [4]:
# Label-based column slice: keeps every column from `season` through `casual`
# (both endpoints are included), which leaves `casual` as the last column.
df_sub = df.loc[:, slice("season", "casual")]
df_sub.head(2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual
0,1,0,0,1,9.84,14.395,81,0.0,3
1,1,0,0,1,9.02,13.635,80,0.0,8


In [5]:
# The formula is passed as a string such as "casual ~ season + holiday + workingday + ....",
# so it can be assembled with ordinary string operations.

In [6]:
# A formula is just a string, so it can be built by plain concatenation.
"casual ~ " + "season + holiday"

'casual ~ season + holiday'

In [8]:
# Joining every column of df_sub also puts the response `casual` on the
# right-hand side (see the output); the next cell excludes the last column.
"casual ~ " + " + ".join(df_sub.columns)

'casual ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed + casual'

In [9]:
# Leave out the last column (`casual`, the response) when building the right-hand side.
f"casual ~ {' + '.join(df_sub.columns[:-1])}"

'casual ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed'

In [11]:
# Right-hand side = every column of df_sub except the last one (`casual`, the response).
formula = f"casual ~ {' + '.join(df_sub.columns[:-1])}"
# Ask patsy for the response and design matrices as DataFrames.
y, X = dmatrices(formula, return_type="dataframe", data=df_sub)

In [12]:
# Response matrix returned by dmatrices: a single `casual` column.
y.head(2)

Unnamed: 0,casual
0,3.0
1,8.0


In [13]:
# Design matrix returned by dmatrices: an Intercept column of ones precedes the predictors.
X.head(2)

Unnamed: 0,Intercept,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1.0,1.0,0.0,0.0,1.0,9.84,14.395,81.0,0.0
1,1.0,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0


In [16]:
# One VIF per design-matrix column (the Intercept column included), paired
# with the column name so the table can be read directly.
df_vif = pd.DataFrame(
    {
        "Colname": X.columns,
        "VIF": [vif(X.values, col_idx) for col_idx in range(len(X.columns))],
    }
)
df_vif

Unnamed: 0,Colname,VIF
0,Intercept,34.029472
1,season,1.137211
2,holiday,1.069731
3,workingday,1.071196
4,weather,1.23615
5,temp,35.516012
6,atemp,35.550831
7,humidity,1.425034
8,windspeed,1.195704


In [17]:
# `temp` and `atemp` both showed a VIF of about 35 in the table above, so keep
# the same `season`..`casual` span of columns but leave `atemp` out.
df_sub = df.loc[:, slice("season", "casual")].drop(columns="atemp")
df_sub.head(2)

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,casual
0,1,0,0,1,9.84,81,0.0,3
1,1,0,0,1,9.02,80,0.0,8


In [18]:
# Rebuild the formula and design matrices from the current df_sub
# (all columns but the last, `casual`, go on the right-hand side).
formula = f"casual ~ {' + '.join(df_sub.columns[:-1])}"
y, X = dmatrices(formula, return_type="dataframe", data=df_sub)

# Recompute the VIF of every design-matrix column, Intercept included.
df_vif = pd.DataFrame(
    {
        "Colname": X.columns,
        "VIF": [vif(X.values, col_idx) for col_idx in range(len(X.columns))],
    }
)
df_vif

Unnamed: 0,Colname,VIF
0,Intercept,31.375118
1,season,1.136866
2,holiday,1.068094
3,workingday,1.070025
4,weather,1.235251
5,temp,1.089028
6,humidity,1.421256
7,windspeed,1.14965


In [20]:
# One-hot encode `season`. drop_first=True omits the first level, so only
# season_2, season_3 and season_4 appear in the result (see the preview).
df_dum = pd.get_dummies(df, columns=["season"], drop_first = True)
df_dum.head(2)

Unnamed: 0,datetime,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,season_2,season_3,season_4
0,2011-01-01 00:00:00,0,0,1,9.84,14.395,81,0.0,3,13,16,0,0,0
1,2011-01-01 01:00:00,0,0,1,9.02,13.635,80,0.0,8,32,40,0,0,0


In [21]:
# With price as the dependent variable and the remaining numeric variables as
# independent variables, how many variables are judged to have a multicollinearity problem?

In [22]:
# Among the VIF values computed with variance_inflation_factor() in this exercise,
# the Intercept is the constant term, so it is ignored even if its VIF exceeds 10.

In [24]:
# Read diamonds.csv; note that this rebinds `df`, which previously held the bike data.
df = pd.read_csv("diamonds.csv")
# Preview the first two rows.
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [25]:
# Response (`price`) first, then the numeric predictors. Later cells rely on
# this order (they take df_sub.columns[1:] as the right-hand side).
# Columns are selected by name rather than by position, so the selection does
# not silently pick the wrong columns if the CSV column order ever changes.
df_sub = df.loc[:, ["price", "carat", "depth", "table", "x", "y", "z"]]
df_sub.head(2)

Unnamed: 0,price,carat,depth,table,x,y,z
0,326,0.23,61.5,55.0,3.95,3.98,2.43
1,326,0.21,59.8,61.0,3.89,3.84,2.31


In [27]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [28]:
# df_sub's first column is the response (`price`); everything after it is a predictor.
formula = f"price ~ {' + '.join(df_sub.columns[1:])}"
# Ask patsy for the response and design matrices as DataFrames.
y, X = dmatrices(formula, return_type="dataframe", data=df_sub)

In [29]:
# One VIF per design-matrix column (the Intercept column included), paired
# with the column name.
df_vif = pd.DataFrame(
    {
        "vars": X.columns,
        "VIF": [vif(X.values, col_idx) for col_idx in range(len(X.columns))],
    }
)
df_vif

Unnamed: 0,vars,VIF
0,Intercept,4821.69635
1,carat,21.602712
2,depth,1.49659
3,table,1.143225
4,x,56.187704
5,y,20.454295
6,z,23.530049


In [30]:
# Using a linear regression model with price as the dependent variable and
# carat and depth as the independent variables, what is the predicted price
# of a diamond whose carat is 1, depth is 60, and table is 55?

In [31]:
# Build a new DataFrame from the conditions given in the problem,
# then pass it to the model object's predict() method and check the result.

In [33]:
# Re-read diamonds.csv for this exercise (the same file is also loaded in an earlier cell).
df = pd.read_csv("diamonds.csv")
# Preview the first two rows.
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [34]:
from statsmodels.formula.api import ols

In [35]:
# Ordinary least squares fit of price on carat and depth, using the full diamonds frame.
model = ols("price ~ carat + depth", data=df).fit()
# Show the regression report (coefficients, R-squared, diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.851
Model:,OLS,Adj. R-squared:,0.851
Method:,Least Squares,F-statistic:,153600.0
Date:,"Sun, 05 Mar 2023",Prob (F-statistic):,0.0
Time:,15:08:57,Log-Likelihood:,-472490.0
No. Observations:,53940,AIC:,945000.0
Df Residuals:,53937,BIC:,945000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4045.3332,286.205,14.134,0.000,3484.368,4606.298
carat,7765.1407,14.009,554.282,0.000,7737.682,7792.599
depth,-102.1653,4.635,-22.041,0.000,-111.251,-93.080

0,1,2,3
Omnibus:,14148.858,Durbin-Watson:,0.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148236.675
Skew:,0.962,Prob(JB):,0.0
Kurtosis:,10.89,Cond. No.,2660.0


In [36]:
# Single-row frame describing the diamond from the exercise.
# `table` is included because the exercise states it, although the formula
# price ~ carat + depth does not reference it.
df_test = pd.DataFrame([[1, 60, 55]], columns=["carat", "depth", "table"])
df_test

Unnamed: 0,carat,depth,table
0,1,60,55


In [37]:
# Predict the price for the single row in df_test with the fitted model.
# The result matches Intercept + carat coefficient * 1 + depth coefficient * 60
# from the summary table, up to rounding.
model.predict(df_test)

0    5680.554517
dtype: float64

In [38]:
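# Using a linear regression model with price as the dependent variable and
# carat, color, and depth as the independent variables, what is the predicted
# price of a diamond whose carat is 1, depth is 50, and color is E?
# When creating the dummy variables, drop the last one.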

In [39]:
# When supplying data for prediction after fitting, the dummy-variable encoding must match
# the one used to train the model. That is, the DataFrame passed to predict() has 8 variables, not 3.

In [40]:
# Re-read diamonds.csv for this exercise (the same file is also loaded in earlier cells).
df = pd.read_csv("diamonds.csv")
# Preview the first two rows.
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [42]:
# Keep the response (`price`) first, followed by the three predictors used in this exercise.
df_sub = df[["price", "carat", "color", "depth"]]
df_sub.head(2)

Unnamed: 0,price,carat,color,depth
0,326,0.23,E,61.5
1,326,0.21,E,59.8


In [43]:
# One-hot encode `color`. drop_first=True omits the first level, leaving the six
# dummy columns color_E ... color_J shown in the preview.
# NOTE(review): the exercise text asks to drop the *last* dummy variable, whereas
# drop_first=True drops the first one — confirm which encoding is intended.
df_dum = pd.get_dummies(df_sub, columns = ["color"], drop_first=True)
df_dum.head(2)

Unnamed: 0,price,carat,depth,color_E,color_F,color_G,color_H,color_I,color_J
0,326,0.23,61.5,1,0,0,0,0,0
1,326,0.21,59.8,1,0,0,0,0,0


In [44]:
from statsmodels.formula.api import ols

In [45]:
# df_dum's first column is the response (`price`); every remaining column
# (carat, depth and the color dummies) becomes a term on the right-hand side.
model = ols(f"price ~ {' + '.join(df_dum.columns[1:])}", data=df_dum).fit()
# Show the regression report (coefficients, R-squared, diagnostics).
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.865
Model:,OLS,Adj. R-squared:,0.865
Method:,Least Squares,F-statistic:,43190.0
Date:,"Sun, 05 Mar 2023",Prob (F-statistic):,0.0
Time:,15:17:50,Log-Likelihood:,-469770.0
No. Observations:,53940,AIC:,939600.0
Df Residuals:,53931,BIC:,939600.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3399.1860,272.825,12.459,0.000,2864.447,3933.925
carat,8070.6389,13.988,576.983,0.000,8043.223,8098.055
depth,-89.7605,4.412,-20.344,0.000,-98.408,-81.113
color_E,-97.0161,23.164,-4.188,0.000,-142.417,-51.615
color_F,-80.8972,23.316,-3.470,0.001,-126.596,-35.199
color_G,-80.6971,22.585,-3.573,0.000,-124.963,-36.431
color_H,-720.8099,24.268,-29.703,0.000,-768.374,-673.245
color_I,-1043.9064,27.213,-38.361,0.000,-1097.243,-990.570
color_J,-1899.5248,33.657,-56.438,0.000,-1965.493,-1833.557

0,1,2,3
Omnibus:,12411.519,Durbin-Watson:,0.953
Prob(Omnibus):,0.0,Jarque-Bera (JB):,159901.705
Skew:,0.746,Prob(JB):,0.0
Kurtosis:,11.302,Cond. No.,2670.0
