In [2]:
import pandas as pd
import numpy as np

# 모평균 비교에 관한 가설검정: t-test

In [90]:
from scipy.stats import ttest_1samp, ttest_rel, ttest_ind
import math

In [56]:
# Q1
df = pd.read_csv('bike.csv')
stat, p = ttest_1samp(df['temp'], popmean=20)
round(p, 3)

0.002

In [88]:
# Q2
df = pd.read_csv('bike.csv')
df_1 = df[df['datetime'].str[:7]=='2011-01']
stat, p = ttest_rel(df_1['casual'], df_1['registered'])
math.trunc(abs(stat))

21

In [89]:
# Q3
df = pd.read_csv('bike.csv')
df['datetime'] = pd.to_datetime(df['datetime'])
df['day'] = df['datetime'].dt.weekday
df['weekend'] = (df['day']>=5) + 0 # 5,6이 주말
stat, p = ttest_ind(df.loc[df['weekend']==1, 'registered'],
                    df.loc[df['weekend']==0, 'registered'])
math.trunc(abs(stat))

12

# 모평균 비교에 관한 가설검정: One way ANOVA

In [97]:
from scipy.stats import f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [103]:
# Q2
df = pd.read_csv('bike.csv')
lm = ols('temp~C(season)', df).fit()
anova_lm(lm)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,412885.270005,137628.423335,6040.687453,0.0
Residual,10882.0,247930.804947,22.78357,,


In [109]:
# Q3
df = pd.read_csv('bike.csv')
df['datetime'] = pd.to_datetime(df['datetime'])
df['day'] = df['datetime'].dt.weekday
print(pairwise_tukeyhsd(df['registered'], df['day']))

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
     0      1   6.1979    0.9  -9.7188  22.1146  False
     0      2    5.343    0.9 -10.5427  21.2287  False
     0      3  12.7424 0.2132  -3.1383   28.623  False
     0      4   6.2956    0.9  -9.6471  22.2384  False
     0      5 -27.5063  0.001 -43.3091 -11.7036   True
     0      6 -36.7583  0.001 -52.5734 -20.9431   True
     1      2  -0.8549    0.9 -16.7716  15.0618  False
     1      3   6.5445 0.8863  -9.3671  22.4561  False
     1      4   0.0977    0.9 -15.8759  16.0713  False
     1      5 -33.7042  0.001 -49.5381 -17.8704   True
     1      6 -42.9562  0.001 -58.8024 -27.1099   True
     2      3   7.3994 0.7916  -8.4813    23.28  False
     2      4   0.9526    0.9 -14.9901  16.8954  False
     2      5 -32.8493  0.001 -48.6521 -17.0466   True
     2      6 -42.1013  0.001 -57.9164 -26.2861   True
     3    

# 모분산 비교에 관한 가설검정: 등분산 검정(F-test of equality of variances)

In [3]:
from scipy.stats import f, bartlett, levene

In [25]:
# Q1
df = pd.read_csv('financial_info_10k_persons.csv')
df['trans_once'] = df['Total_trans_amt'] / df['Total_trans_cnt']
ser_m = df.loc[df['Gender']=='M', 'trans_once']
ser_f = df.loc[df['Gender']=='F', 'trans_once']
F = ser_m.var() / ser_f.var() # F검정통계량
print(F)

1.6665446172570928


In [26]:
# Q2
df = pd.read_csv('financial_info_10k_persons.csv')
df['trans_once'] = df['Total_trans_amt'] / df['Total_trans_cnt']
df['Gen'] = (df['Age']//10) * 10
stat, p = bartlett(df.loc[df['Gen']==50, 'trans_once'],
                   df.loc[df['Gen']==60, 'trans_once'],
                   df.loc[df['Gen']==70, 'trans_once'])
print(p)

0.004109245841612487


In [35]:
# Q3
df = pd.read_csv('financial_info_10k_persons.csv')
df['trans_once'] = df['Total_trans_amt'] / df['Total_trans_cnt']
df_sub = df.loc[(df['Dependent_cnt']==0)&(df['Gender']=='M'),:]
stat, p = levene(df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[0],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[1],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[2],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[3],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[4],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[5],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[6],'trans_once'])
print(p)

0.5070685402777693


# 범주형 변수 간의 독립성 검정(Chi-squared test)

In [63]:
from scipy.stats import chi2_contingency

In [64]:
# Q1
df = pd.read_csv('financial_info_10k_persons.csv')
df_sub = df.loc[df['Edu_level']=='High School',]
stat, p, dof, exp = chi2_contingency(pd.crosstab(df_sub['Gender'], df_sub['is_attrited']),
                                     correction=False)
print(round(p,4))

0.0461


In [65]:
# Q2
df = pd.read_csv('financial_info_10k_persons.csv')
stat, p, dof, exp = chi2_contingency(pd.crosstab(df['Gender'], df['Card']))
print(round(stat,3))

66.457


In [66]:
# Q3
df = pd.read_csv('financial_info_10k_persons.csv')
df['y'] = (df['Inactive_last_12m']>=3) + 0
chi2_contingency(pd.crosstab(df['y'], df['is_attrited']))

(210.74834292621756,
 9.431916381955241e-48,
 1,
 array([[4606.4112,  869.5888],
        [3805.5888,  718.4112]]))

# 상관분석

In [67]:
from scipy.stats import pearsonr, spearmanr, kendalltau

In [78]:
# Q1
df = pd.read_csv('bike.csv')
round(df[['temp', 'atemp', 'humidity', 'casual']].corr().min().min(), 2)

-0.35

In [94]:
# Q2
df = pd.read_csv('bike.csv')
round(df[['season', 'atemp', 'casual']].groupby('season').corr(), 3) # 1:봄, 2:여름, 3:가을, 4:겨울

Unnamed: 0_level_0,Unnamed: 1_level_0,atemp,casual
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,atemp,1.0,0.478
1,casual,0.478,1.0
2,atemp,1.0,0.378
2,casual,0.378,1.0
3,atemp,1.0,0.381
3,casual,0.381,1.0
4,atemp,1.0,0.444
4,casual,0.444,1.0


In [123]:
# Q3
df = pd.read_csv('bike.csv')
df['is_sunny'] = (df['weather']==1) + 0
df_corr = df[['temp', 'casual', 'is_sunny']].groupby('is_sunny').corr()
abs(df_corr.iloc[1,0] - df_corr.iloc[3,0]).round(3)

0.025

# 단순 회귀분석(Simple Linear Regression)

In [154]:
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [155]:
# Q1
df = pd.read_csv('bike.csv')
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)
model = ols('registered~temp', df_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Sun, 29 May 2022",Prob (F-statistic):,1.92e-187
Time:,21:14:36,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


In [156]:
# Q2
df = pd.read_csv('bike.csv')
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)
model = ols('casual~atemp', df_train).fit()
pred = model.predict(df_test)
(mean_squared_error(y_true=df_test['casual'], y_pred=pred)**0.5).round(1)

44.5

In [169]:
# Q3
df = pd.read_csv('bike.csv')

# 여름
df_2 = df[df['season']==2]
df_train_2, df_test_2 = train_test_split(df_2, train_size=0.7, random_state=123)
model_2 = ols('casual~atemp', df_train_2).fit()
pred_2 = model.predict(df_test_2)
rmse_2 = mean_squared_error(y_true=df_test_2['casual'], y_pred=pred_2)**0.5

# 겨울
df_4 = df[df['season']==4]
df_train_4, df_test_4 = train_test_split(df_4, train_size=0.7, random_state=123)
model_4 = ols('casual~atemp', df_train_4).fit()
pred_4 = model.predict(df_test_4)
rmse_4 = mean_squared_error(y_true=df_test_4['casual'], y_pred=pred_4)**0.5

# 두 계절의 RMSE 차이
abs(rmse_2 - rmse_4).round(1)

8.6

# 다중 회귀분석(Multiple Linear Regression)

In [177]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [206]:
# Q1
df = pd.read_csv('diamonds.csv')
df = df.iloc[:, [6, 0, 4, 5, 7, 8, 9]]

formula = 'price ~ ' + '+'.join(df.columns[1:])
y, X = dmatrices(formula, df, return_type='dataframe')

df_vif = pd.DataFrame()
df_vif['colnames'] = X.columns
df_vif['VIF'] = [vif(X.values, i) for i in range(len(X.columns))]
df_vif

Unnamed: 0,colnames,VIF
0,Intercept,4821.69635
1,carat,21.602712
2,depth,1.49659
3,table,1.143225
4,x,56.187704
5,y,20.454295
6,z,23.530049


In [226]:
# Q2
df = pd.read_csv('diamonds.csv')
formula = 'price ~ carat + depth'
model = ols(formula, df).fit()
df_test = pd.DataFrame({'carat':[1], 'depth':[60], 'table':[55]})
model.predict(df_test).round()

0    5681.0
dtype: float64

In [227]:
# Q3
df = pd.read_csv('diamonds.csv')
formula = 'price ~ carat + C(color) + depth'
model = ols(formula, df).fit()
df_test = pd.DataFrame({'carat':[1], 'depth':[50], 'color':['E']})
model.predict(df_test).round()

0    6885.0
dtype: float64

# 분류: 로지스틱 회귀분석(Logistic Regression)

In [254]:
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [245]:
# Q1
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

model = Logit(endog=df_train['Outcome'],
              exog=df_train.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']]).fit()

pred = model.predict(df_test.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']])
pred_class = (pred>0.5) + 0

accuracy_score(y_pred=pred_class, y_true=df_test['Outcome']).round(2)

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


0.7

In [252]:
# Q2
df = pd.read_csv('diabetes.csv')

model = Logit(endog=df['Outcome'],
             exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()

np.exp(model.params).round(2) # odds ratio

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


Glucose    1.01
BMI        0.96
Age        0.99
dtype: float64

In [257]:
# Q3
df = pd.read_csv('diabetes.csv')

model = Logit(endog=df['Outcome'],
             exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()

pred = model.predict(df.loc[:, ['Glucose', 'BMI', 'Age']])

roc_auc_score(y_true=df['Outcome'], y_score=pred).round(2)

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


0.54

# 의사결정나무 모델: 분류 및 회귀나무

In [258]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [267]:
# Q1
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

model = DecisionTreeClassifier(random_state=123)
model.fit(y = df_train['Outcome'],
         X = df_train.loc[:, ['Glucose', 'BloodPressure', 'Pregnancies']])

pred = model.predict(df_test.loc[:, ['Glucose', 'BloodPressure', 'Pregnancies']])

accuracy_score(y_pred = pred, y_true = df_test['Outcome']).round(2)

0.63

In [272]:
# Q2
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

model = DecisionTreeRegressor(random_state=123)
model.fit(y = df_train['BMI'],
         X = df_train.loc[:, ['Glucose', 'BloodPressure', 'SkinThickness']])

pred = model.predict(df_test.loc[:, ['Glucose', 'BloodPressure', 'SkinThickness']])

(mean_squared_error(y_true = df_test['BMI'], y_pred = pred)**0.5).round(1)

9.9

In [281]:
# Q3
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.7, random_state=345)

cols = ['Glucose', 'BloodPressure', 'Pregnancies', 'BMI', 'Age']
depth_list = [3, 4, 5, 6]

accs = []
for depth in depth_list:
    model = DecisionTreeClassifier(max_depth=depth, random_state=345)
    model.fit(y = df_train['Outcome'],
             X = df_train.loc[:, cols])
    pred = model.predict(df_test.loc[:, cols])
    accs.append(accuracy_score(y_pred=pred, y_true=df_test['Outcome']).round(2))

df_acc = pd.DataFrame({'depth':depth_list, 'accuracy':accs})
df_acc

Unnamed: 0,depth,accuracy
0,3,0.77
1,4,0.76
2,5,0.76
3,6,0.77
