In [1]:
import pandas as pd
import numpy as np

## 모평균 비교에 관한 가설검정: t-test

In [2]:
from scipy.stats import ttest_1samp, ttest_rel, ttest_ind
import math

In [56]:
# Q1
df = pd.read_csv('bike.csv')
stat, p = ttest_1samp(df['temp'], popmean=20)
round(p, 3)

0.002

In [88]:
# Q2
df = pd.read_csv('bike.csv')
df_1 = df[df['datetime'].str[:7]=='2011-01']
stat, p = ttest_rel(df_1['casual'], df_1['registered'])
math.trunc(abs(stat))

21

In [89]:
# Q3
df = pd.read_csv('bike.csv')
df['datetime'] = pd.to_datetime(df['datetime'])
df['day'] = df['datetime'].dt.weekday
df['weekend'] = (df['day']>=5) + 0 # 5,6이 주말
stat, p = ttest_ind(df.loc[df['weekend']==1, 'registered'],
                    df.loc[df['weekend']==0, 'registered'])
math.trunc(abs(stat))

12

## 모평균 비교에 관한 가설검정: One way ANOVA

In [3]:
from scipy.stats import f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [103]:
# Q2
df = pd.read_csv('bike.csv')
lm = ols('temp~C(season)', df).fit()
anova_lm(lm)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,412885.270005,137628.423335,6040.687453,0.0
Residual,10882.0,247930.804947,22.78357,,


In [109]:
# Q3
df = pd.read_csv('bike.csv')
df['datetime'] = pd.to_datetime(df['datetime'])
df['day'] = df['datetime'].dt.weekday
print(pairwise_tukeyhsd(df['registered'], df['day']))

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
     0      1   6.1979    0.9  -9.7188  22.1146  False
     0      2    5.343    0.9 -10.5427  21.2287  False
     0      3  12.7424 0.2132  -3.1383   28.623  False
     0      4   6.2956    0.9  -9.6471  22.2384  False
     0      5 -27.5063  0.001 -43.3091 -11.7036   True
     0      6 -36.7583  0.001 -52.5734 -20.9431   True
     1      2  -0.8549    0.9 -16.7716  15.0618  False
     1      3   6.5445 0.8863  -9.3671  22.4561  False
     1      4   0.0977    0.9 -15.8759  16.0713  False
     1      5 -33.7042  0.001 -49.5381 -17.8704   True
     1      6 -42.9562  0.001 -58.8024 -27.1099   True
     2      3   7.3994 0.7916  -8.4813    23.28  False
     2      4   0.9526    0.9 -14.9901  16.8954  False
     2      5 -32.8493  0.001 -48.6521 -17.0466   True
     2      6 -42.1013  0.001 -57.9164 -26.2861   True
     3    

## 모평균 비교에 관한 가설검정: Two way ANOVA

In [9]:
# Q1
df = pd.read_csv('bike.csv')
formula = 'registered ~ C(season) + C(holiday) + C(season):C(holiday)'
model = ols(formula, df).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,10990190.0,3663398.0,168.072567,1.646504e-106
C(holiday),1.0,137039.0,137039.0,6.287195,0.01217579
C(season):C(holiday),3.0,87364.84,29121.61,1.336067,0.2606397
Residual,10878.0,237102600.0,21796.53,,


In [14]:
# Q2
df = pd.read_csv('bike.csv')
formula = 'registered ~ C(season) + C(weather) + C(season):C(weather)'
model = ols(formula, df).fit()
anova_lm(model).round(4)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,10990190.0,3663398.0,170.4139,0.0
C(weather),3.0,3216611.0,1072204.0,49.8767,0.0
C(season):C(weather),9.0,376518.4,41835.37,1.9461,0.0414
Residual,10873.0,233737600.0,21497.06,,


In [32]:
# Q3
df = pd.read_csv('diabetes.csv')
df = df.loc[(df['Age']<70) & (df['BMI']!=0),:]
df['preg'] = (df['Pregnancies']>0) + 0
df['gen'] = df['Age']//10

formula = 'BMI ~ C(gen) + C(preg) + C(gen):C(preg)'
model = ols(formula, df).fit()
anova_lm(model).round(4)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(gen),4.0,850.9551,212.7388,4.7079,0.0009
C(preg),1.0,1253.8068,1253.8068,27.7467,0.0
C(gen):C(preg),4.0,321.2309,80.3077,1.7772,0.1315
Residual,744.0,33619.5232,45.1875,,


## 모분산 비교에 관한 가설검정: 등분산 검정(F-test of equality of variances)

In [4]:
from scipy.stats import f, bartlett, levene

In [25]:
# Q1
df = pd.read_csv('financial_info_10k_persons.csv')
df['trans_once'] = df['Total_trans_amt'] / df['Total_trans_cnt']
ser_m = df.loc[df['Gender']=='M', 'trans_once']
ser_f = df.loc[df['Gender']=='F', 'trans_once']
F = ser_m.var() / ser_f.var() # F검정통계량
print(F)

1.6665446172570928


In [26]:
# Q2
df = pd.read_csv('financial_info_10k_persons.csv')
df['trans_once'] = df['Total_trans_amt'] / df['Total_trans_cnt']
df['Gen'] = (df['Age']//10) * 10
stat, p = bartlett(df.loc[df['Gen']==50, 'trans_once'],
                   df.loc[df['Gen']==60, 'trans_once'],
                   df.loc[df['Gen']==70, 'trans_once'])
print(p)

0.004109245841612487


In [35]:
# Q3
df = pd.read_csv('financial_info_10k_persons.csv')
df['trans_once'] = df['Total_trans_amt'] / df['Total_trans_cnt']
df_sub = df.loc[(df['Dependent_cnt']==0)&(df['Gender']=='M'),:]
stat, p = levene(df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[0],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[1],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[2],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[3],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[4],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[5],'trans_once'],
                 df_sub.loc[df_sub['Edu_level']==df_sub['Edu_level'].unique()[6],'trans_once'])
print(p)

0.5070685402777693


## 범주형 변수 간의 독립성 검정(Chi-squared test)

In [5]:
from scipy.stats import chi2_contingency

In [64]:
# Q1
df = pd.read_csv('financial_info_10k_persons.csv')
df_sub = df.loc[df['Edu_level']=='High School',]
stat, p, dof, exp = chi2_contingency(pd.crosstab(df_sub['Gender'], df_sub['is_attrited']),
                                     correction=False)
print(round(p,4))

0.0461


In [65]:
# Q2
df = pd.read_csv('financial_info_10k_persons.csv')
stat, p, dof, exp = chi2_contingency(pd.crosstab(df['Gender'], df['Card']))
print(round(stat,3))

66.457


In [66]:
# Q3
df = pd.read_csv('financial_info_10k_persons.csv')
df['y'] = (df['Inactive_last_12m']>=3) + 0
chi2_contingency(pd.crosstab(df['y'], df['is_attrited']))

(210.74834292621756,
 9.431916381955241e-48,
 1,
 array([[4606.4112,  869.5888],
        [3805.5888,  718.4112]]))

## 시계열 분석

In [6]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [55]:
# Q2
df = pd.read_csv('seoul_subway.csv')
df_sub = df.loc[(df['노선명']=='6호선') & (df['역명']=='이태원'),:]
df_sub['ewma_01'] = df_sub['하차총승객수'].ewm(alpha=0.1).mean()
df_sub['ewma_09'] = df_sub['하차총승객수'].ewm(alpha=0.9).mean()

result_1 = df_sub.loc[df_sub['사용일자']==20201130, 'ewma_01']
result_9 = df_sub.loc[df_sub['사용일자']==20201130, 'ewma_09']
print((result_1 - result_9).abs())

216590    1263.737309
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['ewma_01'] = df_sub['하차총승객수'].ewm(alpha=0.1).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['ewma_09'] = df_sub['하차총승객수'].ewm(alpha=0.9).mean()


In [67]:
# Q3
df = pd.read_csv('seoul_subway.csv')
df_sub = df.loc[(df['노선명']=='3호선') & (df['역명']=='신사'),:]
df_sub['사용일자'] = pd.to_datetime(df_sub['사용일자'], format='%Y%m%d')
df_sub = df_sub.set_index('사용일자')

result = seasonal_decompose(df_sub['승차총승객수'], extrapolate_trend=1)
result.trend

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['사용일자'] = pd.to_datetime(df_sub['사용일자'], format='%Y%m%d')


사용일자
2019-12-01    32993.000000
2019-12-02    33074.857143
2019-12-03    33156.714286
2019-12-04    33238.571429
2019-12-05    33320.428571
                  ...     
2020-11-26    23306.571429
2020-11-27    22918.285714
2020-11-28    22202.571429
2020-11-29    21650.571429
2020-11-30    21098.571429
Name: trend, Length: 366, dtype: float64

## 상관분석

In [7]:
from scipy.stats import pearsonr, spearmanr, kendalltau

In [78]:
# Q1
df = pd.read_csv('bike.csv')
round(df[['temp', 'atemp', 'humidity', 'casual']].corr().min().min(), 2)

-0.35

In [94]:
# Q2
df = pd.read_csv('bike.csv')
round(df[['season', 'atemp', 'casual']].groupby('season').corr(), 3) # 1:봄, 2:여름, 3:가을, 4:겨울

Unnamed: 0_level_0,Unnamed: 1_level_0,atemp,casual
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,atemp,1.0,0.478
1,casual,0.478,1.0
2,atemp,1.0,0.378
2,casual,0.378,1.0
3,atemp,1.0,0.381
3,casual,0.381,1.0
4,atemp,1.0,0.444
4,casual,0.444,1.0


In [123]:
# Q3
df = pd.read_csv('bike.csv')
df['is_sunny'] = (df['weather']==1) + 0
df_corr = df[['temp', 'casual', 'is_sunny']].groupby('is_sunny').corr()
abs(df_corr.iloc[1,0] - df_corr.iloc[3,0]).round(3)

0.025

## 계층적 군집분석(Hierarchical Clustering)

In [8]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, dendrogram
from matplotlib import pyplot as plt

In [32]:
# Q1
df = pd.read_csv('diabetes.csv')
model = AgglomerativeClustering(n_clusters=4).fit(df.iloc[:, :-1])
df['cluster'] = model.labels_
df.groupby('cluster')['Insulin'].mean().reset_index()

Unnamed: 0,cluster,Insulin
0,0,207.018182
1,1,0.620779
2,2,548.833333
3,3,85.05


In [42]:
# Q2
df = pd.read_csv('diabetes.csv')
df_sub = df.loc[(df['Pregnancies']==0) & (df['BMI']>0), :'Age']
model = AgglomerativeClustering(n_clusters=6).fit(df_sub)
df_sub['cluster'] = model.labels_
df_sub['cluster'].value_counts()

0    44
1    33
3    13
4     9
5     6
2     3
Name: cluster, dtype: int64

In [62]:
# Q3
df = pd.read_csv('diabetes.csv')
df_sub = df.loc[(df['Pregnancies']==0) & (df['BMI']>0), :'Age']
model = AgglomerativeClustering(n_clusters=6).fit(df_sub)
df_sub['cluster'] = model.labels_

df_cl = df_sub.groupby('cluster').mean()
df_cl_t = df_cl.transpose()
df_cl_t['1_2_diff'] = df_cl_t[1] - df_cl_t[2]
((df_cl_t['1_2_diff']**2).sum())**0.5

466.248496093336

## 비계층적 군집분석(K-means Clustering)

In [9]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [96]:
# Q1
df = pd.read_csv('diabetes.csv')
df_sub = df.loc[df['BMI']!=0, :]
model = KMeans(n_clusters=4, random_state=123).fit(df_sub)
df_sub['cluster'] = model.labels_
df_sub['cluster'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster'] = model.labels_


0    407
2    212
3    114
1     24
Name: cluster, dtype: int64

In [97]:
df_sub.groupby('cluster')['Insulin'].mean()

cluster
0      4.103194
1    509.166667
2    102.674528
3    224.035088
Name: Insulin, dtype: float64

In [108]:
# Q2
df = pd.read_csv('diabetes.csv')
df_sub = df.loc[df['BMI']!=0, :]

nor_minmax = MinMaxScaler().fit(df_sub)
nor_minmax = nor_minmax.transform(df_sub)
df_sub_minmax = pd.DataFrame(nor_minmax, columns=df_sub.columns)

model = KMeans(n_clusters=4, random_state=123).fit(df_sub_minmax)
df_sub['cluster'] = model.labels_
df_sub['cluster'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster'] = model.labels_


0    361
1    135
2    131
3    130
Name: cluster, dtype: int64

In [107]:
df_sub.groupby('cluster')['Age'].mean()

cluster
0    25.667590
1    29.977778
2    44.297710
3    46.753846
Name: Age, dtype: float64

In [122]:
# Q3
df = pd.read_csv('diabetes.csv')
df_sub = df.loc[df['BMI']!=0, :]
model = KMeans(n_clusters=3, random_state=123).fit(df_sub)
df_sub['cluster'] = model.labels_

df_cl = df_sub.groupby('cluster').mean()
df_cl = df_cl.transpose()

print((((df_cl[0] - df_cl[1])**2).sum())**0.5)
print((((df_cl[0] - df_cl[2])**2).sum())**0.5)
print((((df_cl[1] - df_cl[2])**2).sum())**0.5)

429.2441931088845
283.40599977473795
146.33847909815478


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['cluster'] = model.labels_


## 단순 회귀분석(Simple Linear Regression)

In [10]:
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [155]:
# Q1
df = pd.read_csv('bike.csv')
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)
model = ols('registered~temp', df_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Sun, 29 May 2022",Prob (F-statistic):,1.92e-187
Time:,21:14:36,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


In [156]:
# Q2
df = pd.read_csv('bike.csv')
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)
model = ols('casual~atemp', df_train).fit()
pred = model.predict(df_test)
(mean_squared_error(y_true=df_test['casual'], y_pred=pred)**0.5).round(1)

44.5

In [169]:
# Q3
df = pd.read_csv('bike.csv')

# 여름
df_2 = df[df['season']==2]
df_train_2, df_test_2 = train_test_split(df_2, train_size=0.7, random_state=123)
model_2 = ols('casual~atemp', df_train_2).fit()
pred_2 = model.predict(df_test_2)
rmse_2 = mean_squared_error(y_true=df_test_2['casual'], y_pred=pred_2)**0.5

# 겨울
df_4 = df[df['season']==4]
df_train_4, df_test_4 = train_test_split(df_4, train_size=0.7, random_state=123)
model_4 = ols('casual~atemp', df_train_4).fit()
pred_4 = model.predict(df_test_4)
rmse_4 = mean_squared_error(y_true=df_test_4['casual'], y_pred=pred_4)**0.5

# 두 계절의 RMSE 차이
abs(rmse_2 - rmse_4).round(1)

8.6

## 다중 회귀분석(Multiple Linear Regression)

In [11]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [206]:
# Q1
df = pd.read_csv('diamonds.csv')
df = df.iloc[:, [6, 0, 4, 5, 7, 8, 9]]

formula = 'price ~ ' + '+'.join(df.columns[1:])
y, X = dmatrices(formula, df, return_type='dataframe')

df_vif = pd.DataFrame()
df_vif['colnames'] = X.columns
df_vif['VIF'] = [vif(X.values, i) for i in range(len(X.columns))]
df_vif

Unnamed: 0,colnames,VIF
0,Intercept,4821.69635
1,carat,21.602712
2,depth,1.49659
3,table,1.143225
4,x,56.187704
5,y,20.454295
6,z,23.530049


In [226]:
# Q2
df = pd.read_csv('diamonds.csv')
formula = 'price ~ carat + depth'
model = ols(formula, df).fit()
df_test = pd.DataFrame({'carat':[1], 'depth':[60], 'table':[55]})
model.predict(df_test).round()

0    5681.0
dtype: float64

In [227]:
# Q3
df = pd.read_csv('diamonds.csv')
formula = 'price ~ carat + C(color) + depth'
model = ols(formula, df).fit()
df_test = pd.DataFrame({'carat':[1], 'depth':[50], 'color':['E']})
model.predict(df_test).round()

0    6885.0
dtype: float64

## 분류: 로지스틱 회귀분석(Logistic Regression)

In [12]:
from statsmodels.api import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [245]:
# Q1
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

model = Logit(endog=df_train['Outcome'],
              exog=df_train.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']]).fit()

pred = model.predict(df_test.loc[:, ['BloodPressure', 'Glucose', 'BMI', 'Insulin']])
pred_class = (pred>0.5) + 0

accuracy_score(y_pred=pred_class, y_true=df_test['Outcome']).round(2)

Optimization terminated successfully.
         Current function value: 0.626579
         Iterations 5


0.7

In [252]:
# Q2
df = pd.read_csv('diabetes.csv')

model = Logit(endog=df['Outcome'],
             exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()

np.exp(model.params).round(2) # odds ratio

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


Glucose    1.01
BMI        0.96
Age        0.99
dtype: float64

In [257]:
# Q3
df = pd.read_csv('diabetes.csv')

model = Logit(endog=df['Outcome'],
             exog=df.loc[:, ['Glucose', 'BMI', 'Age']]).fit()

pred = model.predict(df.loc[:, ['Glucose', 'BMI', 'Age']])

roc_auc_score(y_true=df['Outcome'], y_score=pred).round(2)

Optimization terminated successfully.
         Current function value: 0.656276
         Iterations 4


0.54

## 분류: 나이브 베이즈(Naïve Bayes)

In [65]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [66]:
# Q1
df = pd.read_csv('diabetes.csv')
df_sub = df.loc[df['BMI']>0, ]

df_sub['Outcome'].value_counts(normalize=True)

0    0.648613
1    0.351387
Name: Outcome, dtype: float64

In [67]:
# Q2
df = pd.read_csv('diabetes.csv')

model = GaussianNB().fit(X = df[['Glucose', 'BloodPressure', 'Age']],
                         y = df['Outcome'])

pred = model.predict_proba(X = df[['Glucose', 'BloodPressure', 'Age']])
pred = pred[:, 1]
pred_class = (pred>0.5) + 0

accuracy_score(y_pred = pred_class, y_true= df['Outcome']).round(2)

0.76

In [83]:
# Q3
df = pd.read_csv('diabetes.csv')
df['is_preg'] = (df['Pregnancies']>0) + 0
df['gen'] = (df['Age']//10)*10
df = df.loc[df['BMI']>0, ]
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

# 나이브 베이즈
model_nb = GaussianNB().fit(X = df_train[['is_preg', 'gen', 'BMI', 'Glucose']],
                            y = df_train['Outcome'])
pred_nb = model_nb.predict_proba(df_test[['is_preg', 'gen', 'BMI', 'Glucose']])
pred_nb = pred_nb[:, 1]
pred_class_nb = (pred_nb>0.5) + 0
print('나이브 베이즈 모델의 정확도는', accuracy_score(y_pred=pred_class_nb, y_true=df_test['Outcome']).round(2))

# 로지스틱 회귀
model_logistic = LogisticRegression().fit(X = df_train[['is_preg', 'gen', 'BMI', 'Glucose']],
                                          y = df_train['Outcome'])
pred_logistic = model_logistic.predict(df_test[['is_preg', 'gen', 'BMI', 'Glucose']])
print('로지스틱 회귀 모델의 정확도는', accuracy_score(y_pred=pred_logistic, y_true=df_test['Outcome']).round(2))

나이브 베이즈 모델의 정확도는 0.8
로지스틱 회귀 모델의 정확도는 0.83


## KNN(K-Nearest Neighbor)

In [38]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [39]:
# Q1
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.7, random_state=123)

model = KNeighborsClassifier()
model.fit(X = df_train[['Pregnancies', 'Glucose', 'BloodPressure']],
          y = df_train['Outcome'])

pred = model.predict(df_test[['Pregnancies', 'Glucose', 'BloodPressure']])
accuracy_score(y_pred = pred, y_true = df_test['Outcome']).round(2)

0.73

In [40]:
# Q2
df = pd.read_csv('diabetes.csv')
df['is_preg'] = (df['Pregnancies']>0) + 0
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

x_cols = ['is_preg', 'Glucose', 'BloodPressure', 'Insulin', 'BMI']
k_list = [3, 5, 10, 20]
acc_list = []
for k in k_list:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X = df_train[x_cols],
              y = df_train['Outcome'])
    pred = model.predict(df_test[x_cols])
    acc = accuracy_score(y_pred = pred, y_true = df_test['Outcome']).round(2)
    acc_list.append(acc)

df_acc = pd.DataFrame({'k':k_list, 'acc':acc_list})
df_acc

Unnamed: 0,k,acc
0,3,0.71
1,5,0.73
2,10,0.78
3,20,0.76


In [42]:
# Q3
df = pd.read_csv('diabetes.csv')
df['is_preg'] = (df['Pregnancies']>0) + 0
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

x_cols = ['is_preg', 'Glucose', 'BloodPressure', 'Insulin']
k_list = [3, 5, 10, 20]
rmse_list = []
for k in k_list:
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X = df_train[x_cols], y = df_train['BMI'])
    pred = model.predict(df_test[x_cols])
    rmse = ((mean_squared_error(y_pred = pred, y_true = df_test['BMI']))**0.5).round(3)
    rmse_list.append(rmse)

df_rmse = pd.DataFrame({'k':k_list, 'rmse':rmse_list})
df_rmse

Unnamed: 0,k,rmse
0,3,8.536
1,5,8.715
2,10,8.526
3,20,8.491


## 의사결정나무 모델: 분류 및 회귀나무

In [258]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [267]:
# Q1
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

model = DecisionTreeClassifier(random_state=123)
model.fit(y = df_train['Outcome'],
         X = df_train.loc[:, ['Glucose', 'BloodPressure', 'Pregnancies']])

pred = model.predict(df_test.loc[:, ['Glucose', 'BloodPressure', 'Pregnancies']])

accuracy_score(y_pred = pred, y_true = df_test['Outcome']).round(2)

0.63

In [272]:
# Q2
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

model = DecisionTreeRegressor(random_state=123)
model.fit(y = df_train['BMI'],
         X = df_train.loc[:, ['Glucose', 'BloodPressure', 'SkinThickness']])

pred = model.predict(df_test.loc[:, ['Glucose', 'BloodPressure', 'SkinThickness']])

(mean_squared_error(y_true = df_test['BMI'], y_pred = pred)**0.5).round(1)

9.9

In [281]:
# Q3
df = pd.read_csv('diabetes.csv')
df_train, df_test = train_test_split(df, train_size=0.7, random_state=345)

cols = ['Glucose', 'BloodPressure', 'Pregnancies', 'BMI', 'Age']
depth_list = [3, 4, 5, 6]

accs = []
for depth in depth_list:
    model = DecisionTreeClassifier(max_depth=depth, random_state=345)
    model.fit(y = df_train['Outcome'],
             X = df_train.loc[:, cols])
    pred = model.predict(df_test.loc[:, cols])
    accs.append(accuracy_score(y_pred=pred, y_true=df_test['Outcome']).round(2))

df_acc = pd.DataFrame({'depth':depth_list, 'accuracy':accs})
df_acc

Unnamed: 0,depth,accuracy
0,3,0.77
1,4,0.76
2,5,0.76
3,6,0.77


## 추천: 연관성 분석(Association Rule)

In [43]:
!pip3 install mlxtend
from mlxtend.frequent_patterns import apriori, association_rules

Collecting mlxtend
  Downloading mlxtend-0.20.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 10.5 MB/s eta 0:00:01     |██████████████                  | 583 kB 10.5 MB/s eta 0:00:01
Installing collected packages: mlxtend
Successfully installed mlxtend-0.20.0


In [67]:
# Q1
df = pd.read_csv('association_rules_mart.csv')
df = df.iloc[:, 1:].drop_duplicates()

df['purchase'] = True
df_pivot = pd.pivot_table(data=df, index='ID', columns='Item', values='purchase',
                         aggfunc=max, fill_value=False)

# apriori
item_sets = apriori(df=df_pivot, min_support=0.005, use_colnames=True)

# association rules
rules = association_rules(df=item_sets, metric='confidence', min_threshold=0.005)
rules_sub = rules[rules['support']>=0.1]
rules_sub = rules_sub.sort_values('lift', ascending=False)
len(rules_sub)

26

In [83]:
# Q2
df = pd.read_csv('association_rules_mart.csv')
df = df.iloc[:, 1:].drop_duplicates()

df['purchase'] = True
df_pivot = pd.pivot_table(data=df, index='ID', columns='Item', values='purchase',
                         aggfunc=max, fill_value=False)

# apriori
item_sets = apriori(df=df_pivot, min_support=0.005, use_colnames=True, max_len=3)

# association rules
rules = association_rules(df=item_sets, metric='confidence', min_threshold=0.005)
rules_sub = rules[rules['support']>=0.01]
rules_sub = rules_sub.sort_values('lift', ascending=False)
rules_sub.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
22844,(meat),"(domestic eggs, whole milk)",0.063622,0.070292,0.010262,0.16129,2.294561,0.005789,1.108497
22841,"(domestic eggs, whole milk)",(meat),0.070292,0.063622,0.010262,0.145985,2.294561,0.005789,1.096442
22842,"(meat, whole milk)",(domestic eggs),0.03489,0.133145,0.010262,0.294118,2.208999,0.005616,1.228044
22843,(domestic eggs),"(meat, whole milk)",0.133145,0.03489,0.010262,0.077071,2.208999,0.005616,1.045704
18054,"(fruit/vegetable juice, whole milk)",(chocolate),0.06234,0.086455,0.010775,0.17284,1.999194,0.005385,1.104435


In [98]:
# Q3
df = pd.read_csv('association_rules_mart.csv')

df_item_cnt = df['Item'].value_counts().reset_index()
df_item_cnt = df_item_cnt.iloc[:30, :]
df_sub = df[df['Item'].isin(df_item_cnt['index'])]

df_sub['purchase'] = True
df_pivot = pd.pivot_table(data=df_sub, index='ID', columns='Item', values='purchase',
                          aggfunc=max, fill_value=False)

# apriori
item_sets = apriori(df=df_pivot, min_support=0.005, use_colnames=True)

# association rules
rules = association_rules(df=item_sets, metric='confidence', min_threshold=0.005)
rules_sub = rules[rules['support']>=0.03]
rules_sub = rules_sub.sort_values('lift', ascending=False)
rules_sub.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['purchase'] = True


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
41842,"(other vegetables, whole milk)","(rolls/buns, yogurt)",0.192964,0.112261,0.034661,0.179625,1.600067,0.012999,1.082114
41843,"(rolls/buns, yogurt)","(other vegetables, whole milk)",0.112261,0.192964,0.034661,0.308756,1.600067,0.012999,1.167512
41841,"(other vegetables, yogurt)","(rolls/buns, whole milk)",0.121314,0.180031,0.034661,0.285714,1.587028,0.012821,1.147957
41844,"(rolls/buns, whole milk)","(other vegetables, yogurt)",0.180031,0.121314,0.034661,0.192529,1.587028,0.012821,1.088195
41840,"(other vegetables, rolls/buns)","(yogurt, whole milk)",0.147957,0.151837,0.034661,0.234266,1.542881,0.012196,1.107647


## 주성분 분석(PCA)

In [124]:
from sklearn.decomposition import PCA
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error

In [125]:
# Q1
df = pd.read_csv('diamonds.csv')

pca = PCA(n_components=3)
df_pca = pca.fit_transform(df[['x', 'y', 'z']])
df_pca = pd.DataFrame(df_pca, columns=['comp_1', 'comp_2', 'comp_3'])

In [126]:
df[['x', 'y', 'z']].corr().round(3)

Unnamed: 0,x,y,z
x,1.0,0.975,0.971
y,0.975,1.0,0.952
z,0.971,0.952,1.0


In [127]:
df_pca.corr().round(3)

Unnamed: 0,comp_1,comp_2,comp_3
comp_1,1.0,0.0,0.0
comp_2,0.0,1.0,0.0
comp_3,0.0,0.0,1.0


In [128]:
# Q2
df = pd.read_csv('diamonds.csv')

pca = PCA(n_components=5)
df_pca = pca.fit_transform(df[['x', 'y', 'z', 'table', 'depth']])

pd.Series(pca.explained_variance_ratio_).cumsum()

0    0.541054
1    0.821735
2    0.994760
3    0.998496
4    1.000000
dtype: float64

In [139]:
# Q3
df = pd.read_csv('diamonds.csv')

pca = PCA(n_components=1)
df_pca = pca.fit_transform(df[['x', 'y', 'z']])
df['comp_1'] = df_pca

df_train, df_test = train_test_split(df, train_size=0.8, random_state=123)

# model 1
model_1 = ols('price ~ carat + x', df_train).fit()
pred_1 = model_1.predict(df_test)
rmse_1 = (mean_squared_error(y_pred=pred_1, y_true=df_test['price']))**0.5
print('1번 모델:', rmse_1.round())

# model 2
model_2 = ols('price ~ carat + comp_1', df_train).fit()
pred_2 = model_2.predict(df_test)
rmse_2 = (mean_squared_error(y_pred=pred_2, y_true=df_test['price']))**0.5
print('2번 모델:', rmse_2.round())

1번 모델: 1526.0
2번 모델: 1529.0


## 실전 종합 문제 1

In [50]:
import pandas as pd
from scipy.stats import ttest_ind
from sklearn.cluster import AgglomerativeClustering

In [52]:
# Q1
df_2019 = pd.read_csv('Seoul_Bus_2019.csv')
df_info = pd.read_csv('Seoul_Bus_info.csv')
df = df_2019.merge(df_info, left_on='Line_No', right_on='Bus_no')
df_q1 = df.copy()

df_q1['total'] = df_q1.loc[:, 'H01':'H24'].sum(axis=1) # 모든 시간의 인원 합
df_q1 = df_q1[df_q1['type'].isin(['지선', '간선'])] # 지선, 간선 버스만 추출

df_q1_cnt = df_q1.groupby('Line_No')['total'].sum().reset_index()
len(df_q1_cnt[df_q1_cnt['total']<=(1000000000/1000)])

17

In [60]:
# Q2
df_q2 = df.copy()

df_q2_gan = df_q2[df_q2['type']=='간선']
df_q2_ji = df_q2[df_q2['type']=='지선']

df_q2_gan = df_q2_gan[['Line_No', 'Station_ID']].drop_duplicates()
df_q2_ji = df_q2_ji[['Line_No', 'Station_ID']].drop_duplicates()

df_q2_gan_cnt = df_q2_gan.groupby('Line_No')['Station_ID'].nunique().reset_index()
df_q2_ji_cnt = df_q2_ji.groupby('Line_No')['Station_ID'].nunique().reset_index()

stat, p = ttest_ind(df_q2_gan_cnt['Station_ID'],
                   df_q2_ji_cnt['Station_ID'])
abs(stat).round(2)

9.5

In [92]:
# Q3
def nor_minmax(x):
    return (x-min(x)) / (max(x)-min(x))

df_q3 = df.copy()
df_q3 = df_q3[df_q3['type']=='지선']
df_q3 = pd.concat([df_q3[['Year_Month', 'Station_ID']],
                 df_q3.loc[:, 'H01':'H24']],
                 axis=1)
df_q3_sum = df_q3.iloc[:, 1:].groupby('Station_ID').sum().reset_index()
df_q3_melt = df_q3_sum.melt(id_vars='Station_ID')
df_q3_melt = df_q3_melt.sort_values(['Station_ID', 'variable'], ascending=[False, True])
df_q3_melt['value_nor'] = df_q3_melt.groupby('Station_ID')['value'].transform(nor_minmax)
df_q3_pivot = pd.pivot(df_q3_melt, index='Station_ID', columns='variable', values='value_nor')

model = AgglomerativeClustering(n_clusters=6).fit(df_q3_pivot)
df_q3_pivot['cluster'] = model.labels_
df_q3_pivot = df_q3_pivot.reset_index()
df_q3_pivot_melt = df_q3_pivot.melt(id_vars=['Station_ID', 'cluster'])
df_q3_pivot_melt

Unnamed: 0,Station_ID,cluster,variable,value
0,100000006,0,H01,0.001992
1,100000007,0,H01,0.001446
2,100000008,3,H01,0.006030
3,100000009,3,H01,0.000000
4,100000010,3,H01,0.000000
...,...,...,...,...
134131,998502006,3,H24,0.000000
134132,998502024,3,H24,0.000000
134133,998502173,1,H24,0.526761
134134,998601159,3,H24,0.115789


## 실전 종합 문제 2

## 실전 종합 문제 3