# 기계학습 3장 예제 파이썬
> 예제 파이썬

- toc: true
- branch: master
- badges: true
- comments: true
- author: 김동준
- categories: ["Python", "기계학습"]

# Advertising data 불러오고 df로 만들기

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the advertising data set into a DataFrame; header=0 takes the column names from the first row.
ad = pd.read_csv("Advertising.csv", header=0) # load the CSV file

# 1. Logistic regression model
> statsmodels에 의한 로지스틱 모형 적합

In [12]:
# Recode sales as a binary class label so the task becomes classification:
# 1 (high sales) when sales exceeds 10, otherwise 0.
# Note: this overwrites the original sales column of `ad` in place.
ad['sales'] = ad['sales'].gt(10).mul(1)

# Predictors: the TV, radio and newspaper columns. Response: the recoded sales column.
X = ad[['TV', 'radio', 'newspaper']]
Y = ad['sales']

# Logistic regression (binomial GLM) with statsmodels

In [13]:
import statsmodels.api as sm              # load the statsmodels package
# Fit sales on TV, radio and newspaper as a GLM with a Binomial family
# (the summary below reports a logit link, i.e. logistic regression).
model = sm.GLM.from_formula("sales ~ TV + radio + newspaper", family = sm.families.Binomial(), data=ad)
result = model.fit()
# Last expression of the cell, so the notebook displays the fitted-model summary tables.
result.summary()

0,1,2,3
Dep. Variable:,sales,No. Observations:,200.0
Model:,GLM,Df Residuals:,196.0
Model Family:,Binomial,Df Model:,3.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-13.36
Date:,"Sat, 23 Oct 2021",Deviance:,26.719
Time:,20:03:15,Pearson chi2:,30.1
No. Iterations:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-21.5305,6.269,-3.435,0.001,-33.817,-9.244
TV,0.2089,0.060,3.454,0.001,0.090,0.328
radio,0.4054,0.124,3.277,0.001,0.163,0.648
newspaper,-0.0086,0.026,-0.332,0.740,-0.059,0.042


# sklearn을 이용한 로지스틱 적합 
> (머신러닝에 특화된 패키지로, summary 같은 통계적 요약 출력은 잘 제공하지 않음)

In [6]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Fit scikit-learn's logistic regression on all rows of X and Y, then report the fit.
model1 = LogisticRegression()   # model instance setup
model2 = model1.fit(X,Y)      # model fitting to data
print(model2.intercept_)  # fitted intercept
print(model2.coef_)  # fitted coefficients, one per column of X
print(model2.score(X,Y))  # score evaluated on the same data the model was fit to

[-21.16756665]
[[ 0.2054616   0.39803989 -0.00823107]]
0.97


In [16]:
from sklearn import metrics
# Predicted class labels for the same rows of X that model2 was fit on.
Y_pred = model2.predict(X)

In [17]:
# Cross-tabulate the observed labels Y against the predictions Y_pred
# (rows are true classes, columns are predicted classes).
cm = metrics.confusion_matrix(Y, Y_pred) # confusion matrix
cm

array([[ 42,   3],
       [  3, 152]])

In [18]:
model2.score(X,Y)   # accuracy (proportion of correctly classified rows)

0.97

# 예측성능의 평가를 위해 훈련자료와 평가자료를 적당한 비율로 나누어 계산해 보자.

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; random_state=0 fixes the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [20]:
# Refit on the training split only. This rebinds model1 and model2, replacing the
# earlier model that was fit on the full data set.
model1 = LogisticRegression()   # model instance setup
model2 = model1.fit(X_train,Y_train)      # model fitting to data

In [22]:
# Report the intercept and coefficients of the model fit on the training split.
print(model2.intercept_)
print(model2.coef_)
# NOTE(review): this scores on the full data (X, Y), which includes the training rows,
# so it is not a held-out estimate. Scoring on (X_test, Y_test) would measure
# performance on the test split created above -- confirm which was intended.
print(model2.score(X,Y))

[-21.87095867]
[[0.20169776 0.38998892 0.02502465]]
0.96


In [23]:
# Class-membership probabilities for each held-out test row: one output row per
# observation, with one column per class.
model2.predict_proba(X_test)  # predicted probability for 0 vs 1

array([[3.68362025e-01, 6.31637975e-01],
       [9.98889402e-01, 1.11059773e-03],
       [9.49784560e-01, 5.02154405e-02],
       [0.00000000e+00, 1.00000000e+00],
       [7.68983531e-08, 9.99999923e-01],
       [9.99484252e-01, 5.15748381e-04],
       [3.03408631e-01, 6.96591369e-01],
       [1.36557432e-13, 1.00000000e+00],
       [8.49772353e-01, 1.50227647e-01],
       [1.22679644e-12, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       [2.75460182e-01, 7.24539818e-01],
       [1.77948022e-05, 9.99982205e-01],
       [2.34914310e-11, 1.00000000e+00],
       [1.08841492e-02, 9.89115851e-01],
       [3.33913329e-05, 9.99966609e-01],
       [0.00000000e+00, 1.00000000e+00],
       [9.97690690e-01, 2.30930953e-03],
       [1.23437704e-06, 9.99998766e-01],
       [3.13082893e-14, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       [6.14898732e-04, 9.99385101e-01],
       [1.01664233e-10, 1.00000000e+00],
       [2.94828121e-06, 9.99997052e-01],
       [9.999425