# sklearn.linear_model을 활용한 Ordinary Least Squares Regression

## 작동 예

In [1]:
#패키지 라이브러리 호출
import numpy as np
from sklearn.linear_model import LinearRegression
# Inputs must be arrays: x becomes a 2-D column (one row per observation)
x = np.array([5, 15, 25, 35, 45, 55])[:, np.newaxis]
# Response variable y (1-D)
y = np.array([5, 20, 14, 32, 22, 38])

# Create the estimator and fit it in one step (fit() returns the estimator)
model = LinearRegression().fit(x, y)

# --- Fitted quantities ---
# R^2 (coefficient of determination), scored on the training data itself
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)
# Intercept, then slope
print('intercept:', model.intercept_)
print('slope:', model.coef_)

# Predictions and SSE (sum of squared residuals)
y_pred = model.predict(x)
print('predicted response:', y_pred, sep='\n')
residuals_sq = np.square(y - y_pred)
print('SSE:', residuals_sq.sum())

coefficient of determination: 0.715875613747954
intercept: 5.633333333333333
slope: [0.54]
predicted response:
[ 8.33333333 13.73333333 19.13333333 24.53333333 29.93333333 35.33333333]
SSE: 202.53333333333336


# Pandas 데이터 사용

In [2]:
import pandas as pd
# Small demo frame: columns x1 and x2 are used as predictors and y1 as the response below
sample_data = pd.DataFrame(
    dict(
        x1=[1, 2, 3, 4, 5, 6],
        x2=[2, 3, 4, 5, 5, 7],
        y1=[10, 20, 34, 40, 52, 61],
    )
)

In [3]:
sample_data

Unnamed: 0,x1,x2,y1
0,1,2,10
1,2,3,20
2,3,4,34
3,4,5,40
4,5,5,52
5,6,7,61


In [4]:
# Select the y1 column (all rows) as a pandas Series
sample_data.loc[:,'y1']

0    10
1    20
2    34
3    40
4    52
5    61
Name: y1, dtype: int64

In [5]:
# Fit y1 on the two predictor columns; the fit() call is left as the last
# expression so the cell displays the fitted estimator.
model = LinearRegression()
model.fit(sample_data[['x1', 'x2']], sample_data['y1'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [6]:
# R^2 (coefficient of determination) of the fitted model, scored on the same rows it was trained on
model.score(sample_data.loc[:,['x1','x2']],sample_data.loc[:,'y1'])

0.9936002202074767

In [7]:
# Fitted slope coefficients: one value per predictor column passed to fit()
model.coef_

array([10.89189189, -0.75675676])

In [8]:
# Fitted intercept
model.intercept_

1.3243243243243157

In [9]:
def SSE(m, x, y):
    """Return the sum of squared residuals of model ``m`` on ``(x, y)``.

    ``m`` only needs a ``predict(x)`` method; ``y`` must support elementwise
    subtraction with the predictions.
    """
    errors = y - m.predict(x)
    return (errors ** 2).sum()

In [10]:
# Sum of squared residuals of the fitted model on its own training rows
SSE(model,sample_data.loc[:,['x1','x2']],sample_data.loc[:,'y1'])

11.729729729729733

## Group 처리 절차

In [11]:
import pandas as pd
# Twelve rows in two groups: g is 'a' for the first six rows and 'b' for the last six.
# Note: this rebinds sample_data, replacing the 6-row frame defined earlier.
sample_data = pd.DataFrame(
    dict(
        x1=[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6],
        x2=[2, 3, 4, 5, 5, 7, 2, 3, 3, 4, 4, 8],
        y1=[10, 20, 34, 40, 52, 61, 10, 20, 30, 40, 50, 65],
        g=['a'] * 6 + ['b'] * 6,
    )
)

In [12]:
sample_data

Unnamed: 0,x1,x2,y1,g
0,1,2,10,a
1,2,3,20,a
2,3,4,34,a
3,4,5,40,a
4,5,5,52,a
5,6,7,61,a
6,1,2,10,b
7,2,3,20,b
8,3,3,30,b
9,4,4,40,b


In [13]:
def regressio_runner(v):
    """Fit a LinearRegression of y1 on (x1, x2) using the rows of ``v``.

    Returns the fitted estimator.
    """
    features, target = v[['x1', 'x2']], v['y1']
    # fit() returns the estimator itself, so build and fit in one expression
    return LinearRegression().fit(features, target)
def regression_runner_slope(v):
    """Fit the model on ``v`` and return its coefficient array (``coef_``)."""
    return regressio_runner(v).coef_
def regression_runner_intercept(v):
    """Fit the model on ``v`` and return its intercept (``intercept_``)."""
    return regressio_runner(v).intercept_
def regression_SSE(v):
    """Fit the model on ``v`` and return its sum of squared residuals on ``v``."""
    fitted = regressio_runner(v)
    errors = v['y1'] - fitted.predict(v[['x1', 'x2']])
    return (errors ** 2).sum()

In [14]:
# Intercept when the model is fitted on the whole frame at once (no groupby, so column g is ignored)
regression_runner_intercept(sample_data)

-1.6468023255813904

In [15]:
# Fit one model per value of g; v1 is indexed by group label and each element is that group's coefficient array
v1=sample_data.groupby(['g']).apply(regression_runner_slope)

In [16]:
v1

g
a    [10.89189189189189, -0.7567567567567541]
b     [9.322916666666668, 1.4322916666666692]
dtype: object

In [17]:
# Coefficient array for group 'a'
v1['a']

array([10.89189189, -0.75675676])

In [18]:
# First coefficient of group 'a'
v1['a'][0]

10.89189189189189

In [19]:
# Per-group SSE: a model is fitted on each group's rows and scored on those same rows
v2=sample_data.groupby(['g']).apply(regression_SSE)

In [20]:
v2

g
a    11.729730
b     0.651042
dtype: float64

In [21]:
# Smallest SSE across the groups
v2.min()

0.651041666666663

In [22]:
# Label of the group with the smallest SSE
v2.idxmin()

'b'

## Group별로 SSE와 slope를 계산한 다음 최소 SSE 그룹을 DataFrame으로 정리

In [23]:
def find_min_SSE_in_group(dataset, group_col='g', x_cols=('x1', 'x2'), y_col='y1'):
    """Fit one OLS model per group and report the group with the smallest SSE.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns named by ``group_col``, ``x_cols`` and ``y_col``.
    group_col : str, default 'g'
        Name of the single column whose values define the groups.
    x_cols : str or sequence of str, default ('x1', 'x2')
        Predictor column(s).
    y_col : str, default 'y1'
        Response column.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with columns ``MinSseGroupName``, ``MinSseValue`` and
        ``beta1`` .. ``betaK``, one beta per fitted coefficient of the
        minimum-SSE group, in the order returned by ``coef_``.

    Raises
    ------
    ValueError
        If ``dataset`` yields no groups (for example, it has no rows).
    """
    # A bare string would otherwise be split into characters by list().
    feature_cols = [x_cols] if isinstance(x_cols, str) else list(x_cols)

    sse_by_group = {}
    coef_by_group = {}
    # Iterate over the groups directly so each model is fitted exactly once and
    # the SSE and coefficients come from that same fit. This also avoids
    # groupby.apply handing the grouping column to the callback.
    for name, rows in dataset.groupby(group_col):
        features = rows[feature_cols]
        target = rows[y_col]
        model = LinearRegression().fit(features, target)
        residuals = target - model.predict(features)
        sse_by_group[name] = (residuals ** 2).sum()
        coef_by_group[name] = model.coef_

    if not sse_by_group:
        raise ValueError("dataset contains no groups to fit")

    sse = pd.Series(sse_by_group)
    idx_min = sse.idxmin()
    summary = {"MinSseGroupName": idx_min, "MinSseValue": sse.loc[idx_min]}
    for position, coefficient in enumerate(coef_by_group[idx_min], start=1):
        summary["beta{}".format(position)] = coefficient
    return pd.DataFrame(summary, index=[0])

In [24]:
# Re-create the 12-row, two-group demo frame used as input to find_min_SSE_in_group
sample_data = pd.DataFrame(
    dict(
        x1=[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6],
        x2=[2, 3, 4, 5, 5, 7, 2, 3, 3, 4, 4, 8],
        y1=[10, 20, 34, 40, 52, 61, 10, 20, 30, 40, 50, 65],
        g=['a'] * 6 + ['b'] * 6,
    )
)

In [25]:
# Summarise the minimum-SSE group (label, SSE, and its two coefficients) as a one-row DataFrame
find_min_SSE_in_group(sample_data)

Unnamed: 0,MinSseGroupName,MinSseValue,beta1,beta2
0,b,0.651042,9.322917,1.432292
