# Linear Regression Example

In [1]:
import numpy as np
import pandas as pd 

# 모델 라이브러리 선언
from sklearn import datasets, linear_model

# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# 시각화 라이브러리 선언
import matplotlib.pyplot as plt

### 1. 분석데이터 로딩

In [2]:
pwd

'C:\\Users\\cj\\Python_CJ_ST_COPY\\Python_CJ_ST\\Session01 - Why Python for Data Analysis'

In [3]:
# Read the CSV file into a DataFrame.
# The directory was previously spelled "dataset./" (stray trailing dot). That spelling
# only resolves on platforms that strip trailing dots from path components, so the
# plain "dataset/" directory is used here to keep the notebook portable.
featuresData = pd.read_csv("../dataset/feature_regression_example.csv")

In [4]:
featuresData.head(5)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


In [5]:
featuresData.head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442


In [6]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly keeps
# this cell working on pandas versions where corr() no longer drops text columns
# (REGIONID, HOLIDAY, ...) by itself and raises instead.
featuresData.select_dtypes(include="number").corr()

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT
YEARWEEK,1.0,0.961051,0.213022,0.037392,-0.030681,0.30032
YEAR,0.961051,1.0,-0.065302,-0.048803,0.067443,0.208435
WEEK,0.213022,-0.065302,1.0,0.307541,-0.349205,0.347462
QTY,0.037392,-0.048803,0.307541,1.0,-0.54492,0.712772
HCLUS,-0.030681,0.067443,-0.349205,-0.54492,1.0,-0.552991
PRO_PERCENT,0.30032,0.208435,0.347462,0.712772,-0.552991,1.0


### 2. 데이터 형 변환

In [7]:
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [8]:
# Type conversion: cast the week, quantity and promotion-rate columns to float64,
# then show the resulting column dtypes.
float_columns = ['WEEK', 'QTY', 'PRO_PERCENT']
featuresData[float_columns] = featuresData[float_columns].astype('float64')
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK            float64
QTY             float64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

### 3. 문자데이터 코드변환(Vector연산)

In [9]:
#case when 
############################
def codeConversion(df):
    """Encode a yes/no flag as an integer.

    Despite its name, ``df`` is a single cell value: the function is applied
    element-wise to a column, not called with a whole DataFrame.

    Returns 1 when ``df`` equals the string "Y", and 0 for any other value.
    """
    return 1 if df == "Y" else 0

In [10]:
# Encode the PROMOTION flag ("Y" -> 1, anything else -> 0) into a new PROMOTIONCODE column.
featuresData['PROMOTIONCODE'] = featuresData['PROMOTION'].map(codeConversion)
featuresData.head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1


In [11]:
# Encode the HOLIDAY flag ("Y" -> 1, anything else -> 0) into a new HOLIDAYCODE column.
featuresData['HOLIDAYCODE'] = featuresData['HOLIDAY'].map(codeConversion)
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3.0,1209.0,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4.0,1810.0,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5.0,1773.0,N,4,Y,0.208155,1,0


In [12]:
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3.0,1209.0,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4.0,1810.0,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5.0,1773.0,N,4,Y,0.208155,1,0


In [13]:
# Correlation matrix including the newly encoded PROMOTIONCODE / HOLIDAYCODE columns.
# Numeric columns are selected explicitly so the text columns do not make corr()
# raise on pandas versions that no longer drop them automatically.
featuresData.select_dtypes(include="number").corr()

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
YEARWEEK,1.0,0.961051,0.213022,0.037392,-0.030681,0.30032,0.108551,0.009395
YEAR,0.961051,1.0,-0.065302,-0.048803,0.067443,0.208435,0.085606,-0.070803
WEEK,0.213022,-0.065302,1.0,0.307541,-0.349205,0.347462,0.089293,0.284231
QTY,0.037392,-0.048803,0.307541,1.0,-0.54492,0.712772,0.630081,0.514813
HCLUS,-0.030681,0.067443,-0.349205,-0.54492,1.0,-0.552991,-0.386926,-0.974902
PRO_PERCENT,0.30032,0.208435,0.347462,0.712772,-0.552991,1.0,0.903477,0.496585
PROMOTIONCODE,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,1.0,0.378861
HOLIDAYCODE,0.009395,-0.070803,0.284231,0.514813,-0.974902,0.496585,0.378861,1.0


### 4. 데이터 셋 분리

In [14]:
# Cut-off week in YEARWEEK form (YYYYWW). Rows with YEARWEEK up to and including this
# value form the training set; rows after it form the test set (see the split cells below).
predictStd = 201630

In [15]:
# Training set: every row whose YEARWEEK is at or before the cut-off week.
trainingData = featuresData.loc[featuresData['YEARWEEK'] <= predictStd]
trainingData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1.0,1225.0,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2.0,968.0,N,4,Y,0.209442,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3.0,1209.0,N,4,Y,0.208155,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4.0,1810.0,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5.0,1773.0,N,4,Y,0.208155,1,0


In [16]:
# Test set: every row whose YEARWEEK is after the cut-off week.
testData = featuresData.loc[featuresData['YEARWEEK'] > predictStd]
testData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
83,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0
84,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0
85,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0
86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1
87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1


In [17]:
# Model inputs for training: week number, promotion rate and encoded holiday flag.
trainingData_feature = trainingData.loc[:, ['WEEK', 'PRO_PERCENT', 'HOLIDAYCODE']]

In [18]:
# Training target: the QTY column, kept as a one-column DataFrame.
trainingData_label = trainingData.loc[:, ['QTY']]

In [19]:
# Model inputs for testing: the same three columns used for training.
testData_feature = testData.loc[:, ['WEEK', 'PRO_PERCENT', 'HOLIDAYCODE']]

In [20]:
# Test target: the actual QTY values, kept as a one-column DataFrame.
testData_label = testData.loc[:, ['QTY']]

In [21]:
# from sklearn import tree
#clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

### 5. 모델선언 및 예측

### 안녕하세요

In [22]:
#model_method= tree.DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0)

In [23]:
# from sklearn import tree
#model_method = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

In [24]:
# Linear regression estimator from scikit-learn's linear_model module (not yet fitted).
model_method = linear_model.LinearRegression()

In [25]:
### Extract Coefficient 머신러닝!!

In [26]:
# Fit the estimator on the training features (WEEK, PRO_PERCENT, HOLIDAYCODE) against QTY.
model = model_method.fit(trainingData_feature, trainingData_label)

In [27]:
# Predict QTY for the held-out test rows; the result is a NumPy array with one row per test row.
predict = model.predict(testData_feature)

In [28]:
predict

array([[1363.12337213],
       [1367.51233386],
       [ 301.07146112],
       [1945.76874053],
       [1950.15770226],
       [1954.54666398],
       [1497.68802819],
       [1502.07698991],
       [1398.23506593],
       [1402.62402766],
       [1868.2605869 ],
       [1411.40195111],
       [ 344.96107837],
       [2422.58190441],
       [2426.97086614],
       [2431.35982787],
       [2435.74878959],
       [2440.13775132],
       [1983.27911552],
       [1722.01044627],
       [1726.399408  ],
       [2192.03596724],
       [ 200.12534144],
       [ 217.68118834]])

In [29]:
# Answer key: the actual QTY values of the first test rows, for comparison with the predictions.
testData_label.head()

Unnamed: 0,QTY
83,1522.0
84,2100.0
85,43.0
86,1700.0
87,1514.0


### 6. 데이터 정리

In [30]:
type(predict)

numpy.ndarray

In [31]:
# Wrap the prediction array in a DataFrame so it can later be joined to the test rows.
predictData = pd.DataFrame(predict)

In [32]:
predictData

Unnamed: 0,0
0,1363.123372
1,1367.512334
2,301.071461
3,1945.768741
4,1950.157702
5,1954.546664
6,1497.688028
7,1502.07699
8,1398.235066
9,1402.624028


In [33]:
predictData.head()

Unnamed: 0,0
0,1363.123372
1,1367.512334
2,301.071461
3,1945.768741
4,1950.157702


In [34]:
# Give the single prediction column a descriptive name (it is labelled 0 by default).
predictData.columns = ["PREDICT"]

In [35]:
testData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
83,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0
84,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0
85,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0
86,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1
87,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1
88,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36.0,1501.0,Y,1,Y,0.308584,1,1
89,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37.0,1491.0,N,4,Y,0.308584,1,0
90,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38.0,806.0,N,4,Y,0.308584,1,0
91,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39.0,2111.0,N,4,Y,0.280258,1,0
92,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40.0,2400.0,N,4,Y,0.280258,1,0


In [36]:
# Renumber the test rows from 0 so they line up positionally with the predictions.
# Rebinding the name (instead of inplace=True) keeps the cell a plain assignment,
# which is the idiomatic pandas form and is safe to re-run.
testData = testData.reset_index(drop=True)

In [37]:
testData.head(5)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1


In [38]:
# Renumber the prediction rows from 0 so they pair up with the renumbered test rows.
# Rebinding the name (instead of inplace=True) keeps the cell a plain assignment,
# which is the idiomatic pandas form and is safe to re-run.
predictData = predictData.reset_index(drop=True)

In [39]:
#testData_feature.reset_index(drop=True, inplace=True)

In [40]:
# Concatenate the predictions to the test rows as a new column (column binding).
# concat(axis=1) aligns rows on index labels, so both frames are renumbered from 0
# right here: rows are then paired by position even if the separate reset_index
# cells were skipped, instead of silently producing NaN-padded, mismatched rows.
finalDf = pd.concat(
    [testData.reset_index(drop=True), predictData.reset_index(drop=True)],
    axis=1,
)

In [41]:
finalDf.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,PROMOTIONCODE,HOLIDAYCODE,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31.0,1522.0,N,4,Y,0.280258,1,0,1363.123372
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32.0,2100.0,N,4,Y,0.280258,1,0,1367.512334
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33.0,43.0,N,4,N,0.0,0,0,301.071461
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34.0,1700.0,Y,1,Y,0.308584,1,1,1945.768741
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35.0,1514.0,Y,1,Y,0.308584,1,1,1950.157702


In [42]:
# Save the test rows together with their predictions to a CSV file in the current directory.
# NOTE(review): the "dt_result" prefix looks left over from a decision-tree notebook although
# the model fitted here is LinearRegression, and the same frame is written again at the end
# of the notebook under another name — confirm both files are really needed.
finalDf.to_csv("./dt_result_20181112_LinearRegression.csv")

In [43]:
pwd

'C:\\Users\\cj\\Python_CJ_ST_COPY\\Python_CJ_ST\\Session01 - Why Python for Data Analysis'

### 7. 정확도 측정

In [52]:
# Mean absolute error between the actual and the predicted quantities.
mean_absolute_error(y_true=finalDf['QTY'], y_pred=finalDf['PREDICT'])

455.62189421546094

In [53]:
# Mean squared error between the actual and the predicted quantities.
mean_squared_error(y_true=finalDf['QTY'], y_pred=finalDf['PREDICT'])

376307.3344786445

In [54]:
# Coefficient of determination (R^2) of the predictions against the actual quantities.
r2_score(y_true=finalDf['QTY'], y_pred=finalDf['PREDICT'])

0.38670554032608817

In [55]:
# Save the test rows together with their predictions to a CSV file in the current directory.
# NOTE(review): finalDf was already written once earlier in the notebook under a different
# name, and "decisiontree" in this file name does not match the LinearRegression model
# fitted here — confirm which output file is the intended one.
finalDf.to_csv("./decisiontree_result_LinearRegression.csv")