In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from tensorflow.keras.layers import LSTM, Dense

from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer, PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,r2_score

## R-Squared 계산 함수 정의
def show_reg_result(y_test,y_pred):
    print("Max Error: ",
      round(np.abs(y_test - y_pred).max(),3))
    print("Mean Absolute Error: ",
      round(np.abs(y_test-y_pred).mean(),3))
    print("Root Mean Squared Error: ",
      mean_squared_error(y_test,y_pred, squared=False).round(3))
    print("R-Squared: ",
      r2_score(y_test, y_pred).round(3))

## 교차 검증을 수행하는 함수 정의
def cross_validation(model, X, y, n=5):
    cv = KFold(n, shuffle=False)
    return cross_val_score(model, X, y, cv=cv).mean().round(4)

# 입력 변수 가중치
def plot_feature_weight(feature, weight, N=20):
    plt.figure(figsize=(8,int(N/2)))
    W = pd.DataFrame({'feature':feature[:N],'weight':weight[:N]})
    W.sort_values('weight', inplace=True)
    plt.barh(W.feature, W.weight)


#
# 1. Data Load
- Cape의 경우 BDI 정보가 없음

In [45]:
#data = pd.read_excel('PERFORMANCE_DATA(TRADE, NATION).xlsx')
data = pd.read_excel("https://raw.githubusercontent.com/coj918/github_from_HD/master/DAILY_CB_TR_CAPE_OWNED.xlsx")
data.columns = ['YEAR_MONTH','CB_TYPE','VSL_TYPE','TC_TYPE','IND_12','IND_11','IND_10','IND_9','IND_8','IND_7','IND_6','IND_5','IND_4','IND_3','IND_2','IND_1','IND','M12','M_11','M_10','M_9','M_8','M_7','M_6','M_5','M_4','M_3','M_2','M_1','DAY_CB']
data.set_index("YEAR_MONTH", inplace=True)

data.info()

data[:5]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108 entries, 201501 to 202312
Data columns (total 29 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CB_TYPE   108 non-null    object
 1   VSL_TYPE  108 non-null    object
 2   TC_TYPE   108 non-null    object
 3   IND_12    108 non-null    int64 
 4   IND_11    108 non-null    int64 
 5   IND_10    108 non-null    int64 
 6   IND_9     108 non-null    int64 
 7   IND_8     108 non-null    int64 
 8   IND_7     108 non-null    int64 
 9   IND_6     108 non-null    int64 
 10  IND_5     108 non-null    int64 
 11  IND_4     108 non-null    int64 
 12  IND_3     108 non-null    int64 
 13  IND_2     108 non-null    int64 
 14  IND_1     108 non-null    int64 
 15  IND       108 non-null    int64 
 16  M12       108 non-null    int64 
 17  M_11      108 non-null    int64 
 18  M_10      108 non-null    int64 
 19  M_9       108 non-null    int64 
 20  M_8       108 non-null    int64 
 21  M_7     

Unnamed: 0_level_0,CB_TYPE,VSL_TYPE,TC_TYPE,IND_12,IND_11,IND_10,IND_9,IND_8,IND_7,IND_6,...,M_9,M_8,M_7,M_6,M_5,M_4,M_3,M_2,M_1,DAY_CB
YEAR_MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201501,TR,Cape,OWNED,2171,1674,2766,1887,1508,1768,1451,...,20608,19958,19800,20019,18913,19784,20338,20987,21323,19858
201502,TR,Cape,OWNED,1674,2766,1887,1508,1768,1451,1854,...,19958,19800,20019,18913,19784,20338,20987,21323,19858,17825
201503,TR,Cape,OWNED,2766,1887,1508,1768,1451,1854,2306,...,19800,20019,18913,19784,20338,20987,21323,19858,17825,18370
201504,TR,Cape,OWNED,1887,1508,1768,1451,1854,2306,2085,...,20019,18913,19784,20338,20987,21323,19858,17825,18370,19303
201505,TR,Cape,OWNED,1508,1768,1451,1854,2306,2085,3149,...,18913,19784,20338,20987,21323,19858,17825,18370,19303,19964


### 1-1) INDEX (IND ~ IND_12) -> 숫자로 변환할 필요없음. (-) 부호를 _ 로 바꿔서 오류

In [46]:
#data = data.astype({'IND_1':'int'})
#data = data.astype({'IND_1':'float64'})
#data['IND_1i']=pd.to_numeric(data['IND_1'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108 entries, 201501 to 202312
Data columns (total 29 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   CB_TYPE   108 non-null    object
 1   VSL_TYPE  108 non-null    object
 2   TC_TYPE   108 non-null    object
 3   IND_12    108 non-null    int64 
 4   IND_11    108 non-null    int64 
 5   IND_10    108 non-null    int64 
 6   IND_9     108 non-null    int64 
 7   IND_8     108 non-null    int64 
 8   IND_7     108 non-null    int64 
 9   IND_6     108 non-null    int64 
 10  IND_5     108 non-null    int64 
 11  IND_4     108 non-null    int64 
 12  IND_3     108 non-null    int64 
 13  IND_2     108 non-null    int64 
 14  IND_1     108 non-null    int64 
 15  IND       108 non-null    int64 
 16  M12       108 non-null    int64 
 17  M_11      108 non-null    int64 
 18  M_10      108 non-null    int64 
 19  M_9       108 non-null    int64 
 20  M_8       108 non-null    int64 
 21  M_7     

In [47]:
data.isnull().sum()

CB_TYPE     0
VSL_TYPE    0
TC_TYPE     0
IND_12      0
IND_11      0
IND_10      0
IND_9       0
IND_8       0
IND_7       0
IND_6       0
IND_5       0
IND_4       0
IND_3       0
IND_2       0
IND_1       0
IND         0
M12         0
M_11        0
M_10        0
M_9         0
M_8         0
M_7         0
M_6         0
M_5         0
M_4         0
M_3         0
M_2         0
M_1         0
DAY_CB      0
dtype: int64

In [48]:
data[:3]

Unnamed: 0_level_0,CB_TYPE,VSL_TYPE,TC_TYPE,IND_12,IND_11,IND_10,IND_9,IND_8,IND_7,IND_6,...,M_9,M_8,M_7,M_6,M_5,M_4,M_3,M_2,M_1,DAY_CB
YEAR_MONTH,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201501,TR,Cape,OWNED,2171,1674,2766,1887,1508,1768,1451,...,20608,19958,19800,20019,18913,19784,20338,20987,21323,19858
201502,TR,Cape,OWNED,1674,2766,1887,1508,1768,1451,1854,...,19958,19800,20019,18913,19784,20338,20987,21323,19858,17825
201503,TR,Cape,OWNED,2766,1887,1508,1768,1451,1854,2306,...,19800,20019,18913,19784,20338,20987,21323,19858,17825,18370


#
# 6. 모델 테스트

### * 훈련 데이터/테스트 데이터를 통한 시각화 Function

In [49]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def eval_on_features(features, target, n_train, model):

    X = data.index.astype("int64").values.reshape(-1, 1)
    xticks = pd.date_range(start=data.index.min(), end=data.index.max(), freq='M')
    #print(xticks)

    # 년도와 월을 추출하여 xticks_name을 생성합니다.
    xticks_name = [date.strftime("%Y-%m") for date in xticks]
    #month = ["Jan", "Feb", "Mar","Apr", "May", "Jun", "Jul", "Aug", 'Oct', 'Sep', 'Nov', 'Dec']
    #xticks_name = [month[int(m)]+d for m, d in zip(xticks.strftime("%m"),xticks.strftime(" %m-%d"))]
    #plt.xticks(xticks, xticks_name, rotation=90, ha='left')
    #print(xticks_name)

    X_train, X_test = features[:n_train], features[n_train:]
    y_train, y_test = target[:n_train], target[n_train:]
    #X_train, X_test, y_train, y_test = train_test_split(features, target)

    model.fit(X_train, y_train)
    print('Train : {:.2f}'.format(model.score(X_train, y_train)))
    print('Test : {:.2f}'.format(model.score(X_test, y_test)))

    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    plt.figure(figsize=(12, 5))

    #plt.xticks(range(0, len(xticks_name), 8), xticks_name, rotation=90, ha="left")

    plt.plot(range(n_train), y_train, label="Train")
    plt.plot(range(n_train, len(y_test) + n_train), y_test, '-', label="Test")
    plt.plot(range(n_train), y_pred_train, '--', label="Train Pred")
    plt.plot(range(n_train, len(y_test) + n_train), y_pred, '--', label="Test Pred")

    plt.legend(loc='best')
    plt.title(model)
    plt.xlabel("Date")
    plt.ylabel("Daily C/B")

    return y_train, y_test, y_pred

In [50]:
X_data = data.drop(['DAY_CB','CB_TYPE','VSL_TYPE','TC_TYPE'], axis=1)
y_data = data.DAY_CB

split_size = int(len(X_data)*0.75) #전체 108row 중 70%로 분할.

X_train, X_test = X_data[:split_size], X_data[split_size:]
y_train, y_test = y_data[:split_size], y_data[split_size:]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

X_train[:3], type(X_train)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81 entries, 201501 to 202109
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   IND_12  81 non-null     int64
 1   IND_11  81 non-null     int64
 2   IND_10  81 non-null     int64
 3   IND_9   81 non-null     int64
 4   IND_8   81 non-null     int64
 5   IND_7   81 non-null     int64
 6   IND_6   81 non-null     int64
 7   IND_5   81 non-null     int64
 8   IND_4   81 non-null     int64
 9   IND_3   81 non-null     int64
 10  IND_2   81 non-null     int64
 11  IND_1   81 non-null     int64
 12  IND     81 non-null     int64
 13  M12     81 non-null     int64
 14  M_11    81 non-null     int64
 15  M_10    81 non-null     int64
 16  M_9     81 non-null     int64
 17  M_8     81 non-null     int64
 18  M_7     81 non-null     int64
 19  M_6     81 non-null     int64
 20  M_5     81 non-null     int64
 21  M_4     81 non-null     int64
 22  M_3     81 non-null     int64
 23  M_2     

### 6-1) RandomForest
- 훈련 세트 : 2015/01 ~ 2021/12, 테스트 세트 : 2022/01 ~ 2023/07

In [51]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('Train : ', rf.score(X_train, y_train)), print('Test : ', rf.score(X_test, y_test))

Train :  0.97001629092982
Test :  0.393149239364336


(None, None)

In [52]:
show_reg_result(y_test, y_pred)

Max Error:  18776.5
Mean Absolute Error:  4118.263
Root Mean Squared Error:  6945.538
R-Squared:  0.393


### 6-2) LSTM

In [53]:
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [54]:
# 입력 데이터를 3차원으로 변환
def create_sequences(data, look_back=12):
    X, y = [], []
    for i in range(len(data) - look_back):
        X.append(data.iloc[i:i+look_back, :-1].values)
        y.append(data.iloc[i+look_back, -1])
    #for i in range(len(X_data) - look_back):
    #    X.append(X_data.iloc[i:i+look_back, :-1].values)
    #    y.append(X_data.iloc[i+look_back, -1])
    return np.array(X), np.array(y)


In [55]:
X[:3]

array([[[0.        , 0.30021142, 0.23840318, 0.37420719, 0.26489243,
         0.21775899, 0.25009327, 0.21067031, 0.26078846, 0.31700037,
         0.28951623, 0.42183808, 0.15097625, 0.1081955 , 0.30871448,
         0.29638059, 0.32281037, 0.3824242 , 0.33470377, 0.32310403,
         0.32917706, 0.25037406, 0.3124332 , 0.35190595, 0.39814749],
        [0.        , 0.23840318, 0.37420719, 0.26489243, 0.21775899,
         0.25009327, 0.21067031, 0.26078846, 0.31700037, 0.28951623,
         0.42183808, 0.15097625, 0.1081955 , 0.10633006, 0.29638059,
         0.32281037, 0.3824242 , 0.33470377, 0.32310403, 0.32917706,
         0.25037406, 0.3124332 , 0.35190595, 0.39814749, 0.42208764],
        [0.        , 0.37420719, 0.26489243, 0.21775899, 0.25009327,
         0.21067031, 0.26078846, 0.31700037, 0.28951623, 0.42183808,
         0.15097625, 0.1081955 , 0.10633006, 0.08481532, 0.32281037,
         0.3824242 , 0.33470377, 0.32310403, 0.32917706, 0.25037406,
         0.3124332 , 0.35190595,

In [56]:

look_back = 12
X, y = create_sequences(X_data, look_back)

scaler = MinMaxScaler()
X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])  # 3차원 배열을 2차원으로 변환
X = scaler.fit_transform(X)
y = scaler.fit_transform(y.reshape(-1, 1))

# 다시 LSTM 입력 형태로 변환
X = X.reshape(X.shape[0], look_back, X.shape[1] // look_back)

In [60]:
#train_size = int(len(X) * 0.8)
train_size = 108
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

model = Sequential()
model.add(LSTM(100, input_shape=(look_back, X_train.shape[2]), return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(X_train, y_train, epochs=200, batch_size=32, verbose=1)

train_loss = model.evaluate(X_train, y_train, verbose=0)
test_loss = model.evaluate(X_test, y_test, verbose=0)
#print(f'Train Loss: {train_loss:.4f}')
#print(f'Test Loss: {test_loss:.4f}')

print(f'Train Loss: {train_loss:}')
print(f'Train Loss: {test_loss:}')

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

y_train_pred = np.exp(scaler.inverse_transform(y_train_pred))
y_test_pred  = np.exp(scaler.inverse_transform(y_test_pred))
y_train_true = np.exp(scaler.inverse_transform(y_train))
y_test_true  = np.exp(scaler.inverse_transform(y_test))

plt.figure(figsize=(12, 6))
plt.title('LSTM')
plt.plot(data.index[-len(y_train_pred):], y_train_true, label='y_train')
plt.plot(data.index[-len(y_test_pred):], y_test_true, label='y_test')
plt.plot(data.index[-len(y_train_pred):], y_train_pred, label='y_train_pred', linestyle='--')
plt.plot(data.index[-len(y_test_pred):], y_test_pred, label='y_train_test', linestyle='--')
plt.legend()
plt.show()

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

ValueError: ignored

In [None]:
show_reg_result(y_test_true,y_test_pred)