In [None]:
!pip install lightgbm
!pip install catboost

In [151]:
# ===MODULES===
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from mlxtend.plotting import scatterplotmatrix

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.metrics import make_scorer

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import GradientBoostingRegressor,VotingRegressor, RandomForestRegressor

In [152]:
# ===DATA===

train = pd.read_csv('/content/drive/MyDrive/gh/kaggle/dacon/bicycle/dataset/train.csv')
test = pd.read_csv('/content/drive/MyDrive/gh/kaggle/dacon/bicycle/dataset/test.csv')
sample_submission = pd.read_csv('/content/drive/MyDrive/gh/kaggle/dacon/bicycle/dataset/sample_submission.csv')
train.head()

Unnamed: 0,date_time,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,number_of_rentals
0,2018-04-01,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,22994
1,2018-04-02,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,28139
2,2018-04-03,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,26817
3,2018-04-04,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,26034
4,2018-04-05,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,2833


In [153]:
# ===UTILS===

# 1. seperate datetime
def seperate_datetime(dataframe):
    year = []
    month = []
    day = []

    for date in dataframe.date_time:
        year_point, month_point, day_point = date.split('-') # - 기준으로 string을 나누고 list로 만듦 ex) '2016-04-01' -> ['2016', '04', '01']
        year.append(int(year_point)-2017)
        month.append(int(month_point)-3)
        day.append(int(day_point))
    return year, month, day


# 2. Calculate NMAE
def NMAE(true, pred):
    return np.mean(np.abs(true-pred) / true)



In [154]:
year, month, day = seperate_datetime(train)

train['year'] = year
train['month'] = month
train['day'] = day

X = train.drop(['date_time', 'number_of_rentals'], axis=1)
y = train.number_of_rentals

In [155]:
X_combination = X.copy()

In [156]:
# ===FEATURE ENGINEERING===

# 요일 정보 추가
week_day = pd.to_datetime(train['date_time']).dt.day_name()
le = LabelEncoder()
le.fit(week_day)
X_combination['week_day'] = le.transform(week_day)

# 1. "일교차가 너무 큰 날씨"를 알기 위한 정보
X_combination['temp_diff_info'] = X_combination['high_temp'] - X_combination['low_temp']

# 2. "덥고 습한 날씨"를 알기 위한 정보
X_combination['sweat_info'] = X_combination['high_temp'] * X_combination['humidity'] 

# 3. "춥고 바람부는 날씨"를 알기 위한 정보
X_combination['cold_info'] = X_combination['low_temp'] * X_combination['wind_speed']


In [157]:
X_combination = X_combination.drop(['wind_direction', 'Precipitation_Probability','humidity'], axis=1)
col_list = X_combination.columns
# 이중 for문을 사용하여 변수 자기 자신의 제곱과 두 변수간의 곱이라는 새로운 변수를 추가합니다.
for i in range(len(col_list)):
    for j in range(i, len(col_list)):
        X_combination[f'{col_list[i]}*{col_list[j]}'] = X_combination[col_list[i]] * X_combination[col_list[j]]

X_2=X_combination

In [158]:
# # train-val split
# X_train, X_val, y_train, y_val = train_test_split(X_2, 
#                                                   y, 
#                                                   test_size=0.25, 
#                                                   random_state=42,
#                                                   shuffle=False)

In [159]:
# 모델 선언
bc_RF = RandomForestRegressor()
bc_Cat = CatBoostRegressor()
bc_XGB = XGBRegressor()
bc_LGBM = LGBMRegressor()
bc_GBM = GradientBoostingRegressor(n_estimators=4000,alpha=0.01)
bc_Ridge = Ridge()
bc_Lasso = Lasso()

# 보팅 생성
vt = VotingRegressor([
                      ('bc_Cat', bc_Cat), 
                      ('bc_XGB', bc_XGB), 
                      ('bc_LGBM', bc_LGBM),
                      ('bc_RF', bc_RF),
                      ('bc_GBM', bc_GBM),
                      ('bc_Ridge', bc_Ridge),
                      ('bc_Lasso', bc_Lasso)
                      ])

# 훈련 + cross_val_score 검증

scores = cross_validate(estimator=vt,
                        X=X_2,
                        y=y,
                        scoring=make_scorer(NMAE),
                        cv=10,
                        n_jobs=-1)

print(scores['test_score'])

[0.37530979 0.65374004 0.10363103 0.19389011 0.08709973 0.19294367
 0.12696399 0.11218676 0.36763133 0.13173211]


In [160]:
vt.fit(X_2, y)

Learning rate set to 0.03335
0:	learn: 27088.3002345	total: 13.6ms	remaining: 13.6s
1:	learn: 26490.0193599	total: 22.5ms	remaining: 11.2s
2:	learn: 25964.1448350	total: 31.3ms	remaining: 10.4s
3:	learn: 25447.5032956	total: 39.9ms	remaining: 9.95s
4:	learn: 24965.7118513	total: 48.4ms	remaining: 9.63s
5:	learn: 24471.6422120	total: 57.6ms	remaining: 9.54s
6:	learn: 23960.9896933	total: 66.5ms	remaining: 9.43s
7:	learn: 23484.5886947	total: 75.8ms	remaining: 9.4s
8:	learn: 23008.1661516	total: 84.4ms	remaining: 9.29s
9:	learn: 22556.1018663	total: 93.5ms	remaining: 9.26s
10:	learn: 22082.2570302	total: 102ms	remaining: 9.18s
11:	learn: 21673.7118189	total: 111ms	remaining: 9.14s
12:	learn: 21329.8478341	total: 123ms	remaining: 9.34s
13:	learn: 20904.0930722	total: 132ms	remaining: 9.31s
14:	learn: 20517.0361619	total: 141ms	remaining: 9.28s
15:	learn: 20097.8931897	total: 150ms	remaining: 9.23s
16:	learn: 19763.5205155	total: 159ms	remaining: 9.19s
17:	learn: 19361.7134931	total: 168ms

  positive)


VotingRegressor(estimators=[('bc_Cat',
                             <catboost.core.CatBoostRegressor object at 0x7f2513fc6d50>),
                            ('bc_XGB',
                             XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthrea...
                                                       warm_start=False)),
                            ('bc_Ridge',
                             Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
         

In [161]:
# 예측
y_hat = vt.predict(X_2)

score = NMAE(y, y_hat)

print(f'NMAE: {score}')

NMAE: 0.05921272072044324


In [166]:
# ===TEST===

year, month, day = seperate_datetime(test) #날짜 숫자로 분리 후 

#새로운 컬럼을 생성
test['year'] = year 
test['month'] = month
test['day'] = day

# date_time은 제거합니다.
test_X = test.drop('date_time', axis = 1)


# 요일 정보 추가

week_day = pd.to_datetime(test['date_time']).dt.day_name()
le = LabelEncoder()
le.fit(week_day)
test_X['week_day'] = le.transform(week_day)

# 1. "일교차가 너무 큰 날씨"를 알기 위한 정보
test_X['temp_diff_info'] = test_X['high_temp'] - test_X['low_temp']

# 2. "덥고 습한 날씨"를 알기 위한 정보
test_X['sweat_info'] = test_X['high_temp'] * test_X['humidity'] 

# 3. "춥고 바람부는 날씨"를 알기 위한 정보
test_X['cold_info'] = test_X['low_temp'] * test_X['wind_speed'] 

In [167]:
test_X = test_X.drop(['wind_direction', 'Precipitation_Probability','humidity'], axis=1)


In [168]:
col_list = test_X.columns

for i in range(len(col_list)):
    for j in range(i, len(col_list)):
        test_X[f'{col_list[i]}*{col_list[j]}'] = test_X[col_list[i]] * test_X[col_list[j]]

In [169]:
# train data로 학습시킨 모델에 test_X데이터를 넣고 예측합니다.
test_yhat = vt.predict(test_X)

# submission dataFrame 완성
sample_submission['number_of_rentals'] = test_yhat

# 제출 파일 생성
sample_submission.to_csv('/content/drive/MyDrive/gh/kaggle/dacon/bicycle/dataset/ridgelasso2.csv', index= False)

In [None]:
# 1.다중회귀 앙상블
# 2.randomserch cv 적용
# 3.아래 계산해서 칼럼 추가
# weekend 토요일 일요일이 아니고 날씨도 좋았지만 대여량이 낮은날.
# discomfort index 불쾌지수
# sensible temperature 체감온도
# 4.wb,optuna 적용 가능 탐색
# 유튜브 데이터 홀릭



# 1. 정규화 표준화
# 2. 요일컬럼 추가
# 3. 불쾌지수
# 4. 체감온도
# 5. 논문탐색
# 6. 앙상블 구성
# 7. 옵튜나 적용
# 8. 일교차
# 9. 컬럼 중요도


# # 1. "일교차가 너무 큰 날씨"를 알기 위한 정보
# X_combination['temp_diff_info'] = X_combination['high_temp'] - X_combination['low_temp']

# # 2. "덥고 습한 날씨"를 알기 위한 정보
# X_combination['sweat_info'] = X_combination['high_temp'] * X_combination['humidity'] 

# # 3. "춥고 바람부는 날씨"를 알기 위한 정보
# X_combination['cold_info'] = X_combination['low_temp'] * X_combination['wind_speed'] 