In [1]:
# ===MODULES===
from mlxtend.plotting import scatterplotmatrix
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
# ===DATA===

# Directory that holds the competition CSV files. Keeping it in one constant means
# the notebook can be pointed at another copy of the dataset by editing a single line.
DATA_DIR = '/content/drive/MyDrive/gh/kaggle/dacon/bicycle/dataset'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Show the first rows of the training frame as the cell output.
train.head()

Unnamed: 0,date_time,wind_direction,sky_condition,precipitation_form,wind_speed,humidity,low_temp,high_temp,Precipitation_Probability,number_of_rentals
0,2018-04-01,207.5,4.0,0.0,3.05,75.0,12.6,21.0,30.0,22994
1,2018-04-02,208.317,2.95,0.0,3.278,69.833,12.812,19.0,19.5,28139
2,2018-04-03,213.516,2.911,0.0,2.69,74.879,10.312,15.316,19.113,26817
3,2018-04-04,143.836,3.692,0.425,3.138,71.849,8.312,12.368,43.493,26034
4,2018-04-05,95.905,4.0,0.723,3.186,73.784,5.875,10.421,63.378,2833


In [None]:
# Subset of training columns (weather measurements plus the rental count).
# NOTE(review): not referenced by any other visible cell -- presumably left over
# from an exploratory plot; confirm before relying on it.
cols = [
    'sky_condition',
    'precipitation_form',
    'wind_speed',
    'humidity',
    'low_temp',
    'high_temp',
    'number_of_rentals',
]

In [3]:
# ===UTILS===

# 1. seperate datetime
def seperate_datetime(dataframe):
    """Split the ``date_time`` strings of ``dataframe`` into numeric parts.

    Every value is split on '-' (e.g. '2016-04-01' -> ['2016', '04', '01']) and
    must yield exactly three pieces, otherwise a ValueError is raised.

    Args:
        dataframe: Object exposing a ``date_time`` attribute that iterates over
            'YYYY-MM-DD' style strings.

    Returns:
        A tuple ``(year, month, day)`` of three lists with one int per row:
        the year minus 2017, the month minus 3, and the day as-is.
    """
    triples = [
        (int(y_txt) - 2017, int(m_txt) - 3, int(d_txt))
        for y_txt, m_txt, d_txt in (stamp.split('-') for stamp in dataframe.date_time)
    ]
    year = [parts[0] for parts in triples]
    month = [parts[1] for parts in triples]
    day = [parts[2] for parts in triples]
    return year, month, day


# 2. Calculate NMAE
def NMAE(true, pred):
    """Normalized mean absolute error: ``mean(|true - pred| / true)``.

    Both inputs are converted to NumPy float arrays first, so plain lists are
    accepted and pandas objects are compared position by position instead of
    being aligned on their index labels (label alignment would turn rows with
    non-matching labels into NaN).

    Args:
        true: Ground-truth values (array-like).
        pred: Predicted values (array-like, same length as ``true``).

    Returns:
        The score as a float. Positions where ``true`` is zero divide by zero
        and produce inf/nan, as with ordinary NumPy division.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    score = np.mean(np.abs(true - pred) / true)
    return score



In [54]:
# Derive the numeric date parts from the training frame's date_time strings and
# attach them to `train` as new columns.
year, month, day = seperate_datetime(train)
train['year'], train['month'], train['day'] = year, month, day

# Feature matrix: every column except the raw date string and the target.
X = train.drop(columns=['date_time', 'number_of_rentals'])
# Target vector: the number_of_rentals column.
y = train['number_of_rentals']

In [55]:
# Independent copy of the feature matrix, so columns added to X_combination
# afterwards do not show up on X.
X_combination = X.copy(deep=True)

In [56]:
# ===FEATURE ENGINEERING===

# Add the day of the week: take the day name of each training date and
# label-encode it to an integer code. `le` stays fitted on the training day names.
week_day = pd.to_datetime(train['date_time']).dt.day_name()
le = LabelEncoder().fit(week_day)
X_combination['week_day'] = le.transform(week_day)

# Disabled feature experiments, kept for reference:

# # 1. Signal for days with a very large daily temperature range
# X_combination['temp_diff_info'] = X_combination['high_temp'] - X_combination['low_temp']

# # 2. Signal for hot and humid weather
# X_combination['sweat_info'] = X_combination['high_temp'] * X_combination['humidity']

# # 3. Signal for cold and windy weather
# X_combination['cold_info'] = X_combination['low_temp'] * X_combination['wind_speed']

In [57]:
# Training matrix actually fed to the models: the engineered features minus
# three weather columns.
X_2 = X_combination.drop(
    columns=['wind_direction', 'Precipitation_Probability', 'humidity']
)

In [43]:
!pip install lightgbm
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 38 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.3


In [58]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# 모델 선언
bc_RF = RandomForestRegressor()
bc_Cat = CatBoostRegressor()
bc_XGB = XGBRegressor()
bc_LGBM = LGBMRegressor()
bc_GBM = GradientBoostingRegressor(n_estimators=4000,alpha=0.01);

# 보팅 생성
vt = VotingRegressor([('bc_Cat', bc_Cat), ('bc_XGB', bc_XGB), ('bc_LGBM', bc_LGBM),('bc_RF', bc_RF),('bc_GBM', bc_GBM)])
# 훈련
vt.fit(X_2, y) # 데이콘 제출용


# 예측
y_hat = vt.predict(X_2)

score = NMAE(y, y_hat)

print(f'모델 NMAE: {score}')

In [60]:
# ===TEST===

# Split the test dates into numeric parts.
year, month, day = seperate_datetime(test)

# Attach them to `test` as new columns.
test['year'] = year
test['month'] = month
test['day'] = day

# Drop the raw date string.
test_X = test.drop('date_time', axis=1)

# Add the day of the week, encoded with `le`, the encoder already fitted on the
# training day names. Re-fitting on the test set could map a weekday to a
# different integer than the one the model was trained with whenever the test dates
# do not cover the same set of weekdays; reusing `le` keeps the codes identical
# and raises ValueError on a day name that was never seen in training.
week_day = pd.to_datetime(test['date_time']).dt.day_name()
test_X['week_day'] = le.transform(week_day)

# Disabled feature experiments, kept for reference:

# # 1. Signal for days with a very large daily temperature range
# test_X['temp_diff_info'] = test_X['high_temp'] - test_X['low_temp']

# # 2. Signal for hot and humid weather
# test_X['sweat_info'] = test_X['high_temp'] * test_X['humidity']

# # 3. Signal for cold and windy weather
# test_X['cold_info'] = test_X['low_temp'] * test_X['wind_speed']

In [61]:
# Remove the same three weather columns that were dropped from the training matrix.
test_X = test_X.drop(
    columns=['wind_direction', 'Precipitation_Probability', 'humidity']
)


In [62]:
# Predict with the ensemble trained on the training data. The test features are
# selected in the column order of the training matrix X_2, so the model always
# receives the features in the layout it was fitted on; a feature missing from
# test_X raises KeyError here.
test_yhat = vt.predict(test_X[X_2.columns])

# Fill the submission frame with the predictions.
sample_submission['number_of_rentals'] = test_yhat

# Write the submission file (without the index column).
sample_submission.to_csv('/content/drive/MyDrive/gh/kaggle/dacon/bicycle/dataset/ensemble2.csv', index= False)

In [None]:
# 1.다중회귀 앙상블
# 2.RandomizedSearchCV 적용
# 3.아래 계산해서 칼럼 추가
# weekend 토요일 일요일이 아니고 날씨도 좋았지만 대여량이 낮은날.
# discomfort index 불쾌지수
# sensible temperature 체감온도
# 4.wb,optuna 적용 가능 탐색
# 유튜브 데이터 홀릭



# 1. 정규화 표준화
# 2. 요일컬럼 추가
# 3. 불쾌지수
# 4. 체감온도
# 5. 논문탐색
# 6. 앙상블 구성
# 7. 옵튜나 적용
# 8. 일교차
# 9. 컬럼 중요도


# # 1. "일교차가 너무 큰 날씨"를 알기 위한 정보
# X_combination['temp_diff_info'] = X_combination['high_temp'] - X_combination['low_temp']

# # 2. "덥고 습한 날씨"를 알기 위한 정보
# X_combination['sweat_info'] = X_combination['high_temp'] * X_combination['humidity'] 

# # 3. "춥고 바람부는 날씨"를 알기 위한 정보
# X_combination['cold_info'] = X_combination['low_temp'] * X_combination['wind_speed'] 