## 필수 라이브러리 import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 데이터 불러오기

In [2]:
# Directory that holds the competition CSV files.
PATH = '/kaggle/input/bike-sharing-demand/'

# Read the three CSVs in one pass: training set, test set, and the
# sample submission whose layout is reused for the final output.
train, test, submission = (
    pd.read_csv(f'{PATH}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sampleSubmission.csv')
)

## 데이터 전처리

### 이상치 제거

In [3]:
# Rows with weather == 4 that recorded any demand are extremely rare, so
# they are treated as outliers and dropped from the training set.
train = train.loc[train['weather'].ne(4)]

### 데이터 합치기
- 훈련 데이터와 테스트 데이터에 같은 피처 엔지니어링을 적용하기 위해 합쳐줌

In [4]:
# Stack train on top of test; ignore_index=True renumbers the rows 0..n-1
# instead of keeping each frame's original index.
all_data = pd.concat(objs=(train, test), axis=0, ignore_index=True)

In [5]:
# Columns that exist in only one of the frames are filled with NaN by default
all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


### 파생 피처(변수) 추가

In [6]:
from datetime import datetime
import calendar

# Derive calendar features from the 'datetime' column.
# day, minute and second are deliberately not extracted (judged not useful).
#
# The timestamps are parsed once and read through the vectorised ``.dt``
# accessor, so the new columns are integers rather than string fragments
# such as '2011' / '01' / '00' that later steps would have to coerce.
_parsed = pd.to_datetime(all_data['datetime'], format='%Y-%m-%d %H:%M:%S')

all_data['year'] = _parsed.dt.year        # year feature
all_data['month'] = _parsed.dt.month      # month feature
all_data['hour'] = _parsed.dt.hour        # hour-of-day feature
all_data['weekday'] = _parsed.dt.weekday  # day of week, Monday=0 ... Sunday=6

# The parsed series is only a scratch value; do not leave it in the namespace.
del _parsed

In [7]:
# Preview the first rows, now including the year/month/hour/weekday columns
all_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,hour,weekday
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3.0,13.0,16.0,2011,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8.0,32.0,40.0,2011,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5.0,27.0,32.0,2011,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3.0,10.0,13.0,2011,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0.0,1.0,1.0,2011,1,4,5


### 필요 없는 피처 제거 (피처 선택)

In [8]:
# Remove the features that are not used for modelling:
# casual, registered, datetime, windspeed and month (season stands in for month).
all_data = all_data.drop(
    columns=['casual', 'registered', 'datetime', 'windspeed', 'month']
)

In [9]:
# Preview the first rows after dropping the unused columns
all_data.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,count,year,hour,weekday
0,1,0,0,1,9.84,14.395,81,16.0,2011,0,5
1,1,0,0,1,9.02,13.635,80,40.0,2011,1,5
2,1,0,0,1,9.02,13.635,80,32.0,2011,2,5
3,1,0,0,1,9.84,14.395,75,13.0,2011,3,5
4,1,0,0,1,9.84,14.395,75,1.0,2011,4,5


### 다시 데이터 나누기 (훈련 / 테스트)

In [10]:
# Rows with a known 'count' are training rows; rows where 'count' is NaN
# are the test rows.
X_train = all_data.loc[all_data['count'].notna()]
X_test = all_data.loc[all_data['count'].isna()]

In [11]:
# Separate the target: the feature matrices must not contain 'count'.
X_train = X_train.drop(columns=['count'])
X_test = X_test.drop(columns=['count'])

# The target values are taken from the training frame.
y = train.loc[:, 'count']

In [12]:
# Print the (rows, columns) shape of the train and test feature matrices
print(X_train.shape, X_test.shape)

(10885, 10) (6493, 10)


## 평가지표 계산 함수 작성

### RMSLE
$$\sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 }$$

In [13]:
def rmsle(y_true, y_pred, convertExp=True):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log(pred + 1) - log(true + 1)) ** 2))``.

    Args:
        y_true: Array-like of reference values (list, ndarray, Series, ...).
        y_pred: Array-like of predicted values, same length as ``y_true``.
        convertExp: When True, both inputs are assumed to be natural-log
            values and are passed through ``exp`` before the error is
            computed (the model is trained on ``log(count)``).

    Returns:
        The RMSLE as a NumPy float.
    """
    # Coerce up front so plain Python lists work on every code path.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Undo the log transform of the target when requested.
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log1p(x) == log(x + 1) but stays accurate for very small x.
    # NaNs produced by the log (e.g. for values below -1) are replaced by 0.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    # Root of the mean squared difference in log space.
    return np.sqrt(np.mean((log_true - log_pred) ** 2))

## Baseline Model

### sklearn LinearRegression

In [14]:
from sklearn.linear_model import LinearRegression

# Baseline model: scikit-learn LinearRegression with its default settings
linear_reg_model = LinearRegression()

In [15]:
# NOTE(review): np.log maps a count of 0 to -inf; presumably every training
# count is >= 1 — confirm against the data.
log_y = np.log(y) # log-transform the target
linear_reg_model.fit(X_train, log_y) # train the model on the log target

LinearRegression()

In [16]:
# Predict on the training features; the model was fit on log(count), so
# these predictions are in log space
preds = linear_reg_model.predict(X_train)

In [17]:
# preds were computed on X_train, so this is the training-set RMSLE, not a
# held-out score. Both arguments are in log space, hence convertExp=True.
print(f'선형 회귀의 RMSLE 값: {rmsle(log_y, preds, True)}')

선형 회귀의 RMSLE 값: 1.0204980189305008


### 예측 및 결과 제출

In [18]:
# Predict on the test features (output is in log space)
linearreg_preds = linear_reg_model.predict(X_test)

# Undo the log transform with exp, then write the submission without the index
submission['count'] = np.exp(linearreg_preds)
submission.to_csv('submission.csv', index=False)

In [19]:
# Preview the first rows of the submission frame
submission.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,11.362398
1,2011-01-20 01:00:00,13.601789
2,2011-01-20 02:00:00,15.084417
3,2011-01-20 03:00:00,16.308758
4,2011-01-20 04:00:00,18.086453
