In [1]:
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [34]:
data_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sub_df = pd.read_csv('sample_submission.csv')

In [35]:
data_df

Unnamed: 0,date,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental
0,2018-01-01,,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950
1,2018-01-02,,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136
2,2018-01-03,,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156
3,2018-01-04,,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102
4,2018-01-05,,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2020-12-27,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,37103
1091,2020-12-28,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,46912
1092,2020-12-29,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,35747
1093,2020-12-30,,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,22488


In [36]:
# 강수량 결측치는 0으로 채운다. 
def fill_nan(data):
    data['precipitation'] = data['precipitation'].fillna(0)
    data = data.fillna(method='bfill')
    return data
data_df = fill_nan(data_df)

In [37]:
# date time 나누기
def seperate_datetime(dataframe):
    week_list = []
    year = []
    month = []
    for date in dataframe.date:
        year_point, month_point, day_point = date.split('-')
        year.append(int(year_point))
        month.append(int(month_point))
    dataframe['year'] = year
    dataframe['month'] = month
    for day in dataframe['date']:
        num = pd.date_range(day, day, freq='D').to_series()
        week_list.append(int(num.dt.dayofweek[0]))
    dataframe['day'] = week_list
    dataframe = dataframe.drop(['date'], axis=1)
    return dataframe

In [38]:
data_df = seperate_datetime(data_df)
data_df

Unnamed: 0,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,wind_max,rental,year,month,day
0,0.0,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,3.8,4950,2018,1,0
1,0.0,-1.8,1.8,-4.3,36.0,22.0,42.0,7.9,82.3,1.8,4.9,7136,2018,1,1
2,0.0,-4.7,-0.4,-7.1,31.0,19.0,42.3,8.6,88.7,2.2,3.5,7156,2018,1,2
3,0.0,-4.7,-0.7,-8.7,39.0,24.0,43.0,6.2,63.9,1.4,3.5,7102,2018,1,3
4,0.0,-3.0,1.6,-5.6,51.0,35.0,48.4,8.2,84.5,1.7,3.6,7705,2018,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,0.0,5.8,10.0,1.4,70.0,42.0,62.9,5.9,61.5,1.8,2.8,37103,2020,12,6
1091,1.3,6.7,11.4,4.2,66.0,44.0,72.1,8.0,83.3,1.4,3.1,46912,2020,12,0
1092,0.2,0.1,4.3,-6.2,69.0,46.0,70.8,0.0,0.0,2.9,6.1,35747,2020,12,1
1093,0.0,-10.9,-6.2,-12.9,39.0,15.0,55.5,8.3,86.5,4.1,6.2,22488,2020,12,2


In [39]:
# 요일 데이터 부여
def weekday_onehotcode(dataframe):
    mon = dataframe[dataframe['day'] == 0]
    mon['day'] = 'mon'
    tue = dataframe[dataframe['day'] == 1]
    tue['day'] = 'tue'
    wed = dataframe[dataframe['day'] == 2]
    wed['day'] = 'wed'
    thu = dataframe[dataframe['day'] == 3]
    thu['day'] = 'thu'
    fri = dataframe[dataframe['day'] == 4]
    fri['day'] = 'fri'
    sat = dataframe[dataframe['day'] == 5]
    sat['day'] = 'sat'
    sun = dataframe[dataframe['day'] == 6]
    sun['day'] = 'sun'
    dataframe = pd.concat([mon, tue, wed, thu, fri, sat, sun], axis=0)
    dataframe = pd.get_dummies(dataframe)
    return dataframe

In [40]:
# # 요일 데이터 부여
# def weekday_onehotcode(dataframe):
#     sun = dataframe[dataframe['day'] == 6]
#     sun['day'] = 'sun'
#     dataframe = pd.concat([sun], axis=0)
#     dataframe = pd.get_dummies(dataframe)
#     return dataframe

In [41]:
data_df = weekday_onehotcode(data_df)
data_df

Unnamed: 0,precipitation,temp_mean,temp_highest,temp_lowest,PM10,PM2.5,humidity,sunshine_sum,sunshine_rate,wind_mean,...,rental,year,month,day_fri,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,0.0,-1.3,3.8,-5.1,34.0,17.0,39.1,8.3,86.5,1.4,...,4950,2018,1,0,1,0,0,0,0,0
7,0.9,1.3,4.0,-1.2,49.0,37.0,51.9,0.5,5.2,1.8,...,6309,2018,1,0,1,0,0,0,0,0
14,0.2,4.7,7.7,1.8,63.0,45.0,63.9,6.5,65.7,1.4,...,6993,2018,1,0,1,0,0,0,0,0
21,3.3,0.2,4.0,-5.3,38.0,21.0,56.5,0.7,7.0,1.5,...,4929,2018,1,0,1,0,0,0,0,0
28,0.0,-8.5,-4.7,-11.6,39.0,16.0,35.0,8.9,87.3,2.7,...,4425,2018,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062,0.0,-0.6,2.6,-3.6,26.0,16.0,59.6,1.5,15.3,2.2,...,32211,2020,11,0,0,0,1,0,0,0
1069,0.0,4.5,9.1,1.2,45.0,33.0,66.9,6.0,61.9,1.8,...,40616,2020,12,0,0,0,1,0,0,0
1076,2.9,-1.1,2.0,-6.0,36.0,25.0,74.9,1.1,11.5,2.9,...,14362,2020,12,0,0,0,1,0,0,0
1083,0.0,-3.7,1.1,-8.2,27.0,16.0,50.4,9.1,94.8,2.0,...,23442,2020,12,0,0,0,1,0,0,0
