In [152]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from category_encoders.cat_boost import CatBoostEncoder
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from feature_engine.creation import CyclicalTransformer
from sklearn.model_selection import GridSearchCV
import missingno as msno
from sktime.utils.plotting import plot_series
from IPython.display import display_html
from itertools import chain,cycle
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from impyute.imputation.cs import mice
import statsmodels.tsa.api as tsa
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline
import xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, Normalizer

In [260]:
train = pd.read_csv('train.csv', index_col = 'date')
test = pd.read_csv('test.csv', index_col = 'date')

In [261]:
def set_first(df):
    df.reset_index(inplace = True)
    df['date'] = df['date'].astype({'date': 'datetime64[ns]'})
    return df
                                  
def create_date_features(df):
    df['month'] = df.date.dt.month.astype("int8")
    df['day_of_month'] = df.date.dt.day.astype("int8")
    df['day_of_year'] = df.date.dt.dayofyear.astype("int16")
    df['week_of_month'] = (df.date.apply(lambda d: (d.day-1) // 7 + 1)).astype("int8")
    df['week_of_year'] = (df.date.dt.weekofyear).astype("int8")
    df['day_of_week'] = (df.date.dt.dayofweek + 1).astype("int8")
    df['year'] = df.date.dt.year.astype("int32")
    df["is_wknd"] = (df.date.dt.weekday // 4).astype("int8")
    df["quarter"] = df.date.dt.quarter.astype("int8")
    df['is_month_start'] = df.date.dt.is_month_start.astype("int8")
    df['is_month_end'] = df.date.dt.is_month_end.astype("int8")
    df['is_quarter_start'] = df.date.dt.is_quarter_start.astype("int8")
    df['is_quarter_end'] = df.date.dt.is_quarter_end.astype("int8")
    df['is_year_start'] = df.date.dt.is_year_start.astype("int8")
    df['is_year_end'] = df.date.dt.is_year_end.astype("int8")
    # 0: Winter - 1: Spring - 2: Summer - 3: Fall
    df["season"] = np.where(df.month.isin([12,1,2]), 0, 1)
    df["season"] = np.where(df.month.isin([6,7,8]), 2, df["season"])
    df["season"] = pd.Series(np.where(df.month.isin([9, 10, 11]), 3, df["season"])).astype("int8")
    return df

def discomport_idx(df): 
    df['humidity'] = df['humidity'] / 100 
    df['discomfort'] = ((9/5) * df['temp_mean']) - 0.55 * (1 - df['humidity']) * (((9/5) * df['temp_mean']) - 26) + 32 
    df['discomfort'] = round(df['discomfort'],1)
    return df

def drop_col(df):
    df.drop(columns =['sunshine_rate','wind_max','temp_mean','temp_lowest','PM10','day_of_year','month','is_wknd','quarter'], inplace = True)
    return df

def mkcol(df):
    df = create_date_features(df)
    df = discomport_idx(df)
    drop_col(df)
    return df

In [262]:
train = set_first(train)
train = mkcol(train)
train

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,date,precipitation,temp_highest,PM2.5,humidity,sunshine_sum,wind_mean,rental,day_of_month,week_of_month,...,day_of_week,year,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,discomfort
0,2018-01-01,,3.8,17.0,0.391,8.3,1.4,4950,1,1,...,1,2018,1,0,1,0,1,0,0,39.2
1,2018-01-02,,1.8,22.0,0.420,7.9,1.8,7136,2,1,...,2,2018,0,0,0,0,0,0,0,38.1
2,2018-01-03,,-0.4,19.0,0.423,8.6,2.2,7156,3,1,...,3,2018,0,0,0,0,0,0,0,34.5
3,2018-01-04,,-0.7,24.0,0.430,6.2,1.4,7102,4,1,...,4,2018,0,0,0,0,0,0,0,34.3
4,2018-01-05,,1.6,35.0,0.484,8.2,1.7,7705,5,1,...,5,2018,0,0,0,0,0,0,0,35.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2020-12-27,0.0,10.0,42.0,0.629,5.9,1.8,37103,27,4,...,7,2020,0,0,0,0,0,0,0,45.6
1091,2020-12-28,1.3,11.4,44.0,0.721,8.0,1.4,46912,28,4,...,1,2020,0,0,0,0,0,0,0,46.2
1092,2020-12-29,0.2,4.3,46.0,0.708,0.0,2.9,35747,29,5,...,2,2020,0,0,0,0,0,0,0,36.3
1093,2020-12-30,,-6.2,15.0,0.555,8.3,4.1,22488,30,5,...,3,2020,0,0,0,0,0,0,0,23.5


In [263]:
test = set_first(test)
test = mkcol(test)
test

  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,date,precipitation,temp_highest,PM2.5,humidity,sunshine_sum,wind_mean,day_of_month,week_of_month,week_of_year,day_of_week,year,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,season,discomfort
0,2021-01-01,,1.6,17.0,0.640,6.5,2.0,1,1,53,5,2021,1,0,1,0,1,0,0,31.1
1,2021-01-02,,-1.4,12.0,0.385,9.0,2.6,2,1,53,6,2021,0,0,0,0,0,0,0,34.8
2,2021-01-03,,-2.0,14.0,0.450,5.5,2.0,3,1,53,7,2021,0,0,0,0,0,0,0,32.8
3,2021-01-04,0.0,0.3,23.0,0.514,4.6,1.7,4,1,1,1,2021,0,0,0,0,0,0,0,34.3
4,2021-01-05,0.0,-2.1,17.0,0.528,8.6,2.9,5,1,1,2,2021,0,0,0,0,0,0,0,31.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2021-12-27,0.0,-3.9,20.0,0.609,3.8,1.7,27,4,52,1,2021,0,0,0,0,0,0,0,26.9
361,2021-12-28,,-0.9,38.0,0.738,1.7,2.2,28,4,52,2,2021,0,0,0,0,0,0,0,29.4
362,2021-12-29,0.2,5.9,49.0,0.729,1.8,2.6,29,5,52,3,2021,0,0,0,0,0,0,0,36.5
363,2021-12-30,0.0,0.2,17.0,0.485,7.3,3.3,30,5,52,4,2021,0,0,0,0,0,0,0,34.3


In [264]:
target = 'rental'
features = train.columns.drop(target)

In [265]:
X_train = train[features]

In [266]:
X_train.drop(columns ='date',inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [267]:
X_train['precipitation'] =X_train['precipitation'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [268]:
X_train['sunshine_sum'] = X_train['sunshine_sum'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [269]:
X_train['PM2.5'] = X_train['PM2.5'].fillna(X_train['PM2.5'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [270]:
y_train = train[target]

# Modeling

In [271]:
boosting = XGBRegressor()

dists = {'n_estimators' : [500,800,1000], 'eta' : [0.1,0.15,0.25,0.35], 'min_child_weight' : np.arange(1, 8, 3), 'max_depth' : np.arange(3,9,3) , 'colsample_bytree' :np.arange(0.5, 1.0, 0.2), 'subsample' :np.arange(0.5, 1.0, 0.2) }

clf =RandomizedSearchCV( estimator = boosting, param_distributions = dists, refit=True, cv=5, n_iter=50,scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1 )

clf.fit(X_train, y_train);

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [272]:
print('최적 파라미터: ',clf.best_params_)

최적 파라미터:  {'subsample': 0.5, 'n_estimators': 1000, 'min_child_weight': 7, 'max_depth': 3, 'eta': 0.15, 'colsample_bytree': 0.5}


In [273]:
clf.best_score_

-7475.1020820373815

In [274]:
model_xg = clf.best_estimator_

In [275]:
len(X_train.columns)

19

In [276]:
test.drop(columns='date',inplace = True)

In [288]:
y_pred = model_xg.predict(test)

In [289]:
y_pred

array([ 33558.9   ,  25161.764 ,  23127.807 ,  22959.246 ,  24017.996 ,
        16173.679 ,  12312.804 ,  21669.479 ,  18601.295 ,  17361.49  ,
        23643.857 ,   8411.149 ,  27121.617 ,  31596.68  ,  23116.463 ,
        18749.441 ,  17213.154 ,  18081.035 ,  20506.51  ,  32282.414 ,
        17387.45  ,  21774.113 ,  24755.383 ,  35985.023 ,  38299.64  ,
         8928.093 ,  32027.2   ,   8138.015 ,  22779.344 ,  22435.295 ,
        26075.193 ,  19865.645 ,  18814.898 ,  23544.967 ,  13854.764 ,
        21028.145 ,  21586.566 ,  23455.498 ,  23217.303 ,  25953.947 ,
        25483.16  ,  34867.426 ,  38286.234 ,  39387.16  ,  38276.094 ,
        13324.145 ,  13542.779 ,  15590.576 ,  17987.059 ,  31299.52  ,
        44900.938 ,  48289.957 ,  37236.44  ,  28691.496 ,  37082.527 ,
        30524.088 ,  53421.953 ,  55267.566 ,  30377.152 ,  16574.436 ,
        24353.904 ,  39605.28  ,  44234.06  ,  48933.902 ,  35918.7   ,
        36082.06  ,  48093.934 ,  47654.6   ,  49481.363 ,  5207

In [282]:
y_pred = np.round(y_pred)

In [283]:
y_pred = y_pred.astype(int)

In [284]:
sub = pd.read_csv('subm.csv', index_col = 'date')

In [285]:
sub

Unnamed: 0_level_0,rental
date,Unnamed: 1_level_1
2021-01-01,34924
2021-01-02,32884
2021-01-03,26133
2021-01-04,24270
2021-01-05,26416
...,...
2021-12-27,34172
2021-12-28,24031
2021-12-29,29257
2021-12-30,20882


In [286]:
sub.rental = y_pred

In [287]:
sub

Unnamed: 0_level_0,rental
date,Unnamed: 1_level_1
2021-01-01,33559
2021-01-02,25162
2021-01-03,23128
2021-01-04,22959
2021-01-05,24018
...,...
2021-12-27,29677
2021-12-28,25746
2021-12-29,32883
2021-12-30,27289


In [291]:
sub.to_csv('sub0630_last.csv')