In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import calendar
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [3]:
train = train[train['weather'] != 4]
test.weather

0       1
1       1
2       1
3       1
4       1
       ..
6488    2
6489    2
6490    1
6491    1
6492    1
Name: weather, Length: 6493, dtype: int64

In [4]:
all_data = pd.concat([train, test], ignore_index=True)

In [5]:
all_data['tempDate'] = all_data.datetime.apply(lambda x:x.split())

all_data['date'] = all_data['tempDate'].apply(lambda x: x[0])     
all_data['year'] = all_data['tempDate'].apply(lambda x: x[0].split('-')[0])
all_data['month'] = all_data['tempDate'].apply(lambda x: x[0].split('-')[1])
all_data['day'] = all_data['tempDate'].apply(lambda x: x[0].split('-')[2])
all_data['hour'] = all_data['tempDate'].apply(lambda x: x[1].split(':')[0])
all_data['weekday'] = all_data.tempDate.apply(lambda x:calendar.day_name[datetime.strptime(x[0], "%Y-%m-%d").weekday()])

all_data = all_data.drop('tempDate', axis=1)

In [6]:
all_data = all_data.drop(['datetime', 'season', 'workingday', 'date', 'day'], axis=1)

In [7]:
weekday_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
all_data['weekday'] = all_data['weekday'].map(weekday_mapping)

In [8]:
new_train = all_data[pd.notnull(all_data['count'])]

In [9]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
condition = (new_train.atemp < 20) & (new_train.temp > 20) #wrong condition
lin_reg.fit(new_train.loc[~condition, ["temp"]], new_train.loc[~condition, ["atemp"]])

In [10]:
condition = (all_data.atemp < 20) & (all_data.temp > 20)
all_data.loc[condition, "atemp"] = all_data.loc[condition, "temp"] * lin_reg.coef_[0][0] + lin_reg.intercept_[0]

sub_train = all_data[pd.notnull(all_data['count'])]

In [11]:
from sklearn.ensemble import RandomForestRegressor

rnd_reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, max_leaf_nodes=100,random_state=42)

condition = (sub_train.windspeed != 0)
rnd_reg.fit(sub_train.loc[condition,['temp','atemp','humidity']],sub_train.loc[condition,'windspeed'])
condition = (all_data.windspeed == 0)
rnd_reg.predict(all_data.loc[condition,['temp','atemp','humidity']])

array([ 7.91058722,  7.94815748,  7.94815748, ...,  9.62801817,
       16.37592775, 10.36697335])

In [12]:
all_data.loc[condition,'windspeed'] = rnd_reg.predict(all_data.loc[condition,['temp','atemp','humidity']])

sub_train = all_data[pd.notnull(all_data['count'])]

In [13]:
rnd_reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, max_leaf_nodes=100,random_state=42)
condition = (sub_train.humidity != 0)
rnd_reg.fit(sub_train.loc[condition,['temp','atemp','windspeed']],sub_train.loc[condition,'humidity'])
condition = (all_data.humidity == 0)
rnd_reg.predict(all_data.loc[condition,['temp','atemp','windspeed']])

array([57.17164074, 57.17164074, 57.17164074, 61.75819021, 54.27117952,
       60.97156544, 57.03199635, 61.9654168 , 63.86193681, 61.54144809,
       64.17201816, 64.17201816, 54.99828091, 67.69687892, 62.52437589,
       52.38414703, 53.56453469, 43.60941302, 54.27117952, 61.59370835,
       63.35453667, 68.10729513])

In [14]:
all_data.loc[condition,'humidity'] = rnd_reg.predict(all_data.loc[condition,['temp','atemp','windspeed']])

 57.03199635 61.9654168  63.86193681 61.54144809 64.17201816 64.17201816
 54.99828091 67.69687892 62.52437589 52.38414703 53.56453469 43.60941302
 54.27117952 61.59370835 63.35453667 68.10729513]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  all_data.loc[condition,'humidity'] = rnd_reg.predict(all_data.loc[condition,['temp','atemp','windspeed']])


In [15]:
dropFeatures = ['casual', 'count', 'registered']
new_train = all_data.drop(dropFeatures, axis=1)
new_train = all_data[pd.notnull(all_data['count'])]
new_test = all_data[~pd.notnull(all_data['count'])]
target = train['count']
target1 = train['casual']
target2 = train['registered']
target = np.log(target+1)
target1 = np.log(target1+1)
target2 = np.log(target2+1)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

num_attributes = ['temp', 'atemp', 'humidity', 'windspeed']
cat_attributes = ['year', 'month','hour','weekday','holiday','weather']
#num_attributes = ['temp', 'atemp', 'humidity', 'windspeed','weather','year', 'month','hour','weekday','holiday']
#cat_attributes = []
full_pipeline = ColumnTransformer([('num', StandardScaler(), num_attributes), ('cat', OneHotEncoder(), cat_attributes)],)
#new_train_final = full_pipeline.fit_transform(new_train)
#new_test_final = full_pipeline.fit_transform(new_test)
all_data_final = full_pipeline.fit_transform(all_data)
new_train_final = all_data_final[pd.notnull(all_data['count'])]
new_test_final = all_data_final[~pd.notnull(all_data['count'])]

In [17]:
'''from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(new_train_final, target, test_size=0.1, random_state=42)'''

'from sklearn.model_selection import train_test_split\n\nX_train, X_val, y_train, y_val = train_test_split(new_train_final, target, test_size=0.1, random_state=42)'

In [18]:
def rmsle_score(preds, true):
    rmsle_score = (np.sum((np.log1p(preds)-np.log1p(true))**2)/len(true))**0.5
    return rmsle_score

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
RMSLE = make_scorer(rmsle_score)

'''param_Rf =  {'min_samples_split' : [3,4,6,10], 'n_estimators' : [100], 'random_state': [42] }
RF = GridSearchCV(RandomForestRegressor(), param_Rf, cv=10, scoring = RMSLE, n_jobs=-1)
RF.fit(new_train_final, target)'''

"param_Rf =  {'min_samples_split' : [3,4,6,10], 'n_estimators' : [100], 'random_state': [42] }\nRF = GridSearchCV(RandomForestRegressor(), param_Rf, cv=10, scoring = RMSLE, n_jobs=-1)\nRF.fit(new_train_final, target)"

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
RMSLE = make_scorer(rmsle_score)

param_Rf =  {'min_samples_split' : [3,4,6,10], 'n_estimators' : [100], 'random_state': [42] }
RF = GridSearchCV(RandomForestRegressor(), param_Rf, cv=10, scoring = RMSLE, n_jobs=-1)
RF.fit(new_train_final, target1)

In [23]:
print("최적의 하이퍼파라미터:", RF.best_params_)

최적의 하이퍼파라미터: {'min_samples_split': 3, 'n_estimators': 100, 'random_state': 42}


In [24]:
RF_yhat = RF.predict(new_train_final)
s_RF = rmsle_score(RF_yhat, target1)
s_RF

0.10383839061464407

In [25]:
pred_rf1 = RF.predict(new_test_final)

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
RMSLE = make_scorer(rmsle_score)

param_Rf =  {'min_samples_split' : [3,4,6,10], 'n_estimators' : [100], 'random_state': [42] }
RF = GridSearchCV(RandomForestRegressor(), param_Rf, cv=10, scoring = RMSLE, n_jobs=-1)
RF.fit(new_train_final, target2)

In [27]:
print("최적의 하이퍼파라미터:", RF.best_params_)

최적의 하이퍼파라미터: {'min_samples_split': 10, 'n_estimators': 100, 'random_state': 42}


In [28]:
RF_yhat = RF.predict(new_train_final)
s_RF = rmsle_score(RF_yhat, target2)
s_RF

0.06409897429416722

In [29]:
pred_rf2 = RF.predict(new_test_final)

In [32]:
sub = pd.DataFrame()
sub['datetime'] = test['datetime']
sub['count'] = np.exp(pred_rf1) + np.exp(pred_rf2)- 2

sub.to_csv('submission1.csv', index=False)