In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import calendar
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

In [2]:
# Read both CSV files; training rows recorded under weather category 4 are discarded.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
train = train[train['weather'] != 4]

# The two targets are taken from the filtered training frame.
y1, y2 = train["casual"], train["registered"]

# Stack train and test so every feature transformation is applied to both at once.
all_data = pd.concat([train, test], ignore_index=True)

# Derive calendar features from the parsed timestamp without storing the parsed column.
timestamps = pd.to_datetime(all_data['datetime'])
all_data['hour'] = timestamps.dt.hour
all_data['dayofweek'] = timestamps.dt.dayofweek
all_data['year'] = timestamps.dt.year

# Remove the raw timestamp and the two target columns from the feature frame.
dropFeatures = ['datetime', 'casual', 'registered']
all_data = all_data.drop(columns=dropFeatures)

In [3]:
def predict_atemp(data):
    """Overwrite implausible 'atemp' values with values regressed from 'temp'.

    A row is flagged as wrong when ``atemp < 20`` while ``temp > 20``. A linear
    regression of 'atemp' on 'temp' is fitted on every non-flagged row and its
    predictions replace 'atemp' on the flagged rows.

    Args:
        data: DataFrame with 'temp' and 'atemp' columns. It is modified in place.

    Returns:
        The same DataFrame object. If no row is flagged it is returned untouched.
    """
    wrong_rows = (data.atemp < 20) & (data.temp > 20)  # condition of wrong values

    if not wrong_rows.any():
        # Nothing to repair. Returning early also avoids asking the regressor
        # to predict on zero rows, e.g. when this runs on already-repaired data.
        return data

    trusted_rows = data[~wrong_rows]

    lin_reg = LinearRegression()
    lin_reg.fit(trusted_rows[["temp"]], trusted_rows[["atemp"]])

    # The target was fitted as a one-column frame, so predictions come back
    # as a single-column 2-D array matching the one-column selection below.
    repaired_atemp = lin_reg.predict(data.loc[wrong_rows, ['temp']])
    data.loc[wrong_rows, ['atemp']] = repaired_atemp

    return data
# Repair the flagged 'atemp' values; predict_atemp edits the frame in place and returns it.
all_data = predict_atemp(all_data)

In [4]:
# Replace 'temp' and 'atemp' with their first principal component, stored as 'pca'.
temperature_cols = ['temp', 'atemp']
pca = PCA(n_components=1).fit(all_data[temperature_cols])
all_data['pca'] = pca.transform(all_data[temperature_cols])
all_data = all_data.drop(columns=temperature_cols)

In [5]:
def predict_windspeed(data):
    """Replace zero 'windspeed' readings with random-forest predictions.

    A RandomForestRegressor (random_state=42) is fitted on the rows whose
    'windspeed' is non-zero, using 'weather', 'humidity' and 'pca' as features,
    and its predictions overwrite 'windspeed' on the rows where it equals 0.

    Args:
        data: DataFrame with 'windspeed', 'weather', 'humidity' and 'pca'
            columns. It is modified in place.

    Returns:
        The same DataFrame object. If no row has a zero windspeed it is
        returned untouched.
    """
    zero_wind = data['windspeed'] == 0

    if not zero_wind.any():
        # Nothing to impute. Returning early also avoids asking the forest to
        # predict on zero rows, e.g. when this runs on already-imputed data.
        return data

    measured_rows = data.loc[~zero_wind]

    windColumns = ["weather", "humidity", "pca"]

    rf_reg = RandomForestRegressor(random_state=42)
    rf_reg.fit(measured_rows[windColumns], measured_rows["windspeed"])

    # Predictions are one value per zero-wind row; write them into that column.
    imputed_wind = rf_reg.predict(data.loc[zero_wind, windColumns])
    data.loc[zero_wind, 'windspeed'] = imputed_wind

    return data
# Impute the zero 'windspeed' rows; predict_windspeed edits the frame in place and returns it.
all_data = predict_windspeed(all_data)

In [6]:
# Disabled code: the humidity-imputation routine below is wrapped in a
# triple-quoted string, so none of it runs; the cell only evaluates to that string.
'''def predict_humidity(data):
    condition = data['humidity'] == 0
    dataHumNot0_train = data[~condition & data['count'].notnull()]

    humidityColumns = ["weather", "windspeed", "pca"]

    rf_reg = RandomForestRegressor(random_state = 42)
    rf_reg.fit(dataHumNot0_train[humidityColumns], dataHumNot0_train["humidity"])

    predictHum0 = rf_reg.predict(data.loc[condition, humidityColumns])
    
    data.loc[condition, ['humidity']] = predictHum0

    return data
all_data = predict_humidity(all_data)'''

'def predict_humidity(data):\n    condition = data[\'humidity\'] == 0\n    dataHumNot0_train = data[~condition & data[\'count\'].notnull()]\n\n    humidityColumns = ["weather", "windspeed", "pca"]\n\n    rf_reg = RandomForestRegressor(random_state = 42)\n    rf_reg.fit(dataHumNot0_train[humidityColumns], dataHumNot0_train["humidity"])\n\n    predictHum0 = rf_reg.predict(data.loc[condition, humidityColumns])\n    \n    data.loc[condition, [\'humidity\']] = predictHum0\n\n    return data\nall_data = predict_humidity(all_data)'

In [7]:
# One-hot encode the categorical columns of the combined frame.
oheFeatures = ['holiday', 'weather', 'season', 'workingday']
all_data = pd.get_dummies(all_data, columns=oheFeatures)

# Rows with a non-null 'count' form the training matrix; the remaining rows
# form the test matrix. 'count' itself is not a feature in either.
has_count = all_data['count'].notna()
X_train = all_data[has_count].drop(columns=['count'])
X_test = all_data[~has_count].drop(columns=['count'])

# Both targets are modelled as log(1 + y).
y1, y2 = np.log(y1 + 1), np.log(y2 + 1)

In [8]:
# RMSLE: root mean squared logarithmic error
def rmsle_score(preds, true):
    """Return the root mean squared logarithmic error between two sequences.

    Both inputs are passed through log1p, the squared differences are summed,
    divided by ``len(true)``, and the square root of that quotient is returned.
    """
    log_gap = np.log1p(preds) - np.log1p(true)
    mean_squared_gap = np.sum(log_gap ** 2) / len(true)
    return mean_squared_gap ** 0.5

In [9]:
from sklearn.metrics import make_scorer

# RMSLE is an error measure: smaller is better. make_scorer assumes that a
# larger value is better unless told otherwise, and a grid search keeps the
# candidate with the highest score, so the default would select the
# hyper-parameters with the largest error. greater_is_better=False makes the
# scorer return the negated RMSLE, so the highest score is the lowest error.
# (make_scorer passes (y_true, y_pred) while rmsle_score is declared as
# (preds, true); the formula is symmetric in its two arguments, so this is harmless.)
RMSLE = make_scorer(rmsle_score, greater_is_better=False)

In [10]:
# Variables prepared above and used by the models below: X_train, X_test, y1, y2

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
# Hyper-parameter grid shared by the random-forest searches.
param_Rf = {
    'min_samples_split': [3, 4, 6, 10],
    'n_estimators': [100],
    'random_state': [42],
}

# 10-fold grid search for the first target (y1), wrapped in a one-step pipeline.
rf_reg1 = make_pipeline(
    GridSearchCV(
        RandomForestRegressor(),
        param_Rf,
        cv=10,
        scoring=RMSLE,
        n_jobs=-1,
    )
).fit(X_train, y1)

pred_rf1 = rf_reg1.predict(X_test)

In [12]:
# 10-fold grid search for the second target (y2), using the same grid and scorer.
rf_reg2 = make_pipeline(
    GridSearchCV(
        RandomForestRegressor(),
        param_Rf,
        cv=10,
        scoring=RMSLE,
        n_jobs=-1,
    )
).fit(X_train, y2)

pred_rf2 = rf_reg2.predict(X_test)

In [13]:
# Each prediction is exponentiated and the two are summed; subtracting 2
# removes the +1 that was added to each target before taking its log.
sub = pd.DataFrame({
    'datetime': test['datetime'],
    'count': np.exp(pred_rf1) + np.exp(pred_rf2) - 2,
})

sub.to_csv('submission1.csv', index=False)