In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import calendar
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
train = train[train['weather'] != 4]
y1 = train["casual"]
y2 = train["registered"]
all_data = pd.concat([train, test], ignore_index=True)
all_data['tempDate'] = pd.to_datetime(all_data['datetime'])
all_data['hour'] = all_data['tempDate'].dt.hour
all_data['dayofweek'] = all_data['tempDate'].dt.dayofweek
all_data['year'] = all_data['tempDate'].dt.year
all_data = all_data.drop('tempDate', axis=1)
dropFeatures = ['datetime', 'casual', 'registered']
all_data = all_data.drop(dropFeatures, axis=1)

In [3]:
def predict_atemp(data):
    condition = (data.atemp < 20) & (data.temp > 20) # condition of wrong values
    data_correct_train = data[~condition]

    lin_reg = LinearRegression()
    lin_reg.fit(data_correct_train[["temp"]], data_correct_train[["atemp"]])
    
    predict_atemp = lin_reg.predict(data.loc[condition, ['temp']])
    
    data.loc[condition, ['atemp']] = predict_atemp

    return data
all_data = predict_atemp(all_data)

In [4]:
pca=PCA(n_components=1)
pca.fit(all_data[['temp', 'atemp']])
all_data['pca'] = pca.transform(all_data[['temp','atemp']])
all_data = all_data.drop(columns=['temp', 'atemp'], axis=1)

In [5]:
def predict_windspeed(data):
    condition = data['windspeed'] == 0
    dataWindNot0_train = data.loc[~condition]

    windColumns = ["weather", "humidity", "pca"]

    rf_reg = RandomForestRegressor(random_state=42)
    rf_reg.fit(dataWindNot0_train[windColumns], dataWindNot0_train["windspeed"])
    
    predictWind0 = rf_reg.predict(data.loc[condition, windColumns])
    
    data.loc[condition, ['windspeed']] = predictWind0
    
    return data
all_data = predict_windspeed(all_data)

In [6]:
def predict_humidity(data):
    condition = data['humidity'] == 0
    dataHumNot0_train = data[~condition]

    humidityColumns = ['weather', 'windspeed', 'pca']

    rf_reg = RandomForestRegressor(random_state = 42)
    rf_reg.fit(dataHumNot0_train[humidityColumns], dataHumNot0_train['humidity'])

    predictHum0 = rf_reg.predict(data.loc[condition, humidityColumns])
    
    data.loc[condition, 'humidity'] = predictHum0

    return data
all_data = predict_humidity(all_data)

 76.016      75.15162356 78.87451984 83.12587662 79.71980952 79.71980952
 92.57260714 93.4022476  86.3522286  56.69366667 88.27       88.3
 84.61271032 81.34416331 68.72489669 93.24669444]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  data.loc[condition, 'humidity'] = predictHum0


In [7]:
all_data = pd.get_dummies(all_data, columns = ['season', 'weather'])
X_train = all_data[pd.notnull(all_data['count'])]
X_test = all_data[~pd.notnull(all_data['count'])]
X_train = X_train.drop(['count'], axis=1)
X_test = X_test.drop(['count'], axis=1)
y1 = np.log(y1+1)
y2 = np.log(y2+1)

In [8]:
# rmsle
def rmsle_score(preds, true):
    rmsle_score = (np.sum((np.log1p(preds)-np.log1p(true))**2)/len(true))**0.5
    return rmsle_score

In [9]:
from sklearn.metrics import make_scorer
RMSLE = make_scorer(rmsle_score)

In [10]:
# X_train, X_test, y1, y2

In [11]:
from sklearn.model_selection import GridSearchCV
param_Rf =  {'min_samples_split' : [3,4,6,10], 'n_estimators' : [100], 'random_state': [42] }

rf_reg1 = GridSearchCV(RandomForestRegressor(), param_Rf, cv=10, scoring = RMSLE, n_jobs = -1)
rf_reg1.fit(X_train, y1)

pred_rf1 = rf_reg1.predict(X_test)

In [12]:
rf_reg2 = GridSearchCV(RandomForestRegressor(), param_Rf, cv=10, scoring = RMSLE, n_jobs = -1)
rf_reg2.fit(X_train, y2)

pred_rf2 = rf_reg2.predict(X_test)

In [13]:
sub = pd.DataFrame()
sub['datetime'] = test['datetime']
sub['count'] = np.exp(pred_rf1) + np.exp(pred_rf2) - 2

sub.to_csv('submission1.csv', index=False) # 0.384

In [14]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso

In [15]:
lin_reg1 = LinearRegression()
lin_reg1.fit(X_train, y1)
preds_lin1 = lin_reg1.predict(X_test)

In [16]:
lin_reg2 = LinearRegression()
lin_reg2.fit(X_train, y2)
preds_lin2 = lin_reg2.predict(X_test)

In [17]:
sub = pd.DataFrame()
sub['datetime'] = test['datetime']
sub['count'] = np.exp(preds_lin1) + np.exp(preds_lin2) - 2
sub.to_csv('submission2.csv', index=False) # 1.0003

In [18]:
alpha = np.array([0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000])

ridge_params_ = {'max_iter':[3000], 'alpha':alpha}

ridge_reg1 = GridSearchCV(Ridge(), ridge_params_, scoring = RMSLE, cv=10)
ridge_reg1.fit(X_train, y1)

preds_ridge1 = ridge_reg1.predict(X = X_test)

In [19]:
ridge_reg2 = GridSearchCV(Ridge(), ridge_params_, scoring = RMSLE, cv=10)
ridge_reg2.fit(X_train, y2)

preds_ridge2 = ridge_reg2.predict(X = X_test)

In [20]:
sub = pd.DataFrame()
sub['datetime'] = test['datetime']
sub['count'] = np.exp(preds_ridge1) + np.exp(preds_ridge2) - 2
sub.to_csv('submission3.csv', index=False) # 1.00203

In [21]:
lasso_params_ = {'max_iter':[3000], 'alpha':1/alpha}

lasso_reg1 = GridSearchCV(Lasso(), lasso_params_, scoring = RMSLE, cv=10)
lasso_reg1.fit(X_train, y1)

preds_lasso1 = lasso_reg1.predict(X = X_test)

In [22]:
lasso_reg2 = GridSearchCV(Lasso(), lasso_params_, scoring = RMSLE, cv=10)
lasso_reg2.fit(X_train, y2)

preds_lasso2 = lasso_reg2.predict(X = X_test)

In [23]:
sub = pd.DataFrame()
sub['datetime'] = test['datetime']
sub['count'] = np.exp(preds_lasso1) + np.exp(preds_lasso2) - 2
sub.to_csv('submission4.csv', index=False) # 1.41242