# The Linear Regression Model



In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn import metrics

In [2]:
df = pd.read_csv('train.csv', parse_dates=['datetime'])
df

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


In [3]:
X = df[['season','holiday','workingday','weather','temp','atemp','humidity','windspeed']]
X

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1,0,0,1,9.84,14.395,81,0.0000
1,1,0,0,1,9.02,13.635,80,0.0000
2,1,0,0,1,9.02,13.635,80,0.0000
3,1,0,0,1,9.84,14.395,75,0.0000
4,1,0,0,1,9.84,14.395,75,0.0000
...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,19.695,50,26.0027
10882,4,0,1,1,14.76,17.425,57,15.0013
10883,4,0,1,1,13.94,15.910,61,15.0013
10884,4,0,1,1,13.94,17.425,61,6.0032


In [4]:
y=df['count']
y

0         16
1         40
2         32
3         13
4          1
        ... 
10881    336
10882    241
10883    168
10884    129
10885     88
Name: count, Length: 10886, dtype: int64

In [5]:
X['weekday'] = df['datetime'].dt.weekday
X['month'] = df['datetime'].dt.month
X['hour'] = df['datetime'].dt.hour
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['weekday'] = df['datetime'].dt.weekday
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['month'] = df['datetime'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['hour'] = df['datetime'].dt.hour


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,weekday,month,hour
0,1,0,0,1,9.84,14.395,81,0.0000,5,1,0
1,1,0,0,1,9.02,13.635,80,0.0000,5,1,1
2,1,0,0,1,9.02,13.635,80,0.0000,5,1,2
3,1,0,0,1,9.84,14.395,75,0.0000,5,1,3
4,1,0,0,1,9.84,14.395,75,0.0000,5,1,4
...,...,...,...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,19.695,50,26.0027,2,12,19
10882,4,0,1,1,14.76,17.425,57,15.0013,2,12,20
10883,4,0,1,1,13.94,15.910,61,15.0013,2,12,21
10884,4,0,1,1,13.94,17.425,61,6.0032,2,12,22


In [6]:
# define a column transformer that just passes through the temp column

#feat_eng = ColumnTransformer([
#    ('do-nothing', 'passthrough', ['humidity'])
#])
# FEATURE ENGINEERING
#feat_eng = ColumnTransformer([
#    ('one-hot', OneHotEncoder(), ['season','holiday','workingday','weather']),
#    ('polynomials', PolynomialFeatures(degree=3), ['temp', 'humidity']),
#    ('do-nothing', 'passthrough', ['atemp', 'windspeed'])
#])

feat_eng = ColumnTransformer([
    ('one-hot', OneHotEncoder(), ['season','weather']),
    ('polynomials', PolynomialFeatures(degree=3), ['temp', 'humidity']),
    ('do-nothing', 'passthrough', ['holiday','workingday','atemp', 'windspeed','hour','weekday','month'])
])

In [7]:
## Models You can try out!
# Ridge Regression
#from sklearn.linear_model import Ridge
# Poisson Regression
#from sklearn.linear_model import PoissonRegressor
# Decision Tree
#from sklearn.tree import DecisionTreeRegressor
# Random Forrest
from sklearn.ensemble import RandomForestRegressor

In [8]:
# fit the feature engineering pipeline
feat_eng.fit(X)
#fit(X)
# transform the data

X_trans = feat_eng.transform(X)
#X_trans = X
# define the model without any regularization (alpha=0)
#model = Ridge(alpha=2)
model = RandomForestRegressor(n_estimators = 100, random_state = 0)
model.fit(X_trans,y)

# calculate predictions

y_pred = model.predict(X_trans)


### Now to Test data

In [9]:
df_test = pd.read_csv('test.csv', parse_dates=['datetime'])
df_test.count()

datetime      6493
season        6493
holiday       6493
workingday    6493
weather       6493
temp          6493
atemp         6493
humidity      6493
windspeed     6493
dtype: int64

In [10]:
X_test = df_test[['season','holiday','workingday','weather','temp','atemp','humidity','windspeed']]
X_test['weekday'] = df_test['datetime'].dt.weekday
X_test['month'] = df_test['datetime'].dt.month
X_test['hour'] = df_test['datetime'].dt.hour
X_test

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,weekday,month,hour
0,1,0,1,1,10.66,11.365,56,26.0027,3,1,0
1,1,0,1,1,10.66,13.635,56,0.0000,3,1,1
2,1,0,1,1,10.66,13.635,56,0.0000,3,1,2
3,1,0,1,1,10.66,12.880,56,11.0014,3,1,3
4,1,0,1,1,10.66,12.880,56,11.0014,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...
6488,1,0,1,2,10.66,12.880,60,11.0014,0,12,19
6489,1,0,1,2,10.66,12.880,60,11.0014,0,12,20
6490,1,0,1,1,10.66,12.880,60,11.0014,0,12,21
6491,1,0,1,1,10.66,13.635,56,8.9981,0,12,22


In [11]:

Xtest_trans = feat_eng.transform(X_test)
#Xtest_trans = X_test 

In [12]:
ytest_predict = model.predict(Xtest_trans)
print(ytest_predict)

[ 15.     5.4    4.03 ... 129.13 102.85  62.43]


In [13]:
yKaggle_predict = ytest_predict


In [14]:
submission = pd.DataFrame({'datetime':df_test.datetime,'count':yKaggle_predict})

In [15]:
submission.to_csv('Forest.csv',index=False)