In [75]:
import pandas as pd
import numpy as np
import holidays
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error

In [2]:
data = pd.read_csv('BikeSharing_Bluebikes2022.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
data.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code
0,597,2022-01-01 00:00:25.1660,2022-01-01 00:10:22.1920,178,MIT Pacific St at Purrington St,42.359573,-71.101295,74,Harvard Square at Mass Ave/ Dunster,42.373268,-71.118579,4923,Subscriber,2139
1,411,2022-01-01 00:00:40.4300,2022-01-01 00:07:32.1980,189,Kendall T,42.362428,-71.084955,178,MIT Pacific St at Purrington St,42.359573,-71.101295,3112,Subscriber,2139
2,476,2022-01-01 00:00:54.8180,2022-01-01 00:08:51.6680,94,Main St at Austin St,42.375603,-71.064608,356,Charlestown Navy Yard,42.374125,-71.054812,6901,Customer,2124
3,466,2022-01-01 00:01:01.6080,2022-01-01 00:08:48.2350,94,Main St at Austin St,42.375603,-71.064608,356,Charlestown Navy Yard,42.374125,-71.054812,5214,Customer,2124
4,752,2022-01-01 00:01:06.0520,2022-01-01 00:13:38.2300,19,Park Dr at Buswell St,42.347241,-71.105301,41,Packard's Corner - Commonwealth Ave at Brighto...,42.352261,-71.123831,2214,Subscriber,2215


In [4]:
#transforming the dates from object to datetime
for date_column in ['starttime','stoptime']:
    data[date_column] = pd.to_datetime(data[date_column], format='%Y-%m-%d %H:%M:%S')

In [5]:
def DatetimeInterval(df,freq):
    df1 = df.set_index('starttime')
    df1 = pd.get_dummies(df1, columns=['usertype'])
    df1 = df1.resample(rule=freq, label='left', origin='start_day').sum()
    pickups = df1.loc[:,['usertype_Customer','usertype_Subscriber']]
    pickups['pickups'] = pickups.loc[:,['usertype_Customer','usertype_Subscriber']].sum(axis=1)
    #pickups = pickups['tripduration'].groupby(pd.Grouper(freq=freq, label='left', origin='start_day')).count()    
    pickups = pd.DataFrame(pickups)
    #pickups.rename(columns={'tripduration':'pickups'}, inplace=True)
    
    return pickups

In [6]:
df15 = DatetimeInterval(data, freq='15Min')
df30 = DatetimeInterval(data, freq='30Min')
df60 = DatetimeInterval(data, freq='60Min')
df120 = DatetimeInterval(data, freq='120Min')

### Weather Data

In [7]:
weather_data = pd.read_csv('Data/WeatherData', index_col=0)

In [8]:
weather_data.head()

Unnamed: 0,name,datetime,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
0,Boston,2022-01-01T00:00:00,7.8,7.8,6.7,92.5,0.0,0,,0.0,...,1014.5,100.0,8.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
1,Boston,2022-01-01T01:00:00,7.2,6.5,6.7,96.49,0.0,0,,0.0,...,1014.1,100.0,5.1,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2,Boston,2022-01-01T02:00:00,7.2,6.0,6.7,96.49,0.0,0,,0.0,...,1014.2,100.0,4.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
3,Boston,2022-01-01T03:00:00,7.2,7.2,6.7,96.6,0.0,0,,0.0,...,1014.1,100.0,1.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
4,Boston,2022-01-01T04:00:00,6.8,5.4,6.7,99.79,0.0,0,,0.0,...,1013.6,100.0,0.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."


In [9]:
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'], format='%Y-%m-%dT%H:%M:%S')
weather_data.drop('name',axis=1, inplace=True)
weather_data.set_index('datetime', inplace=True)

In [10]:
weather_data.head()

Unnamed: 0_level_0,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,windgust,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,7.8,7.8,6.7,92.5,0.0,0,,0.0,0.0,,...,1014.5,100.0,8.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 01:00:00,7.2,6.5,6.7,96.49,0.0,0,,0.0,0.0,,...,1014.1,100.0,5.1,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 02:00:00,7.2,6.0,6.7,96.49,0.0,0,,0.0,0.0,,...,1014.2,100.0,4.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 03:00:00,7.2,7.2,6.7,96.6,0.0,0,,0.0,0.0,,...,1014.1,100.0,1.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 04:00:00,6.8,5.4,6.7,99.79,0.0,0,,0.0,0.0,,...,1013.6,100.0,0.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."


In [11]:
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5831 entries, 2022-01-01 00:00:00 to 2022-08-31 23:00:00
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   temp              5831 non-null   float64
 1   feelslike         5831 non-null   float64
 2   dew               5831 non-null   float64
 3   humidity          5831 non-null   float64
 4   precip            5831 non-null   float64
 5   precipprob        5831 non-null   int64  
 6   preciptype        547 non-null    object 
 7   snow              5831 non-null   float64
 8   snowdepth         5831 non-null   float64
 9   windgust          5675 non-null   float64
 10  windspeed         5831 non-null   float64
 11  winddir           5831 non-null   float64
 12  sealevelpressure  5831 non-null   float64
 13  cloudcover        5831 non-null   float64
 14  visibility        5831 non-null   float64
 15  solarradiation    5691 non-null   float64
 16  solare

In [12]:
weather_data.isna().sum()

temp                   0
feelslike              0
dew                    0
humidity               0
precip                 0
precipprob             0
preciptype          5284
snow                   0
snowdepth              0
windgust             156
windspeed              0
winddir                0
sealevelpressure       0
cloudcover             0
visibility             0
solarradiation       140
solarenergy         2547
uvindex              140
severerisk           230
conditions             0
icon                   0
stations               0
dtype: int64

In [13]:
weather_data.head()

Unnamed: 0_level_0,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,windgust,...,sealevelpressure,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,7.8,7.8,6.7,92.5,0.0,0,,0.0,0.0,,...,1014.5,100.0,8.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 01:00:00,7.2,6.5,6.7,96.49,0.0,0,,0.0,0.0,,...,1014.1,100.0,5.1,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 02:00:00,7.2,6.0,6.7,96.49,0.0,0,,0.0,0.0,,...,1014.2,100.0,4.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 03:00:00,7.2,7.2,6.7,96.6,0.0,0,,0.0,0.0,,...,1014.1,100.0,1.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."
2022-01-01 04:00:00,6.8,5.4,6.7,99.79,0.0,0,,0.0,0.0,,...,1013.6,100.0,0.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591..."


temp

In [14]:
weather_data['temp'].describe()

count    5831.000000
mean       12.352478
std        11.056270
min       -15.600000
25%         4.300000
50%        12.900000
75%        21.600000
max        37.200000
Name: temp, dtype: float64

feelslike

In [15]:
weather_data['feelslike'].describe()

count    5831.000000
mean       10.596210
std        13.410548
min       -26.500000
25%         0.800000
50%        12.900000
75%        21.600000
max        39.500000
Name: feelslike, dtype: float64

Humidity

In [16]:
weather_data['humidity'].describe()

count    5831.000000
mean       62.503353
std        20.229678
min        15.130000
25%        46.120000
50%        61.270000
75%        79.645000
max        99.940000
Name: humidity, dtype: float64

precip

In [17]:
weather_data['precip'].describe()

count    5831.000000
mean        0.077047
std         0.451573
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        10.546000
Name: precip, dtype: float64

In [18]:
weather_data[weather_data['precip']>0]['conditions'].unique()

array(['Rain, Overcast', 'Snow, Rain, Overcast', 'Snow, Overcast',
       'Rain, Partially cloudy', 'Snow, Ice, Overcast',
       'Snow, Partially cloudy'], dtype=object)

snow

In [19]:
weather_data['snow'].describe()

count    5831.000000
mean        0.019317
std         0.142421
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         6.600000
Name: snow, dtype: float64

In [20]:
weather_data[weather_data['snow']>2]['conditions'].unique()

array(['Partially cloudy'], dtype=object)

Preciptype

In [21]:
weather_data['preciptype'].nunique()

5

In [22]:
weather_data['preciptype'].unique()

array([nan, 'rain', 'rain,snow', 'snow', 'freezingrain', 'snow,ice'],
      dtype=object)

In [23]:
weather_data[weather_data['preciptype']=='snow'][['snow','snowdepth']]

Unnamed: 0_level_0,snow,snowdepth
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-07 05:00:00,1.04,21.88
2022-01-24 00:00:00,0.00,0.10
2022-01-25 00:00:00,0.13,2.00
2022-01-25 01:00:00,0.13,2.13
2022-01-29 02:00:00,0.42,7.50
...,...,...
2022-03-03 03:00:00,0.70,13.42
2022-03-03 04:00:00,1.10,13.33
2022-03-03 05:00:00,0.40,13.25
2022-03-12 20:00:00,1.30,1.70


In [24]:
def PreciptypeMap(row):
    if row == 'rain':
        return 2
    elif row == 'rain,snow':
        return 3
    elif row == 'snow':
        return 4
    elif row in ['freezingrain','snow,ice']:
        return 5
    else:
        return 0

In [25]:
weather_data['enc_preciptype'] = weather_data['preciptype'].map(lambda row: PreciptypeMap(row))

In [26]:
weather_data.head()

Unnamed: 0_level_0,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,windgust,...,cloudcover,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations,enc_preciptype
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,7.8,7.8,6.7,92.5,0.0,0,,0.0,0.0,,...,100.0,8.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0
2022-01-01 01:00:00,7.2,6.5,6.7,96.49,0.0,0,,0.0,0.0,,...,100.0,5.1,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0
2022-01-01 02:00:00,7.2,6.0,6.7,96.49,0.0,0,,0.0,0.0,,...,100.0,4.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0
2022-01-01 03:00:00,7.2,7.2,6.7,96.6,0.0,0,,0.0,0.0,,...,100.0,1.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0
2022-01-01 04:00:00,6.8,5.4,6.7,99.79,0.0,0,,0.0,0.0,,...,100.0,0.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0


Conditions

In [27]:
weather_data['conditions'].unique()

array(['Overcast', 'Rain, Overcast', 'Partially cloudy', 'Clear',
       'Snow, Rain, Overcast', 'Snow, Overcast', 'Rain, Partially cloudy',
       'Snow, Ice, Overcast', 'Snow, Partially cloudy'], dtype=object)

In [28]:
def ConditionsMap(row):
    if row in ['Overcast','Partially cloudy','Clear']:
        return 1
    elif row in ['Rain, Overcast','Rain, Partially cloudy','Snow, Partially cloudy']:
        return 2
    elif row in ['Snow, Rain, Overcast','Snow, Overcast','Snow, Ice, Overcast']:
        return 3

In [29]:
weather_data['enc_conditions'] = weather_data['conditions'].map(lambda row: ConditionsMap(row))

In [30]:
weather_data.head()

Unnamed: 0_level_0,temp,feelslike,dew,humidity,precip,precipprob,preciptype,snow,snowdepth,windgust,...,visibility,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations,enc_preciptype,enc_conditions
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 00:00:00,7.8,7.8,6.7,92.5,0.0,0,,0.0,0.0,,...,8.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0,1
2022-01-01 01:00:00,7.2,6.5,6.7,96.49,0.0,0,,0.0,0.0,,...,5.1,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0,1
2022-01-01 02:00:00,7.2,6.0,6.7,96.49,0.0,0,,0.0,0.0,,...,4.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0,1
2022-01-01 03:00:00,7.2,7.2,6.7,96.6,0.0,0,,0.0,0.0,,...,1.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0,1
2022-01-01 04:00:00,6.8,5.4,6.7,99.79,0.0,0,,0.0,0.0,,...,0.0,,,,,Overcast,cloudy,"KOWD,72509854704,KBED,KBOS,72509014739,7250591...",0,1


Severerisk

In [31]:
weather_data['severerisk'].unique()

array([nan, 10.,  3.,  5., 30., 15.,  8., 60., 75.])

In [32]:
weather_data['severerisk'].fillna(weather_data['severerisk'].median(), inplace=True)

solar radiation

In [33]:
weather_data['solarradiation'].unique()

array([      nan, 1.100e+01, 2.400e+01, 2.100e+01, 2.600e+01, 2.700e+01,
       4.500e+01, 3.200e+01, 1.200e+01, 7.000e+00, 2.500e+01, 3.600e+01,
       7.000e+01, 8.200e+01, 4.300e+01, 5.000e+01, 3.000e+01, 9.000e+00,
       5.200e+01, 6.400e+01, 7.400e+01, 6.800e+01, 4.000e+01, 3.900e+01,
       1.800e+01, 2.300e+01, 1.540e+02, 2.750e+02, 3.530e+02, 3.800e+02,
       3.660e+02, 3.010e+02, 2.360e+02, 1.600e+01, 5.000e+00, 3.700e+01,
       1.050e+02, 1.110e+02, 7.500e+01, 5.600e+01, 7.200e+01, 4.400e+01,
       1.500e+01, 2.580e+02, 3.190e+02, 2.220e+02, 1.970e+02, 1.990e+02,
       1.270e+02, 2.000e+01, 1.000e+01, 3.500e+01, 5.300e+01, 1.450e+02,
       9.800e+01, 6.700e+01, 1.350e+02, 2.820e+02, 3.630e+02, 3.950e+02,
       3.810e+02, 2.720e+02, 1.860e+02, 1.700e+01, 2.900e+01, 5.700e+01,
       1.920e+02, 1.520e+02, 9.200e+01, 1.340e+02, 2.860e+02, 3.960e+02,
       4.010e+02, 3.990e+02, 3.150e+02, 1.480e+02, 3.300e+01, 0.000e+00,
       1.330e+02, 3.170e+02, 3.670e+02, 3.790e+02, 

In [34]:
weather_data['solarradiation'].min()

0.0

In [35]:
#weather_data[weather_data['solarradiation'].isna()==True]
weather_data[weather_data['solarradiation']==0.0]['severerisk'].unique()

array([10.,  3.,  5., 15., 30., 60.,  8.])

In [36]:
weather_data[weather_data['solarradiation']==0.0]['enc_conditions'].unique()

array([1, 2, 3])

In [37]:
weather_data[weather_data['solarradiation'].isna()==True]['severerisk'].unique()

array([10.])

In [38]:
sorted(weather_data[weather_data['solarradiation'].isna()==True].index)

[Timestamp('2022-01-01 00:00:00'),
 Timestamp('2022-01-01 01:00:00'),
 Timestamp('2022-01-01 02:00:00'),
 Timestamp('2022-01-01 03:00:00'),
 Timestamp('2022-01-01 04:00:00'),
 Timestamp('2022-01-01 05:00:00'),
 Timestamp('2022-01-01 06:00:00'),
 Timestamp('2022-01-01 07:00:00'),
 Timestamp('2022-01-01 17:00:00'),
 Timestamp('2022-01-01 18:00:00'),
 Timestamp('2022-01-01 19:00:00'),
 Timestamp('2022-01-01 20:00:00'),
 Timestamp('2022-01-01 21:00:00'),
 Timestamp('2022-01-01 22:00:00'),
 Timestamp('2022-01-01 23:00:00'),
 Timestamp('2022-01-02 00:00:00'),
 Timestamp('2022-01-02 01:00:00'),
 Timestamp('2022-01-02 02:00:00'),
 Timestamp('2022-01-02 03:00:00'),
 Timestamp('2022-01-02 04:00:00'),
 Timestamp('2022-01-02 05:00:00'),
 Timestamp('2022-01-02 06:00:00'),
 Timestamp('2022-01-02 07:00:00'),
 Timestamp('2022-01-02 17:00:00'),
 Timestamp('2022-01-02 18:00:00'),
 Timestamp('2022-01-02 19:00:00'),
 Timestamp('2022-01-02 20:00:00'),
 Timestamp('2022-01-02 21:00:00'),
 Timestamp('2022-01-

In [39]:
weather_data['solarradiation'].fillna(0.0, inplace=True)

UVindex

In [40]:
weather_data['uvindex'].unique()

array([nan,  0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [41]:
sorted(weather_data[weather_data['uvindex'].isna()==True].index)

[Timestamp('2022-01-01 00:00:00'),
 Timestamp('2022-01-01 01:00:00'),
 Timestamp('2022-01-01 02:00:00'),
 Timestamp('2022-01-01 03:00:00'),
 Timestamp('2022-01-01 04:00:00'),
 Timestamp('2022-01-01 05:00:00'),
 Timestamp('2022-01-01 06:00:00'),
 Timestamp('2022-01-01 07:00:00'),
 Timestamp('2022-01-01 17:00:00'),
 Timestamp('2022-01-01 18:00:00'),
 Timestamp('2022-01-01 19:00:00'),
 Timestamp('2022-01-01 20:00:00'),
 Timestamp('2022-01-01 21:00:00'),
 Timestamp('2022-01-01 22:00:00'),
 Timestamp('2022-01-01 23:00:00'),
 Timestamp('2022-01-02 00:00:00'),
 Timestamp('2022-01-02 01:00:00'),
 Timestamp('2022-01-02 02:00:00'),
 Timestamp('2022-01-02 03:00:00'),
 Timestamp('2022-01-02 04:00:00'),
 Timestamp('2022-01-02 05:00:00'),
 Timestamp('2022-01-02 06:00:00'),
 Timestamp('2022-01-02 07:00:00'),
 Timestamp('2022-01-02 17:00:00'),
 Timestamp('2022-01-02 18:00:00'),
 Timestamp('2022-01-02 19:00:00'),
 Timestamp('2022-01-02 20:00:00'),
 Timestamp('2022-01-02 21:00:00'),
 Timestamp('2022-01-

In [42]:
weather_data['uvindex'].fillna(0.0, inplace=True)

In [43]:
weather_data.drop(['dew','sealevelpressure','cloudcover','solarenergy','icon','windgust','winddir','solarradiation','precipprob','conditions','preciptype','stations'], axis=1, inplace=True)

In [44]:
weather_data.head()

Unnamed: 0_level_0,temp,feelslike,humidity,precip,snow,snowdepth,windspeed,visibility,uvindex,severerisk,enc_preciptype,enc_conditions
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-01-01 00:00:00,7.8,7.8,92.5,0.0,0.0,0.0,0.1,8.0,0.0,10.0,0,1
2022-01-01 01:00:00,7.2,6.5,96.49,0.0,0.0,0.0,5.3,5.1,0.0,10.0,0,1
2022-01-01 02:00:00,7.2,6.0,96.49,0.0,0.0,0.0,7.3,4.0,0.0,10.0,0,1
2022-01-01 03:00:00,7.2,7.2,96.6,0.0,0.0,0.0,0.1,1.0,0.0,10.0,0,1
2022-01-01 04:00:00,6.8,5.4,99.79,0.0,0.0,0.0,7.4,0.0,0.0,10.0,0,1


In [45]:
weather_data15 = weather_data.resample('15Min').fillna(method='ffill')
weather_data30 = weather_data.resample('20Min').fillna(method='ffill')
weather_data60 = weather_data
weather_data120 = weather_data.resample('2H').mean()

# Connect Data

In [46]:
df15 = DatetimeInterval(data, freq='15Min')
df30 = DatetimeInterval(data, freq='30Min')
df60 = DatetimeInterval(data, freq='60Min')
df120 = DatetimeInterval(data, freq='120Min')

In [47]:
weather_data15 = weather_data.resample('15Min').fillna(method='ffill')
weather_data30 = weather_data.resample('30Min').fillna(method='ffill')
weather_data60 = weather_data
weather_data120 = weather_data.resample('2H').mean()

In [48]:
def MergingDataFrames(wd, df):
    pickups = pd.merge(wd, df, left_index=True, right_index=True)
    return pickups

In [49]:
picksup15 = MergingDataFrames(weather_data15, df15)
picksup30 = MergingDataFrames(weather_data30, df30)
picksup60 = MergingDataFrames(weather_data60, df60)
picksup120 = MergingDataFrames(weather_data120, df120)

In [50]:
picksup15.head()

Unnamed: 0,temp,feelslike,humidity,precip,snow,snowdepth,windspeed,visibility,uvindex,severerisk,enc_preciptype,enc_conditions,usertype_Customer,usertype_Subscriber,pickups
2022-01-01 00:00:00,7.8,7.8,92.5,0.0,0.0,0.0,0.1,8.0,0.0,10.0,0,1,7.0,23.0,30.0
2022-01-01 00:15:00,7.8,7.8,92.5,0.0,0.0,0.0,0.1,8.0,0.0,10.0,0,1,6.0,30.0,36.0
2022-01-01 00:30:00,7.8,7.8,92.5,0.0,0.0,0.0,0.1,8.0,0.0,10.0,0,1,7.0,22.0,29.0
2022-01-01 00:45:00,7.8,7.8,92.5,0.0,0.0,0.0,0.1,8.0,0.0,10.0,0,1,8.0,22.0,30.0
2022-01-01 01:00:00,7.2,6.5,96.49,0.0,0.0,0.0,5.3,5.1,0.0,10.0,0,1,5.0,16.0,21.0


Weekend/Weekday & Holidays & Month/Hour/Minute & Season

In [51]:
us_holidays = holidays.US()
season_month =  {1:'Winter', 2:'Winter',
                3:'Spring', 4:'Spring', 5:'Spring',
                6:'Summer', 7:'Summer', 8:'Summer'}

In [52]:
def DataPreprocess(pickup):
    pickup['workingday'] = pickup.index.map(lambda row: 1 if row.dayofweek < 5 else 0)
    pickup['holiday'] = pickup.index.map(lambda row: 1 if row in us_holidays else 0)
    pickup['month'] = pickup.index.month
    pickup['hour'] = pickup.index.hour
    pickup['minute'] = pickup.index.minute
    pickup.reset_index(drop=True, inplace=True)
    pickup['season'] = pickup['month'].map(season_month)
    
    return pickup

In [53]:
for pickup in [picksup15,picksup30, picksup60, picksup120]:
    pickup = DataPreprocess(pickup)
    pickups = pd.get_dummies(pickup, 'season')

In [54]:
picksup15 = pd.get_dummies(picksup15, 'season')
picksup30 = pd.get_dummies(picksup30, 'season')
picksup60 = pd.get_dummies(picksup60, 'season')
picksup120 = pd.get_dummies(picksup120, 'season')

In [55]:
picksup120.head()

Unnamed: 0,temp,feelslike,humidity,precip,snow,snowdepth,windspeed,visibility,uvindex,severerisk,...,usertype_Subscriber,pickups,workingday,holiday,month,hour,minute,season_Spring,season_Summer,season_Winter
0,7.5,7.15,94.495,0.0,0.0,0.0,2.7,6.55,0.0,10.0,...,158.0,217.0,0,1,1,0,0,0,0,1
1,7.2,6.6,96.545,0.0,0.0,0.0,3.7,2.5,0.0,10.0,...,82.0,168.0,0,1,1,2,0,0,0,1
2,6.8,5.2,99.82,0.0,0.0,0.0,8.2,0.0,0.0,10.0,...,104.0,215.0,0,1,1,4,0,0,0,1
3,7.05,5.8,99.68,0.0,0.0,0.0,7.1,0.1,0.0,10.0,...,94.0,283.0,0,1,1,6,0,0,0,1
4,7.9,6.3,99.79,0.0,0.0,0.0,9.2,0.05,0.0,10.0,...,18.0,56.0,0,1,1,8,0,0,0,1


## MODELLING

In [88]:
def rmse(y_pred, y_true):
    return np.sqrt(np.mean((y_pred - y_true)**2))

In [89]:
def mae(y_pred, y_true):
    return np.mean(np.abs(y_pred - y_true))

In [90]:
def mape(y_pred, y_true):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [91]:
def r2(y_pred, y_true):
    return max(0, 1 - np.sum((y_true-y_pred)**2) / np.sum((y_true - np.mean(y_true))**2))

In [92]:
def adjustedr2(y_pred, y_true, nvariables):
    r_squared = max(0, 1 - np.sum((y_true-y_pred)**2) / np.sum((y_true - np.mean(y_true))**2))
    return 1 - (1-r_squared)*(len(y_true)-1)/(len(y_true)-nvariables-1)

In [62]:
def rmsle(y_true, y_pred, convertExp=True):
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)
        
    log_true = np.nan_to_num(np.log(y_true+1))
    log_pred = np.nan_to_num(np.log(y_pred+1))
    
    output = np.sqrt(np.mean((log_true - log_pred)**2))
    
    return output

In [136]:
X_train = picksup60[(picksup60['month']!=8) & (picksup60['hour']<=9)].drop(['pickups','usertype_Subscriber','usertype_Customer','month','hour','minute'], axis=1)
y_train = picksup60[(picksup60['month']!=8) & (picksup60['hour']<=9)]['pickups']
X_test = picksup60[(picksup60['month']==8) & (picksup60['hour']==10)].drop(['pickups','usertype_Subscriber','usertype_Customer','month','hour','minute'], axis=1)
y_test = picksup60[(picksup60['month']==8) & (picksup60['hour']==10)]['pickups']

In [137]:
LRM = LinearRegression()
log_ytest, log_ytrain = np.log1p(y_test), np.log1p(y_train)  # Log Transformation of Target Value y
LRM.fit(X_train, log_ytrain) 

# Step 3 : Predict
preds = LRM.predict(X_test)

# Step 4 : Evaluate
print ('Linear Regression RMSLE:', rmsle(log_ytest, preds, True))

Linear Regression RMSLE: 2.7004937542056946


In [143]:
# Step 1: Create Model
RM = Ridge()

# Step 2-1 : Create GridSearchCV Object
# Hyper-parameter List
ridge_params = {'max_iter':[3000], 'alpha':[0.1, 1, 2, 3, 4, 10, 30, 100, 200, 300, 400, 800, 900, 1000]}
# Evaluate Function for Cross-Validation (RMSLE score)
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) 
# Create GridSearchCV Object (with Ridge)
gridsearch_RM = GridSearchCV(estimator=RM,
                                      param_grid=ridge_params,
                                      scoring=rmsle_scorer,
                                      cv=5)

# Step 2-2 : Perform Grid Search
#log_y = np.log1p(y) # Log Transformation of Target Value y
gridsearch_RM.fit(X_train, log_ytrain) # Train (Grid Search)

print('Best Parameter:', gridsearch_RM.best_params_)

# Step 3 : Predict
preds = gridsearch_RM.best_estimator_.predict(X_test)

# Step 4 : Evaluate
print('Ridge Regression RMSLE:', rmsle(log_ytest, preds, True))

Best Parameter: {'alpha': 30, 'max_iter': 3000}
Ridge Regression RMSLE: 2.6462931428006615


In [144]:
def compute_error(y_true, y_pred, nvariables = 2):
    #corr = np.corrcoef(y_pred[:,0], y_true[:,0])[0,1]
    #rho2 = pearsonr(y_pred[:,0], y_true[:,0])[0]
    mae = np.mean(np.abs(y_pred - y_true))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    rmse = np.sqrt(np.mean((y_pred - y_true)**2))
    r_squared = max(0, 1 - np.sum((y_true-y_pred)**2) / np.sum((y_true - np.mean(y_true))**2))
    adjustedr2 = 1 - (1-r_squared)*(len(y_true)-1)/(len(y_true)-nvariables-1)
    return  mae, mape, rmse, r_squared, adjustedr2

In [145]:
mae, mape, rmse, r_squared, adjustedr2 = compute_error(log_ytest, LRM.predict(X_test))

#print("Pearson’s correlation coefficient = ", rho2)
print("Mean Absolute Error = ", mae)
print("Mean Absolute Percentage Error = ", mape)
print("R-squared = ", r_squared)
print("Adjusted R-squared = ", adjustedr2)
print("Root Mean Squared Error = ", rmse)

Mean Absolute Error =  2.620510756244481
Mean Absolute Percentage Error =  47.338448610833375
R-squared =  0
Adjusted R-squared =  -0.0714285714285714
Root Mean Squared Error =  2.771129172249078


In [146]:
X_train

Unnamed: 0,temp,feelslike,humidity,precip,snow,snowdepth,windspeed,visibility,uvindex,severerisk,enc_preciptype,enc_conditions,workingday,holiday,season_Spring,season_Summer,season_Winter
0,7.8,7.8,92.50,0.0,0.0,0.0,0.1,8.0,0.0,10.0,0,1,0,1,0,0,1
1,7.2,6.5,96.49,0.0,0.0,0.0,5.3,5.1,0.0,10.0,0,1,0,1,0,0,1
2,7.2,6.0,96.49,0.0,0.0,0.0,7.3,4.0,0.0,10.0,0,1,0,1,0,0,1
3,7.2,7.2,96.60,0.0,0.0,0.0,0.1,1.0,0.0,10.0,0,1,0,1,0,0,1
4,6.8,5.4,99.79,0.0,0.0,0.0,7.4,0.0,0.0,10.0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5068,20.9,20.9,61.87,0.0,0.0,0.0,9.0,16.0,0.0,10.0,0,1,0,0,0,1,0
5069,20.9,20.9,61.87,0.0,0.0,0.0,12.5,15.7,0.0,10.0,0,1,0,0,0,1,0
5070,21.6,21.6,61.62,0.0,0.0,0.0,12.5,16.0,1.0,10.0,0,1,0,0,0,1,0
5071,23.8,23.8,53.93,0.0,0.0,0.0,14.2,16.0,2.0,10.0,0,1,0,0,0,1,0


In [147]:
X_test

Unnamed: 0,temp,feelslike,humidity,precip,snow,snowdepth,windspeed,visibility,uvindex,severerisk,enc_preciptype,enc_conditions,workingday,holiday,season_Spring,season_Summer,season_Winter
5097,24.4,24.4,64.6,0.0,0.0,0.0,12.9,16.0,4.0,10.0,0,1,1,0,0,1,0
5121,26.2,26.2,64.22,0.0,0.0,0.0,14.8,16.0,5.0,10.0,0,1,1,0,0,1,0
5145,25.7,25.7,55.58,0.0,0.0,0.0,16.2,16.0,5.0,10.0,0,1,1,0,0,1,0
5169,31.1,32.2,46.63,0.0,0.0,0.0,16.6,16.0,5.0,3.0,0,1,1,0,0,1,0
5193,31.1,33.6,53.8,0.0,0.0,0.0,7.6,16.0,4.0,30.0,0,1,1,0,0,1,0
5217,28.9,32.1,67.37,0.0,0.0,0.0,12.7,16.0,4.0,60.0,0,1,0,0,0,1,0
5241,31.2,35.0,59.17,0.0,0.0,0.0,18.3,16.0,4.0,60.0,0,1,0,0,0,1,0
5265,31.1,34.9,59.29,0.0,0.0,0.0,13.1,16.0,4.0,30.0,0,1,1,0,0,1,0
5289,30.7,34.4,60.79,0.0,0.0,0.0,24.0,16.0,3.0,30.0,0,1,1,0,0,1,0
5313,20.7,20.7,80.54,0.0,0.0,0.0,11.0,16.0,3.0,10.0,0,1,1,0,0,1,0


In [148]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31 entries, 5097 to 5817
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   temp            31 non-null     float64
 1   feelslike       31 non-null     float64
 2   humidity        31 non-null     float64
 3   precip          31 non-null     float64
 4   snow            31 non-null     float64
 5   snowdepth       31 non-null     float64
 6   windspeed       31 non-null     float64
 7   visibility      31 non-null     float64
 8   uvindex         31 non-null     float64
 9   severerisk      31 non-null     float64
 10  enc_preciptype  31 non-null     int64  
 11  enc_conditions  31 non-null     int64  
 12  workingday      31 non-null     int64  
 13  holiday         31 non-null     int64  
 14  season_Spring   31 non-null     uint8  
 15  season_Summer   31 non-null     uint8  
 16  season_Winter   31 non-null     uint8  
dtypes: float64(10), int64(4), uint8(