In [434]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
%matplotlib inline
import os
import pickle
import re
from datetime import datetime as dt

from tqdm import tqdm
from tqdm import tnrange
from tqdm import tqdm_notebook

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from scipy.stats import norm
import statsmodels.api as sm
from pandas.tools.plotting import autocorrelation_plot
from dateutil.relativedelta import relativedelta
from datetime import datetime as dt
from datetime import timedelta

from sklearn import cross_validation, grid_search, linear_model, metrics, pipeline, preprocessing

### Weather Data

In [435]:
with open('processed_weather.pkl', 'rb') as f:
    weather_df = pickle.load(f)

In [436]:
# Set index to measurement time and sort
weather_df = weather_df.set_index('MeasurementTime')
weather_df = weather_df.sort_index()

# Subset on 2015-2017
weather_df = weather_df[weather_df.index >= dt.strptime("2015-01-01", "%Y-%m-%d")]
weather_df = weather_df[weather_df.index < dt.strptime("2018-01-01", "%Y-%m-%d")]

# 
weather_df = weather_df[~weather_df.TempC.isna()]
weather_df = weather_df.convert_objects(convert_numeric=True)

# Fix swapped temperature columns
weather_df = weather_df.rename(columns={'TempC': 'TempF', 'TempF': 'TempC'})

# Limit index to hour
weather_df.index = pd.to_datetime(weather_df.index.strftime("%Y-%m-%d %H"))

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  # This is added back by InteractiveShellApp.init_path()


In [437]:
def encode_weather(weather):
    
    joined = ''.join(weather).replace(',', '')
    if len(joined) == 0:
        return 1 #'Clear'       
    else:
        heavy = re.compile('[\w\s]*heavy[\w\s]*', flags=re.I)
        moderate = re.compile('[\w\s]*(rain|snow|ice)[\w\s]*', flags=re.I)
        light = re.compile('[\w\s]*(mist|drizzle)[\w\s]*', flags=re.I)
        if len(heavy.findall(joined)) > 0:
            return 4 #'Severe'
        elif len(moderate.findall(joined)) > 0:
            return 3 #'Moderate'
        elif len(light.findall(joined)) > 0:
            return 2 #'Light'
        else: 
            return 1 #'Clear'
        
weather_df['Weather'] = weather_df.WeatherText.apply(encode_weather)

In [438]:
weather_df.head()

Unnamed: 0,ReportType,SkyConditions,Visibility,WeatherType,TempF,TempC,Humidity,WindSpeed,WindDirection,Pressure,PressureTendency,PressureChange,Precip,SkyText,WeatherText,Weather
2015-01-01 00:00:00,FM-15,SCT:04 65,10.0,,22.0,-5.6,50.0,10.0,220.0,30.07,8.0,,0.0,Scattered,"[, , ]",1
2015-01-01 01:00:00,FM-15,FEW:02 65 BKN:07 180,10.0,,22.0,-5.6,50.0,10.0,230.0,30.05,,,0.0,Broken,"[, , ]",1
2015-01-01 02:00:00,FM-15,BKN:07 150 BKN:07 200,10.0,,22.0,-5.6,50.0,11.0,230.0,30.03,,,0.0,Broken,"[, , ]",1
2015-01-01 03:00:00,FM-15,SCT:04 130 BKN:07 170,10.0,,24.0,-4.4,46.0,13.0,240.0,30.01,6.0,,0.0,Broken,"[, , ]",1
2015-01-01 04:00:00,FM-15,CLR:00,10.0,,22.0,-5.6,52.0,15.0,230.0,30.0,,,0.0,Clear,"[, , ]",1


In [439]:
# Select columns of interest
weather_cols = ['Weather', 'TempF', 'TempC', 'Humidity', 'WindSpeed', 'WindDirection', 'Pressure', 'Precip', ]
weather_df = weather_df[weather_cols]
weather_df.head()

Unnamed: 0,Weather,TempF,TempC,Humidity,WindSpeed,WindDirection,Pressure,Precip
2015-01-01 00:00:00,1,22.0,-5.6,50.0,10.0,220.0,30.07,0.0
2015-01-01 01:00:00,1,22.0,-5.6,50.0,10.0,230.0,30.05,0.0
2015-01-01 02:00:00,1,22.0,-5.6,50.0,11.0,230.0,30.03,0.0
2015-01-01 03:00:00,1,24.0,-4.4,46.0,13.0,240.0,30.01,0.0
2015-01-01 04:00:00,1,22.0,-5.6,52.0,15.0,230.0,30.0,0.0


In [440]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26285 entries, 2015-01-01 00:00:00 to 2017-12-31 23:00:00
Data columns (total 8 columns):
Weather          26285 non-null int64
TempF            26282 non-null float64
TempC            26282 non-null float64
Humidity         26283 non-null float64
WindSpeed        26278 non-null float64
WindDirection    25924 non-null float64
Pressure         26239 non-null float64
Precip           24364 non-null float64
dtypes: float64(7), int64(1)
memory usage: 3.1 MB


### Trip Data

In [441]:
with open('start.pkl', 'rb') as f:
    start_df = pickle.load(f)

In [442]:
# Add work day to df
start_df['WorkDay'] = (start_df.index.weekday < 6) * 1

# # Encode user type
# start_df.UserType = start_df.UserType.map({'Member': 1, 'Casual': 2})

In [443]:
def get_season(doy):
    # "day of year" ranges for the northern hemisphere
    spring = range(80, 172)
    summer = range(172, 264)
    fall = range(264, 355)
    # winter = everything else

    if doy in spring:
        season = 'Spring'
    elif doy in summer:
        season = 'Summer'
    elif doy in fall:
        season = 'Fall'
    else:
        season = 'Winter'
    return season

start_df['Season'] = [get_season(xi) for xi in start_df.index.dayofyear]

In [444]:
# Add MA holidays
from workalendar.usa import Massachusetts

cal = Massachusetts()
ma_holidays = []
for yr in [2015, 2016, 2017]:
    yr_holidays = cal.holidays(yr)
    yr_holidays = [x[0] for x in yr_holidays]
    ma_holidays += yr_holidays

def get_holidays(d):
    if d.date() in ma_holidays:
        return 1
    else:
        return 0

start_df.Holiday = [get_holidays(d) for d in start_df.index]    

# # Designate Holidays
# start_df['Holiday'] = 0    
# idx_date = start_df.index.date
# for hdate in ma_holidays:
#     idx = (idx_date == hdate)
#     start_df.loc[idx, 'Holiday'] = 1



In [445]:
start_df.head()

Unnamed: 0_level_0,Duration,BikeID,UserType,BirthYear,ID,Name,Lat,Lon,Year,Month,Week,Day,Hour,Quarter,DayOfWeek,WorkDay,Season
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-01-01 00:21:44,542,277,Member,1984,115,Porter Square Station,42.387995,-71.119084,2015,1,1,1,0,1,3,1,Winter
2015-01-01 00:27:03,438,648,Member,1985,80,MIT Stata Center at Vassar St / Main St,42.361962,-71.092053,2015,1,1,1,0,1,3,1,Winter
2015-01-01 00:31:31,254,555,Member,1974,91,One Kendall Square at Hampshire St / Portland St,42.366277,-71.09169,2015,1,1,1,0,1,3,1,Winter
2015-01-01 00:53:46,432,1307,Member,1987,115,Porter Square Station,42.387995,-71.119084,2015,1,1,1,0,1,3,1,Winter
2015-01-01 01:07:06,735,177,Casual,1986,105,Lower Cambridgeport at Magazine St/Riverside Rd,42.356954,-71.113687,2015,1,1,1,1,1,3,1,Winter


In [446]:
start_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3627543 entries, 2015-01-01 00:21:44 to 2017-12-31 23:46:18
Data columns (total 17 columns):
Duration     int64
BikeID       int64
UserType     object
BirthYear    object
ID           int64
Name         object
Lat          float64
Lon          float64
Year         int64
Month        int64
Week         int64
Day          int64
Hour         int64
Quarter      int64
DayOfWeek    int64
WorkDay      int64
Season       object
dtypes: float64(2), int64(11), object(4)
memory usage: 658.2+ MB


### Resampling and Linking Data

In [447]:
sample_rate = 'H'

#### Resample and Fix Weather Data

In [448]:
# data_df = pd.concat([weather_df, pd.get_dummies(weather_df.Weather)], axis=1)
# data_df.head()

In [449]:
# R_weather = weather_df.resample('H').mean()
# R_weather = weather_df.resample('D').mean()
# R_weather = weather_df.resample('W').mean()
# R_weather = weather_df.resample('M').mean()
R_weather = weather_df.resample(sample_rate).mean()
R_weather.Weather = weather_df.Weather.resample(sample_rate).max()
# Interpolate to fill in NaNs
R_weather = R_weather.interpolate('linear', axis=0)
R_weather.head()

Unnamed: 0,Weather,TempF,TempC,Humidity,WindSpeed,WindDirection,Pressure,Precip
2015-01-01 00:00:00,1.0,22.0,-5.6,50.0,10.0,220.0,30.07,0.0
2015-01-01 01:00:00,1.0,22.0,-5.6,50.0,10.0,230.0,30.05,0.0
2015-01-01 02:00:00,1.0,22.0,-5.6,50.0,11.0,230.0,30.03,0.0
2015-01-01 03:00:00,1.0,24.0,-4.4,46.0,13.0,240.0,30.01,0.0
2015-01-01 04:00:00,1.0,22.0,-5.6,52.0,15.0,230.0,30.0,0.0


In [450]:
R_weather.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26304 entries, 2015-01-01 00:00:00 to 2017-12-31 23:00:00
Freq: H
Data columns (total 8 columns):
Weather          26304 non-null float64
TempF            26304 non-null float64
TempC            26304 non-null float64
Humidity         26304 non-null float64
WindSpeed        26304 non-null float64
WindDirection    26304 non-null float64
Pressure         26304 non-null float64
Precip           26304 non-null float64
dtypes: float64(8)
memory usage: 1.8 MB


#### Resample and Fix Trip Data

In [451]:
data2_df = pd.concat([start_df, pd.get_dummies(start_df.UserType)], axis=1)
data2_df = pd.concat([data2_df, pd.get_dummies(start_df.Season)], axis=1)

data2_df

In [452]:
# x = data2_df.BikeID.resample('H').count()
# x = data2_df.BikeID.resample('D').count()
# x = data2_df.BikeID.resample('W').count()
# x = data2_df.BikeID.resample('M').count()
x = data2_df.BikeID.resample(sample_rate).count()
x.name = 'Count'
# y = data2_df.resample('H').mean()
# y = data2_df.resample('D').mean()
# y = data2_df.resample('W').mean()
# y = data2_df.resample('M').mean()
y =  data2_df.resample(sample_rate).mean()
y = y.drop(columns=['BikeID', 'ID', 'Lat', 'Lon', 'Casual', 'Member', 'Fall', 'Spring', 'Summer', 'Winter'])
R_trip = pd.concat([x, y], axis=1)

In [453]:
R_trip.head()

Unnamed: 0_level_0,Count,Duration,Year,Month,Week,Day,Hour,Quarter,DayOfWeek,WorkDay
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-01 00:00:00,4,416.5,2015.0,1.0,1.0,1.0,0.0,1.0,3.0,1.0
2015-01-01 01:00:00,3,768.333333,2015.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0
2015-01-01 02:00:00,1,338.0,2015.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0
2015-01-01 03:00:00,0,,,,,,,,,
2015-01-01 04:00:00,1,429.0,2015.0,1.0,1.0,1.0,4.0,1.0,3.0,1.0


In [454]:
R_trip.Duration = R_trip.Duration.fillna(0)
R_trip.Year = R_trip.index.year
R_trip.Month = R_trip.index.month
R_trip.Week = R_trip.index.week
R_trip.Day = R_trip.index.day
R_trip.Hour = R_trip.index.hour
R_trip.Quarter = R_trip.index.quarter
R_trip.WorkDay = (R_trip.index.weekday < 6) * 1
R_trip.DayOfWeek = R_trip.index.dayofweek
R_trip['DayOfYear'] = R_trip.index.dayofyear

In [455]:
R_trip.head()

Unnamed: 0_level_0,Count,Duration,Year,Month,Week,Day,Hour,Quarter,DayOfWeek,WorkDay,DayOfYear
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01 00:00:00,4,416.5,2015,1,1,1,0,1,3,1,1
2015-01-01 01:00:00,3,768.333333,2015,1,1,1,1,1,3,1,1
2015-01-01 02:00:00,1,338.0,2015,1,1,1,2,1,3,1,1
2015-01-01 03:00:00,0,0.0,2015,1,1,1,3,1,3,1,1
2015-01-01 04:00:00,1,429.0,2015,1,1,1,4,1,3,1,1


In [456]:
R_trip['Season'] = [get_season(xi) for xi in R_trip.index.dayofyear]
R_trip.Season = R_trip.Season.map({'Winter': 4, 'Fall': 3, 'Summer': 2, 'Spring': 1})
R_trip.head()

Unnamed: 0_level_0,Count,Duration,Year,Month,Week,Day,Hour,Quarter,DayOfWeek,WorkDay,DayOfYear,Season
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01 00:00:00,4,416.5,2015,1,1,1,0,1,3,1,1,4
2015-01-01 01:00:00,3,768.333333,2015,1,1,1,1,1,3,1,1,4
2015-01-01 02:00:00,1,338.0,2015,1,1,1,2,1,3,1,1,4
2015-01-01 03:00:00,0,0.0,2015,1,1,1,3,1,3,1,1,4
2015-01-01 04:00:00,1,429.0,2015,1,1,1,4,1,3,1,1,4


In [457]:
def get_holidays(d):
    if d.date() in ma_holidays:
        return 1
    else:
        return 0

R_trip['Holiday'] = [get_holidays(d) for d in R_trip.index]

In [458]:
R_trip.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26304 entries, 2015-01-01 00:00:00 to 2017-12-31 23:00:00
Freq: H
Data columns (total 13 columns):
Count        26304 non-null int64
Duration     26304 non-null float64
Year         26304 non-null int64
Month        26304 non-null int64
Week         26304 non-null int64
Day          26304 non-null int64
Hour         26304 non-null int64
Quarter      26304 non-null int64
DayOfWeek    26304 non-null int64
WorkDay      26304 non-null int64
DayOfYear    26304 non-null int64
Season       26304 non-null int64
Holiday      26304 non-null int64
dtypes: float64(1), int64(12)
memory usage: 4.1 MB


In [459]:
R_trip.head()

Unnamed: 0_level_0,Count,Duration,Year,Month,Week,Day,Hour,Quarter,DayOfWeek,WorkDay,DayOfYear,Season,Holiday
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-01-01 00:00:00,4,416.5,2015,1,1,1,0,1,3,1,1,4,1
2015-01-01 01:00:00,3,768.333333,2015,1,1,1,1,1,3,1,1,4,1
2015-01-01 02:00:00,1,338.0,2015,1,1,1,2,1,3,1,1,4,1
2015-01-01 03:00:00,0,0.0,2015,1,1,1,3,1,3,1,1,4,1
2015-01-01 04:00:00,1,429.0,2015,1,1,1,4,1,3,1,1,4,1


#### Combine Data

In [460]:
data_df = pd.concat([R_trip, R_weather], axis=1)

In [461]:
data_df.head()

Unnamed: 0_level_0,Count,Duration,Year,Month,Week,Day,Hour,Quarter,DayOfWeek,WorkDay,DayOfYear,Season,Holiday,Weather,TempF,TempC,Humidity,WindSpeed,WindDirection,Pressure,Precip
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,4,416.5,2015,1,1,1,0,1,3,1,1,4,1,1.0,22.0,-5.6,50.0,10.0,220.0,30.07,0.0
2015-01-01 01:00:00,3,768.333333,2015,1,1,1,1,1,3,1,1,4,1,1.0,22.0,-5.6,50.0,10.0,230.0,30.05,0.0
2015-01-01 02:00:00,1,338.0,2015,1,1,1,2,1,3,1,1,4,1,1.0,22.0,-5.6,50.0,11.0,230.0,30.03,0.0
2015-01-01 03:00:00,0,0.0,2015,1,1,1,3,1,3,1,1,4,1,1.0,24.0,-4.4,46.0,13.0,240.0,30.01,0.0
2015-01-01 04:00:00,1,429.0,2015,1,1,1,4,1,3,1,1,4,1,1.0,22.0,-5.6,52.0,15.0,230.0,30.0,0.0


In [462]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26304 entries, 2015-01-01 00:00:00 to 2017-12-31 23:00:00
Freq: H
Data columns (total 21 columns):
Count            26304 non-null int64
Duration         26304 non-null float64
Year             26304 non-null int64
Month            26304 non-null int64
Week             26304 non-null int64
Day              26304 non-null int64
Hour             26304 non-null int64
Quarter          26304 non-null int64
DayOfWeek        26304 non-null int64
WorkDay          26304 non-null int64
DayOfYear        26304 non-null int64
Season           26304 non-null int64
Holiday          26304 non-null int64
Weather          26304 non-null float64
TempF            26304 non-null float64
TempC            26304 non-null float64
Humidity         26304 non-null float64
WindSpeed        26304 non-null float64
WindDirection    26304 non-null float64
Pressure         26304 non-null float64
Precip           26304 non-null float64
dtypes: float64(9), int64(12)
me

In [463]:
# # Filling in NaNs with mean of column
# # [Note: This is probably not a good idea]
# data_df.Precip = data_df.Precip.fillna(data_df.Precip.mean())
# data_df.TempC = data_df.TempC.fillna(data_df.TempC.mean())
# data_df.TempF = data_df.TempF.fillna(data_df.TempF.mean())
# data_df.Humidity = data_df.Humidity.fillna(data_df.Humidity.mean())
# data_df.WindSpeed = data_df.WindSpeed.fillna(data_df.WindSpeed.mean())
# data_df.WindDirection = data_df.WindDirection.fillna(data_df.WindDirection.mean())
# data_df.Pressure = data_df.Pressure.fillna(data_df.Pressure.mean())

In [464]:
# data_df.head()

In [465]:
# data_df.info()

In [466]:
with open('hourly_data.pkl', 'wb') as f:
    pickle.dump(data_df, f)

### SARIMAX with StatsModels

In [467]:
endog = data_df.Count
exog = data_df[['TempC','WindSpeed','Precip']]
# exog = sm.tools.add_constant(exog)

In [468]:
exog.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26304 entries, 2015-01-01 00:00:00 to 2017-12-31 23:00:00
Freq: H
Data columns (total 3 columns):
TempC        26304 non-null float64
WindSpeed    26304 non-null float64
Precip       26304 non-null float64
dtypes: float64(3)
memory usage: 2.1 MB


In [469]:
# # Day
# sarima_mod = sm.tsa.statespace.SARIMAX(endog, exog=exog, order=(0,1,4), seasonal_order=(0,1,4,7), trend='c').fit()
# Week
sarima_mod = sm.tsa.statespace.SARIMAX(endog, exog=exog, order=(1,1,1), seasonal_order=(1,1,0,52), trend='c').fit()
# # Month
# sarima_mod = sm.tsa.statespace.SARIMAX(endog, exog=exog, order=(0,1,1), seasonal_order=(0,1,0,12), trend='c').fit()

print(sarima_mod.summary())

plt.plot(sarima_mod.resid, "bo")
# # Day
# print(plot_acf(sarima_mod.resid, lags=100))
# print(plot_pacf(sarima_mod.resid, lags=100))
# Week
print(plot_acf(sarima_mod.resid, lags=104))
print(plot_pacf(sarima_mod.resid, lags=104))
# # Month
# print(plot_acf(sarima_mod.resid, lags=24))
# print(plot_pacf(sarima_mod.resid, lags=24))

MemoryError: 

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
sar.plot_diagnostics(fig=fig);

In [None]:
#turn weekly series into dataframe and add prediction column
df_day = endog.to_frame()
df_day.rename(columns={"BikeID" : "Count"}, inplace=True)

In [None]:
def bound_prediction(predict):
    predict[predict < 0] = 0
    return predict

In [None]:
#plot weekly actual and forecast
df_day['Forecast'] = bound_prediction(sarima_mod.predict())
df_day
df_day.plot(figsize=(14, 6));

In [None]:
#add daily dates to df_day
start = dt.strptime("2018-01-01", "%Y-%m-%d")
day_list = [start + relativedelta(days=x) for x in range(0,365)]
future_day = pd.DataFrame(index=day_list, columns= df_day.columns)
df_day = pd.concat([df_day, future_day])

In [None]:
df_day.info()

In [None]:
start_idx = 1096
future_days = 30
end_idx = start_idx+future_days
df_day['Forecast'] = bound_prediction(sarima_mod.predict(start=start_idx, end=end_idx, dynamic=True))
df_day
df_day.plot(figsize=(14, 6));

### With Prophet

In [470]:
# Python
import pandas as pd
import numpy as np
from fbprophet import Prophet

In [471]:
df1 = data_df.Count.resample('W').sum()
df = df1.copy()
df = data_df.Count

In [472]:
df = df.reset_index()
df = df.rename(columns={'Time': 'ds', 'Count': 'y'})
df.head()

Unnamed: 0,ds,y
0,2015-01-01 00:00:00,4
1,2015-01-01 01:00:00,3
2,2015-01-01 02:00:00,1
3,2015-01-01 03:00:00,0
4,2015-01-01 04:00:00,1


In [473]:
# df.loc[df.TempC.isna(), 'TempC'] = df.TempC.mean()

In [474]:
m = Prophet(interval_width=0.95)
# m.add_regressor('TempC')
m.fit(df);

  elif np.issubdtype(np.asarray(v).dtype, float):


In [475]:
# future = m.make_future_dataframe(periods=52*3, freq='w')
future = m.make_future_dataframe(periods=52*3, freq='w')
# future_temp = np.random.uniform(df.TempC.min(), df.TempC.max(), size=future.shape)
# future['TempC'] = future_temp
future.tail()

Unnamed: 0,ds
26455,2020-11-29 23:00:00
26456,2020-12-06 23:00:00
26457,2020-12-13 23:00:00
26458,2020-12-20 23:00:00
26459,2020-12-27 23:00:00


In [None]:
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
m.plot(forecast);

In [None]:
fig = m.plot_components(forecast);

In [None]:
forecast_data = forecast.set_index('ds')
forecast_data = forecast_data[forecast_data.index.date > (dt(2018, 1, 1).date())]

In [None]:
forecast_data

In [None]:
fig,ax = plt.subplots(figsize=(8,6))
forecast_data[['yhat_lower', 'yhat', 'yhat_upper']].plot(ax=ax)

In [None]:
fig,ax = plt.subplots(figsize=(8,6))
forecast_data[['yhat_lower', 'yhat', 'yhat_upper']].plot(ax=ax)