In [100]:
import pandas as pd
import pickle
import joblib
from datetime import timedelta

def show_next_forecasts(start_date_str, end_date_str):
    # Load the data
    bike_data = pd.read_csv("/Users/lucazosso/Desktop/IE_Course/Term_2/Python II/Group_Assignement/python_grp3/data/bike_data_cleaned_features.csv", index_col='date', parse_dates=['date'])
    X_prep = pickle.load(open("/Users/lucazosso/Desktop/IE_Course/Term_2/Python II/Group_Assignement/python_grp3/streamlite_app/X_prep.pkl", 'rb'))
    model = joblib.load("/Users/lucazosso/Desktop/IE_Course/Term_2/Python II/Group_Assignement/python_grp3/streamlite_app/xgb_best_model.pkl")

    # Ensure the index is recognized as datetime
    bike_data.index = pd.to_datetime(bike_data.index)
    X_prep_df = pd.DataFrame(X_prep, index=bike_data.index)

    # Convert string inputs to datetime
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)
    
    # Retrieve the total_bike_ct for the input date range
    y_time_range = bike_data.loc[start_date:end_date, 'total_bike_ct']
    
    # Calculate the date range in hours
    date_range_hours = (end_date - start_date).total_seconds() / 3600
    
    # Adjust dates for the next period forecast
    next_start_date = end_date + timedelta(days=1)
    next_end_date = next_start_date + (end_date - start_date)
    
    # Filter X_prep for the forecast period using correct datetime indexing
    X_test = X_prep_df.loc[next_start_date:next_end_date]

    # Predict using the model
    preds = model.predict(X_test)
    
    # Compute confidence interval for each prediction
    preds_mean = preds.mean()
    preds_std = preds.std()
    preds_lower = preds_mean - 1.96 * preds_std
    preds_upper = preds_mean + 1.96 * preds_std

    return y_time_range, preds, date_range_hours, preds_lower, preds_upper



In [101]:
# Example usage:
start = '2011-01-01'
end = '2011-01-02'
yt, yt_1, hours, preds_lower, preds_upper = show_next_forecasts(start, end)
print(f"Date range in hours: {hours}")
print(f"yt: {yt}")
print(f"Predictions: {yt_1}")

Date range in hours: 24.0
yt: date
2011-01-01 00:00:00     16
2011-01-01 01:00:00     40
2011-01-01 02:00:00     32
2011-01-01 03:00:00     13
2011-01-01 04:00:00      1
2011-01-01 05:00:00      1
2011-01-01 06:00:00      2
2011-01-01 07:00:00      3
2011-01-01 08:00:00      8
2011-01-01 09:00:00     14
2011-01-01 10:00:00     36
2011-01-01 11:00:00     56
2011-01-01 12:00:00     84
2011-01-01 13:00:00     94
2011-01-01 14:00:00    106
2011-01-01 15:00:00    110
2011-01-01 16:00:00     93
2011-01-01 17:00:00     67
2011-01-01 18:00:00     35
2011-01-01 19:00:00     37
2011-01-01 20:00:00     36
2011-01-01 21:00:00     34
2011-01-01 22:00:00     28
2011-01-01 23:00:00     39
2011-01-02 00:00:00     17
Name: total_bike_ct, dtype: int64
Predictions: [  3.9226925   2.1304994   1.2024482   3.1502924  26.45015    64.49219
 154.74      100.64156    45.833595   53.285183   69.65953    62.363365
  68.70695    72.99269    94.88722   159.2961    145.61067   107.861855
  53.88501    49.187122   26

In [90]:
# Get the start date from yt
start_date = yt.index[0]

# Get the end date from yt and calculate the new end date
end_date = yt.index[-1]
new_end_date = end_date + (end_date - start_date)

# Create the new date range
new_index = pd.date_range(start=start_date, end=new_end_date, freq='H')

print(new_index)

DatetimeIndex(['2011-01-01 00:00:00', '2011-01-01 01:00:00',
               '2011-01-01 02:00:00', '2011-01-01 03:00:00',
               '2011-01-01 04:00:00', '2011-01-01 05:00:00',
               '2011-01-01 06:00:00', '2011-01-01 07:00:00',
               '2011-01-01 08:00:00', '2011-01-01 09:00:00',
               '2011-01-01 10:00:00', '2011-01-01 11:00:00',
               '2011-01-01 12:00:00', '2011-01-01 13:00:00',
               '2011-01-01 14:00:00', '2011-01-01 15:00:00',
               '2011-01-01 16:00:00', '2011-01-01 17:00:00',
               '2011-01-01 18:00:00', '2011-01-01 19:00:00',
               '2011-01-01 20:00:00', '2011-01-01 21:00:00',
               '2011-01-01 22:00:00', '2011-01-01 23:00:00',
               '2011-01-02 00:00:00', '2011-01-02 01:00:00',
               '2011-01-02 02:00:00', '2011-01-02 03:00:00',
               '2011-01-02 04:00:00', '2011-01-02 05:00:00',
               '2011-01-02 06:00:00', '2011-01-02 07:00:00',
               '2011-01-

In [91]:
# Combine yt and yt_1 into a single dataframe, note that yt_1 is a numpy array and should have for index the last part from the new index
yt_1_df = pd.DataFrame(yt_1, index=new_index[-yt_1.shape[0]:], columns=['yt_1'])
# Concatenate yt and yt_1_df along the column axis
yt_combined = pd.concat([yt, yt_1_df], axis=1)

# Print yt_combined
print(yt_combined)


                     total_bike_ct        yt_1
2011-01-01 00:00:00           16.0         NaN
2011-01-01 01:00:00           40.0         NaN
2011-01-01 02:00:00           32.0         NaN
2011-01-01 03:00:00           13.0         NaN
2011-01-01 04:00:00            1.0         NaN
2011-01-01 05:00:00            1.0         NaN
2011-01-01 06:00:00            2.0         NaN
2011-01-01 07:00:00            3.0         NaN
2011-01-01 08:00:00            8.0         NaN
2011-01-01 09:00:00           14.0         NaN
2011-01-01 10:00:00           36.0         NaN
2011-01-01 11:00:00           56.0         NaN
2011-01-01 12:00:00           84.0         NaN
2011-01-01 13:00:00           94.0         NaN
2011-01-01 14:00:00          106.0         NaN
2011-01-01 15:00:00          110.0         NaN
2011-01-01 16:00:00           93.0         NaN
2011-01-01 17:00:00           67.0         NaN
2011-01-01 18:00:00           35.0         NaN
2011-01-01 19:00:00           37.0         NaN
2011-01-01 20

In [92]:
# Using plotly plot total_bike_ct and the forecasted values(yt_1) in the same plot, yt_1 should be in a different color and dotted
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=yt_combined.index, y=yt_combined['total_bike_ct'], mode='lines', name='total_bike_ct'))
fig.add_trace(go.Scatter(x=yt_combined.index, y=yt_combined['yt_1'], mode='lines', name='yt_1', line=dict(dash='dot')))
fig.show()

In [96]:
# using yt_combined, sum all the hours for each day for both columns
yt_combined_day = yt_combined.resample('D').sum()
print(yt_combined_day)

            total_bike_ct         yt_1
2011-01-01          985.0     0.000000
2011-01-02           17.0  1384.423706
2011-01-03            0.0     5.848783


In [94]:
def resample_df_agg(freq, aggtype='sum'):
    if isinstance(yt_combined, pd.DataFrame) and isinstance(yt_combined.index, pd.DatetimeIndex):
        resampled_data = yt_combined.resample(freq).agg(aggtype)
        return resampled_data
    else:
        print("yt_combined should be a DataFrame with a DateTimeIndex.")
        return None



In [95]:
# Example usage:
resampled_data = resample_df_agg('D', 'sum')
print(resampled_data)

            total_bike_ct         yt_1
2011-01-01          985.0     0.000000
2011-01-02           17.0  1384.423706
2011-01-03            0.0     5.848783


In [102]:
data = pd.read_csv("/Users/lucazosso/Desktop/IE_Course/Term_2/Python II/Group_Assignement/python_grp3/data/bike_data_cleaned_features.csv", index_col='date', parse_dates=['date'])

data.head()

Unnamed: 0_level_0,season,year,month,hour,is_holiday,weekday,is_workingday,weather_condition,temp_celsius,atem_celsius,...,Rolling_Avg_Rent_24_h,Rolling_Avg_Rent_168_h,hourly_rental_deviation,hourly_rental_deviation_label,Lagged_Rent_1h,count_prev_week_same_hour,temp_type_diff_ratio,hourly_trend,daily_trend,monthly_trend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-01 00:00:00,1,0,1,0,0,6,0,1,0.24,0.2879,...,16.0,16.0,-20.222222,below_average,0.0,0.0,-0.959583,53.898072,190.209793,94.424773
2011-01-01 01:00:00,1,0,1,1,0,6,0,1,0.22,0.2727,...,28.0,28.0,6.555556,above_average,16.0,0.0,-1.019545,33.375691,190.209793,94.424773
2011-01-01 02:00:00,1,0,1,2,0,6,0,1,0.22,0.2727,...,29.333333,29.333333,6.555556,above_average,40.0,0.0,-1.019545,22.86993,190.209793,94.424773
2011-01-01 03:00:00,1,0,1,3,0,6,0,1,0.24,0.2879,...,25.25,25.25,0.444444,above_average,32.0,0.0,-0.959583,11.727403,190.209793,94.424773
2011-01-01 04:00:00,1,0,1,4,0,6,0,1,0.24,0.2879,...,20.4,20.4,-2.888889,below_average,13.0,0.0,-0.959583,6.352941,190.209793,94.424773


In [115]:
data.columns

Index(['season', 'year', 'month', 'hour', 'is_holiday', 'weekday',
       'is_workingday', 'weather_condition', 'temp_celsius', 'atem_celsius',
       'humidity', 'windspeed', 'casual_user_ct', 'registered_user_ct',
       'total_bike_ct', 'day_part', 'rush_hour', 'casual_to_registered_ratio',
       'Rolling_Avg_Rent_2_h', 'Rolling_Avg_Rent_3_h', 'Rolling_Avg_Rent_4_h',
       'Rolling_Avg_Rent_5_h', 'Rolling_Avg_Rent_6_h', 'Rolling_Avg_Rent_12_h',
       'Rolling_Avg_Rent_24_h', 'Rolling_Avg_Rent_168_h',
       'hourly_rental_deviation', 'hourly_rental_deviation_label',
       'Lagged_Rent_1h', 'count_prev_week_same_hour', 'temp_type_diff_ratio',
       'hourly_trend', 'daily_trend', 'monthly_trend', 'temp_celsius_real',
       'atem_celsius_real', 'humidity_real', 'windspeed_real'],
      dtype='object')

In [105]:
# Define the mapping dictionary
weather_condition_mapping = {
    1: 'Clear and Variably Cloudy Conditions, No Precipitation',
    2: 'Misty Conditions with Varied Cloud Cover',
    3: 'Light Precipitation and Thunderstorms',
    4: 'Severe Weather Conditions'
}

# Apply the mapping to the 'weather_condition' column
data['weather_condition'] = data['weather_condition'].map(weather_condition_mapping)



In [107]:
sorted_unique_temp_celsius = sorted(data['temp_celsius'].unique().tolist())
sorted_unique_temp_celsius


[0.02,
 0.04,
 0.06,
 0.08,
 0.1,
 0.12,
 0.14,
 0.16,
 0.18,
 0.2,
 0.22,
 0.24,
 0.26,
 0.28,
 0.3,
 0.32,
 0.34,
 0.36,
 0.38,
 0.4,
 0.42,
 0.44,
 0.46,
 0.48,
 0.5,
 0.52,
 0.54,
 0.56,
 0.58,
 0.6,
 0.62,
 0.64,
 0.66,
 0.68,
 0.7,
 0.72,
 0.74,
 0.76,
 0.78,
 0.8,
 0.82,
 0.84,
 0.86,
 0.88,
 0.9,
 0.92,
 0.94,
 0.96,
 0.98,
 1.0]

In [108]:
import numpy as np

# According to the UCI Machine Learning Repository,
# the temperatures in the bike sharing dataset are 
# normalized based on a -8 to +39 degrees Celsius scale.

# Your array of normalized temperatures
temps_normalized = np.array([0.24, 0.22, 0.2 , 0.32, 0.38, 0.36, 0.42, 0.46, 0.44, 0.4 , 0.34,
       0.3 , 0.26, 0.16, 0.14, 0.18, 0.12, 0.28, 0.1 , 0.08, 0.06, 0.04,
       0.02, 0.52, 0.56, 0.58, 0.6 , 0.48, 0.54, 0.5 , 0.66, 0.64, 0.62,
       0.68, 0.7 , 0.74, 0.76, 0.72, 0.78, 0.82, 0.8 , 0.86, 0.88, 0.9 ,
       0.84, 0.92, 0.94, 0.96, 0.98, 1.  ])

# Convert back to Celsius
temps_celsius = temps_normalized * (39 - (-8)) + (-8)

print(temps_celsius)


[ 3.28  2.34  1.4   7.04  9.86  8.92 11.74 13.62 12.68 10.8   7.98  6.1
  4.22 -0.48 -1.42  0.46 -2.36  5.16 -3.3  -4.24 -5.18 -6.12 -7.06 16.44
 18.32 19.26 20.2  14.56 17.38 15.5  23.02 22.08 21.14 23.96 24.9  26.78
 27.72 25.84 28.66 30.54 29.6  32.42 33.36 34.3  31.48 35.24 36.18 37.12
 38.06 39.  ]


In [113]:
data['temp_celsius_real'] = data['temp_celsius'] * (39 - (-8)) + (-8)
data['atem_celsius_real'] = data['atem_celsius'] * (39 - (-8)) + (-8)
data['humidity_real'] = data['humidity'] * 100

#km/h, you can multiply by the maximum speed (67 km/h):

data['windspeed_real'] = data['windspeed'] * 67

In [114]:
data[['temp_celsius_real', 'atem_celsius_real', 'humidity_real', 'windspeed_real']].head()

Unnamed: 0_level_0,temp_celsius_real,atem_celsius_real,humidity_real,windspeed_real
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-01-01 00:00:00,3.28,5.5313,81.0,0.0
2011-01-01 01:00:00,2.34,4.8169,80.0,0.0
2011-01-01 02:00:00,2.34,4.8169,80.0,0.0
2011-01-01 03:00:00,3.28,5.5313,75.0,0.0
2011-01-01 04:00:00,3.28,5.5313,75.0,0.0


In [117]:
data.columns

Index(['season', 'year', 'month', 'hour', 'is_holiday', 'weekday',
       'is_workingday', 'weather_condition', 'temp_celsius', 'atem_celsius',
       'humidity', 'windspeed', 'casual_user_ct', 'registered_user_ct',
       'total_bike_ct', 'day_part', 'rush_hour', 'casual_to_registered_ratio',
       'Rolling_Avg_Rent_2_h', 'Rolling_Avg_Rent_3_h', 'Rolling_Avg_Rent_4_h',
       'Rolling_Avg_Rent_5_h', 'Rolling_Avg_Rent_6_h', 'Rolling_Avg_Rent_12_h',
       'Rolling_Avg_Rent_24_h', 'Rolling_Avg_Rent_168_h',
       'hourly_rental_deviation', 'hourly_rental_deviation_label',
       'Lagged_Rent_1h', 'count_prev_week_same_hour', 'temp_type_diff_ratio',
       'hourly_trend', 'daily_trend', 'monthly_trend', 'temp_celsius_real',
       'atem_celsius_real', 'humidity_real', 'windspeed_real', 'day'],
      dtype='object')

In [124]:
data = pd.read_csv("/Users/lucazosso/Desktop/IE_Course/Term_2/Python II/Group_Assignement/python_grp3/data/bike_data_cleaned_features.csv", index_col='date', parse_dates=['date'])

data['weather_condition']

date
2011-01-01 00:00:00    1
2011-01-01 01:00:00    1
2011-01-01 02:00:00    1
2011-01-01 03:00:00    1
2011-01-01 04:00:00    1
                      ..
2012-12-31 19:00:00    2
2012-12-31 20:00:00    2
2012-12-31 21:00:00    1
2012-12-31 22:00:00    1
2012-12-31 23:00:00    1
Name: weather_condition, Length: 17379, dtype: int64

In [147]:
!pip install pydeck



In [148]:
# Use Folium to display the map od Washington DC, and then display a circle of radius of the size of the bike_count
import streamlit as st
import pydeck as pdk

# Example coordinates for Washington DC
latitude, longitude = 38.8951, -77.0364
# Example data - replace this with your dataset's bike usage
bike_usage = 1000  # This value could be dynamically set based on your dataset

# Define the map layer
map_layer = pdk.Layer(
    "ScatterplotLayer",
    data=[{
        "position": [longitude, latitude],
        "radius": bike_usage,  # Radius based on bike usage
        "color": [255, 0, 0],  # Example color (red)
    }],
    get_position="position",
    get_radius="radius",
    get_color="color",
    pickable=True,
    opacity=0.8,
)

# Set the view for the map
view_state = pdk.ViewState(latitude=latitude, longitude=longitude, zoom=11)

# Render the map with the circle overlay in Streamlit
st.pydeck_chart(pdk.Deck(layers=[map_layer], initial_view_state=view_state))


2024-03-20 17:34:47.573 
  command:

    streamlit run /Users/lucazosso/opt/anaconda3/lib/python3.9/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [149]:
%streamlit run /Users/lucazosso/opt/anaconda3/lib/python3.9/site-packages/ipykernel_launcher.py

UsageError: Line magic function `%streamlit` not found.
