In [1]:
# Cell 1: Import and load data
import importlib
import data_loader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from IPython.display import display

importlib.reload(data_loader)
from data_loader import load_sample_data, preprocess_data

# Load sample data
stations_info, status_data = load_sample_data()
merged_data = preprocess_data(stations_info, status_data)

# Display basic statistics
print(merged_data.describe())

# Add a dropdown to select specific columns for detailed statistics
def show_column_stats(column):
    print(merged_data[column].describe())
    
    if merged_data[column].dtype in ['int64', 'float64']:
        plt.figure(figsize=(10, 6))
        merged_data[column].hist()
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.show()

column_dropdown = widgets.Dropdown(
    options=merged_data.columns,
    description='Select column:',
    disabled=False,
)

output = widgets.Output()

def on_change(change):
    with output:
        output.clear_output()
        show_column_stats(change.new)

column_dropdown.observe(on_change, names='value')

display(column_dropdown, output)

widgets.interactive(show_column_stats, column=column_dropdown)

INFO:data_loader:Successfully connected to MongoDB
INFO:data_loader:Loaded 516 stations
INFO:data_loader:Attempting to load status data for September 2024
INFO:data_loader:Loaded 4424667 status records
INFO:data_loader:Successfully loaded data for September 2024
INFO:data_loader:Starting data preprocessing
INFO:data_loader:Merged data shape: (4424667, 12)
INFO:data_loader:Data shape after filtering non-existent stations: (4424667, 12)
INFO:data_loader:Data preprocessing completed


         station_id  num_bikes_available  num_docks_available  \
count  4.424667e+06         4.424667e+06         4.424667e+06   
mean   2.655210e+02         1.041275e+01         1.525376e+01   
min    1.000000e+00         0.000000e+00         0.000000e+00   
25%    1.350000e+02         3.000000e+00         8.000000e+00   
50%    2.660000e+02         8.000000e+00         1.600000e+01   
75%    3.950000e+02         1.600000e+01         2.200000e+01   
max    5.430000e+02         1.980000e+02         9.900000e+01   
std    1.512070e+02         1.012694e+01         9.115089e+00   

                       last_reported      altitude           lat  \
count                        4424667  4.404476e+06  4.424667e+06   
mean   2024-09-16 00:03:25.161469696  3.576893e+01  4.139935e+01   
min              2024-09-01 00:00:34  2.000000e+00  4.134677e+01   
25%              2024-09-08 12:05:51  9.000000e+00  4.138337e+01   
50%              2024-09-15 23:58:28  2.500000e+01  4.139578e+01   
75%   

Dropdown(description='Select column:', options=('_id_x', 'station_id', 'num_bikes_available', 'num_docks_avail…

Output()

interactive(children=(Dropdown(description='Select column:', options=('_id_x', 'station_id', 'num_bikes_availa…

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
from bokeh.models import DateRangeSlider


def plot_system_usage(start_date, end_date):
    filtered_data = merged_data[(merged_data['last_reported'] >= start_date) & (merged_data['last_reported'] <= end_date)]
    daily_usage = filtered_data.groupby('last_reported').agg({
        'num_bikes_available': 'mean',
        'num_docks_available': 'mean'
    }).reset_index()

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(x=daily_usage['last_reported'], y=daily_usage['num_bikes_available'], name="Available Bikes"),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=daily_usage['last_reported'], y=daily_usage['num_docks_available'], name="Available Docks"),
        secondary_y=True,
    )

    fig.update_layout(
        title_text="Overall System Usage",
        xaxis_title="Date",
    )

    fig.update_yaxes(title_text="Average Number of Bikes", secondary_y=False)
    fig.update_yaxes(title_text="Average Number of Docks", secondary_y=True)

    return fig

min_date = merged_data['last_reported'].min().date()
max_date = merged_data['last_reported'].max().date()

date_range = widgets.DateRangeSlider(
    value=[min_date, max_date],
    min=min_date,
    max=max_date,
    step=timedelta(days=1),
    description='Date Range'
)

plot_output = widgets.Output()

@plot_output.capture(clear_output=True)
def update_plot(change):
    start_date, end_date = change['new']
    fig = plot_system_usage(start_date, end_date)
    fig.show()

date_range.observe(update_plot, names='value')

display(date_range, plot_output)

ImportError: cannot import name 'DateRangeSlider' from 'ipywidgets' (c:\Users\danys\tfm\env\Lib\site-packages\ipywidgets\__init__.py)

In [None]:
# Cell 3: Interactive Station Usage by Time of Day
def plot_station_usage_by_time(station_id):
    station_data = merged_data[merged_data['station_id'] == station_id]
    pivot_data = station_data.pivot_table(values='num_bikes_available', index='hour', aggfunc='mean').reset_index()
    
    fig = px.line(pivot_data, x='hour', y='num_bikes_available', markers=True)
    fig.update_layout(
        title=f"Average Bike Availability for Station {station_id} by Time of Day",
        xaxis_title="Hour of Day",
        yaxis_title="Average Number of Available Bikes"
    )
    return fig

station_dropdown = widgets.Dropdown(
    options=sorted(merged_data['station_id'].unique()),
    description='Select station:',
    disabled=False,
)

def update_station_plot(station_id):
    fig = plot_station_usage_by_time(station_id)
    fig.show()

widgets.interactive(update_station_plot, station_id=station_dropdown)

In [None]:
# Cell 4: Interactive Scatter Plot
x_dropdown = widgets.Dropdown(options=merged_data.select_dtypes(include=[np.number]).columns, description='X-axis:')
y_dropdown = widgets.Dropdown(options=merged_data.select_dtypes(include=[np.number]).columns, description='Y-axis:')
color_dropdown = widgets.Dropdown(options=['None'] + list(merged_data.columns), description='Color by:')

def create_scatter_plot(x, y, color):
    if color == 'None':
        fig = px.scatter(merged_data, x=x, y=y)
    else:
        fig = px.scatter(merged_data, x=x, y=y, color=color)
    
    fig.update_layout(title=f'{y} vs {x}', xaxis_title=x, yaxis_title=y)
    return fig

def update_scatter_plot(x, y, color):
    fig = create_scatter_plot(x, y, color)
    fig.show()

widgets.interactive(update_scatter_plot, x=x_dropdown, y=y_dropdown, color=color_dropdown)