In [1]:
# %% [markdown]
# # Citywide Mobility Survey Analysis with Modeling
#
# This notebook explores the 2022 citywide mobility survey data and goes a step further by building a predictive model. We will:
# 
# 1. Perform exploratory data analysis.
# 2. Build a linear regression model to predict trip duration from trip distance.
# 3. Visualize the regression results with an interactive Plotly chart.
# 4. Add an interactive widget to allow users to input a trip distance and see the predicted trip duration.
#
# Replace or extend this analysis with additional features as needed.

# %%
# Import necessary libraries
import os

import ipywidgets as widgets
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
from sklearn.linear_model import LinearRegression

# %%
# Load the dataset.
# The CSV location can be overridden without editing the notebook by setting
# the MOBILITY_SURVEY_CSV environment variable. When the variable is unset,
# the author's original machine-specific path is used as the default, so
# existing runs behave exactly as before.
data_file = os.environ.get(
    "MOBILITY_SURVEY_CSV",
    "C:/Users/dycha/OneDrive - Morgan State University/My_self/Data_self/Citywide_Mobility_Survey_Trip_2022.csv",
)
df = pd.read_csv(data_file)

# Display the first few rows to understand the structure
df.head()

# %%
# List every column name so the analysis variables can be picked from it
column_names = df.columns.tolist()
print("Columns in dataset:", column_names)

# %% [markdown]
# ## Exploratory Data Analysis
# 
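# We will visualize key metrics such as trip durations, trip distances, and mode of transportation. Each cell first checks that its column exists and prints a message instead of plotting when it does not. The column names used below (`TripDuration`, `TripDistance`, `Mode`) are placeholders: compare them against the column list printed above and adjust them to match your dataset.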

# %%
# Histogram of trip durations; a notice is printed when the column is absent
if 'TripDuration' not in df.columns:
    print("Column 'TripDuration' not found in dataset.")
else:
    duration_hist_options = {
        'x': 'TripDuration',
        'nbins': 30,
        'title': "Distribution of Trip Durations",
        'labels': {'TripDuration': 'Trip Duration'},
    }
    fig1 = px.histogram(df, **duration_hist_options)
    fig1.show()

# %%
# Histogram of trip distances; a notice is printed when the column is absent
if 'TripDistance' not in df.columns:
    print("Column 'TripDistance' not found in dataset.")
else:
    distance_hist_options = {
        'x': 'TripDistance',
        'nbins': 30,
        'title': "Distribution of Trip Distances",
        'labels': {'TripDistance': 'Trip Distance'},
    }
    fig2 = px.histogram(df, **distance_hist_options)
    fig2.show()

# %%
# Bar chart of the number of trips per transportation mode (needs a 'Mode' column)
if 'Mode' not in df.columns:
    print("Column 'Mode' not found in dataset.")
else:
    # Tally trips per mode into a two-column frame: 'Mode' and 'Count'
    mode_counts = (
        df['Mode']
        .value_counts()
        .rename_axis('Mode')
        .reset_index(name='Count')
    )
    fig3 = px.bar(
        mode_counts,
        x='Mode',
        y='Count',
        title="Trip Counts by Mode",
        labels={'Count': 'Number of Trips', 'Mode': 'Transportation Mode'},
    )
    fig3.show()

# %%
# Scatter of duration against distance. Points are coloured by 'Mode' when that
# column exists; without both numeric columns nothing is plotted.
if {'TripDistance', 'TripDuration'}.issubset(df.columns):
    scatter_options = dict(
        x='TripDistance',
        y='TripDuration',
        title="Trip Duration vs Trip Distance",
        labels={'TripDistance': 'Trip Distance', 'TripDuration': 'Trip Duration'},
    )
    # Colouring is optional: only request it when the column is available
    if 'Mode' in df.columns:
        scatter_options['color'] = 'Mode'
    fig4 = px.scatter(df, **scatter_options)
    fig4.show()
else:
    print("Required columns for scatter plot not found in dataset.")

# %% [markdown]
# ## Modeling: Predicting Trip Duration from Trip Distance
#
# We'll build a simple linear regression model that predicts the trip duration based on the trip distance.
# 
# **Note:** Rows with a missing value in 'TripDistance' or 'TripDuration' are dropped before fitting. Both columns must already be numeric.

# %%
# Fit and visualise a one-feature linear regression (TripDuration ~ TripDistance).
# Runs only when both columns exist; otherwise a notice is printed.
if all(col in df.columns for col in ['TripDistance', 'TripDuration']):
    # Keep only rows where both the predictor and the target are present
    df_model = df.dropna(subset=['TripDistance', 'TripDuration'])
    X = df_model[['TripDistance']]  # double brackets keep X two-dimensional
    y = df_model['TripDuration']

    # Fit a linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # 100 evenly spaced distances spanning the observed range, shaped as a
    # single column, used to draw the fitted line.
    distance_min = df_model['TripDistance'].min()
    distance_max = df_model['TripDistance'].max()
    X_range = np.linspace(distance_min, distance_max, 100).reshape(-1, 1)

    # The model was fitted on a DataFrame, so predict on a DataFrame carrying
    # the same column name. Passing the bare array makes scikit-learn warn
    # that "X does not have valid feature names".
    y_pred = model.predict(pd.DataFrame(X_range, columns=X.columns))

    # Plot the scatter of data points with the regression line
    scatter = go.Scatter(x=df_model['TripDistance'], y=df_model['TripDuration'],
                         mode='markers', name='Data Points')
    line = go.Scatter(x=X_range.flatten(), y=y_pred, mode='lines', name='Regression Line')

    fig_model = go.Figure(data=[scatter, line])
    fig_model.update_layout(title='Linear Regression: Trip Distance vs. Trip Duration',
                            xaxis_title='Trip Distance',
                            yaxis_title='Trip Duration')
    fig_model.show()

    # Output the model coefficients
    print("Model Coefficient (slope):", model.coef_[0])
    print("Model Intercept:", model.intercept_)
else:
    print("Columns 'TripDistance' and/or 'TripDuration' are missing for modeling.")

# %% [markdown]
# ## Interactive Prediction
#
# The following section uses an interactive slider (via ipywidgets) that lets you select a trip distance value. The notebook will display the predicted trip duration based on our regression model.

# %%
if all(col in df.columns for col in ['TripDistance', 'TripDuration']):
    # Create a slider for Trip Distance based on the data range.
    # NOTE: this cell reads `df_model` and `model`, which are created by the
    # modeling cell above, so that cell must be run first.
    distance_slider = widgets.FloatSlider(
        value=float(df_model['TripDistance'].mean()),
        min=float(df_model['TripDistance'].min()),
        max=float(df_model['TripDistance'].max()),
        step=0.1,
        description='Trip Distance:',
        continuous_update=False
    )

    prediction_output = widgets.Output()

    def predict_trip_duration(change=None):
        """Print the predicted trip duration for the slider's current distance.

        Parameters
        ----------
        change : dict or None, optional
            Change notification supplied by ``observe``. It is not inspected,
            because the current value is read from the slider directly, so the
            function can also be called with no argument.
        """
        with prediction_output:
            prediction_output.clear_output()
            # Wrap the value in a DataFrame with the training column name so
            # scikit-learn does not warn about missing feature names.
            input_distance = pd.DataFrame({'TripDistance': [distance_slider.value]})
            predicted_duration = model.predict(input_distance)[0]
            print(f'Predicted Trip Duration for {distance_slider.value:.2f} units of distance: {predicted_duration:.2f}')

    distance_slider.observe(predict_trip_duration, names='value')
    display(distance_slider, prediction_output)

    # observe() only fires when the value changes, so render the prediction
    # for the initial slider position explicitly.
    predict_trip_duration()
else:
    print("Cannot create interactive prediction widget without required columns.")

# %% [markdown]
# ## Next Steps and Customizations
#
# - **Advanced Modeling:** Extend the model by incorporating additional variables (e.g., mode, time of day) or try non-linear models (e.g., Random Forest, Gradient Boosting).
# - **Interactive Visualizations:** Enhance visualizations using Plotly's interactive features (filters, hover data, etc.).
# - **Embedding:** Once the notebook is polished, upload it to GitHub and integrate with [Binder](https://mybinder.org/) so visitors can run the notebook live.
#
# ### Binder Badge Example:
#
# Add the following Markdown badge to your website or GitHub README to enable interactive sessions:
#
# ```markdown
# [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/your-username/your-repository/main?filepath=Citywide_Mobility_Analysis.ipynb)
# ```
#
# Replace `your-username`, `your-repository`, and adjust the filepath as necessary.


Columns in dataset: ['hh_id', 'person_id', 'person_num', 'day_id', 'travel_dow', 'day_num', 'is_complete', 'travel_date', 'linked_trip_id', 'linked_leg_num', 'trip_id', 'trip_num', 'day_is_complete', 'arrive_dow', 'depart_dow', 'o_in_region', 'o_county', 'd_in_region', 'd_county', 'mode_type_nyc', 'r_mode_type_nyc', 'mode_1', 'mode_2', 'mode_3', 'mode_priority_nyc', 'transit_access', 'r_transit_access', 'transit_egress', 'r_transit_egress', 'park_location', 'park_type', 'park_pay', 'park_cost', 'ev_charge_station', 'ev_charge_station_level_1', 'ev_charge_station_level_2', 'ev_charge_station_level_3', 'ev_charge_station_level_4', 'ev_charge_station_level_5', 'ev_charge_station_level_997', 'ev_charge_station_level_998', 'tnc_type', 'taxi_type', 'taxi_pay', 'taxi_cost', 'transit_type', 'bike_park_loc', 'scooter_park_location', 'num_travelers', 'num_hh_travelers', 'num_non_hh_travelers', 'hh_member_1', 'hh_member_2', 'hh_member_3', 'hh_member_4', 'hh_member_5', 'hh_member_6', 'hh_member_7'