In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import xgboost as xgb
import joblib

In [None]:
# === 1. Load and preprocess data ===
df = pd.read_csv("yellow-tripdata-2025-01 (1).csv")  # Replace with actual path

# Inspect column names to identify the correct column names
print("Column names in the DataFrame:", df.columns)

# Rename columns if necessary to fix corrupted names
df.rename(columns={
    'pickup_dat"tpep_dropoff_datetime"': 'tpep_dropoff_datetime',
    'tpep_etime': 'tpep_pickup_datetime'
}, inplace=True)

# Convert to datetime
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_dayofweek'] = df['tpep_pickup_datetime'].dt.dayofweek

# Peak hour indicators
df['is_weekday'] = df['pickup_dayofweek'] < 5
df['is_morning_peak'] = df['is_weekday'] & df['pickup_hour'].between(7, 10)
df['is_evening_peak'] = df['is_weekday'] & df['pickup_hour'].between(16, 19)
df['is_morning_peak'] = df['is_morning_peak'].astype(int)
df['is_evening_peak'] = df['is_evening_peak'].astype(int)

# Categorical encoding with saved LabelEncoders
categorical_cols = ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID']
label_encoders = {}

for col in categorical_cols:
    df[col] = df[col].astype(str)
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  
    label_encoders[col] = le  # Save encoder for later
# === 2. Define features and target ===
# Calculate trip duration in hours
df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60 / 60
df['congestion_level'] = pd.qcut(df['trip_duration'], q=3, labels=['Low', 'Medium', 'High'])

# Encode target
target_encoder = LabelEncoder()
df['congestion_level'] = target_encoder.fit_transform(df['congestion_level'])

# Drop unused columns
df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'is_weekday', 'trip_duration', 'passenger_count', 'trip_distance', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee'], errors='ignore')
# Final feature set
features = ['pickup_hour', 'pickup_dayofweek', 'PULocationID', 'DOLocationID', 
            'VendorID', 'RatecodeID', 'store_and_fwd_flag', 
            'is_morning_peak', 'is_evening_peak']

X = df[features]
y = df['congestion_level']

# === 3. Split data ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  df = pd.read_csv("yellow-tripdata-2025-01 (1).csv")  # Replace with actual path


Column names in the DataFrame: Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee'],
      dtype='object')


In [14]:
# === 4. Grid Search ===
# Reduced parameter grid for faster execution
param_grid = {
    'n_estimators': [100],  # Reduced from [100, 200]
    'max_depth': [3, 5],
    'learning_rate': [0.1], # Reduced from [0.05, 0.1]
    'subsample': [0.7],     # Reduced from [0.7, 1.0]
    'colsample_bytree': [0.7] # Reduced from [0.7, 1.0]
}

xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,  # Reduced from 3 for faster execution
    verbose=1,
    n_jobs=-1
)

print("Starting Grid Search...") # Added print statement
grid_search.fit(X_train, y_train)
print("Grid Search Finished.") # Added print statement

# === 5. Evaluate ===
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Best Parameters:", grid_search.best_params_)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=target_encoder.classes_))

Starting Grid Search...
Fitting 2 folds for each of 2 candidates, totalling 4 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Grid Search Finished.
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}

Classification Report:
               precision    recall  f1-score   support

        High       0.71      0.52      0.60    231585
         Low       0.55      0.63      0.59    231627
      Medium       0.46      0.52      0.49    231834

    accuracy                           0.56    695046
   macro avg       0.57      0.56      0.56    695046
weighted avg       0.57      0.56      0.56    695046



In [6]:
best_model = joblib.load('best_xgb_model.pkl')
label_encoders = joblib.load('label_encoders.pkl')
target_encoder = joblib.load('target_encoder.pkl')

In [7]:
new_trip = pd.DataFrame([{
    'pickup_hour': 8,
    'pickup_dayofweek': 2,
    'PULocationID': '121',
    'DOLocationID': '105',
    'VendorID': '2',
    'RatecodeID': '1',
    'store_and_fwd_flag': 'N',
    'is_morning_peak': 1,
    'is_evening_peak': 0
}])


# Define categorical columns
categorical_cols = ['PULocationID', 'DOLocationID', 'VendorID', 'RatecodeID', 'store_and_fwd_flag']

# Encode categorical features using the saved encoders
for col in categorical_cols:
    new_trip[col] = new_trip[col].astype(str)
    new_trip[col] = label_encoders[col].transform(new_trip[col])

# Predict
new_pred = best_model.predict(new_trip)
predicted_label = target_encoder.inverse_transform(new_pred)
print("Predicted Congestion Level for New Trip:", predicted_label[0])




Predicted Congestion Level for New Trip: High


In [None]:
import pandas as pd
import requests
from geopy.distance import geodesic
import folium

# Load coordinates from CSV
coord_df = pd.read_csv('location_coordinates_10000.csv')

# Build the location_coords dictionary
location_coords = {
    int(row['LocationID']): (row['Latitude'], row['Longitude'])
    for _, row in coord_df.iterrows()
}

# Example new trip data (replace this with your actual data source)
# Simulating a single-row DataFrame with pickup/dropoff LocationIDs


# Extract pickup and dropoff LocationIDs
pickup_id = int(new_trip['PULocationID'].iloc[0])
dropoff_id = int(new_trip['DOLocationID'].iloc[0])

# Ensure the pickup_id and dropoff_id exist in location_coords
if pickup_id not in location_coords or dropoff_id not in location_coords:
    raise KeyError(f"Missing coordinates for LocationID(s): "
                   f"{pickup_id if pickup_id not in location_coords else ''} "
                   f"{dropoff_id if dropoff_id not in location_coords else ''}")

pickup_coords = location_coords[pickup_id]
dropoff_coords = location_coords[dropoff_id]

# Query OSRM for the actual road route
url = f"http://router.project-osrm.org/route/v1/driving/{pickup_coords[1]},{pickup_coords[0]};{dropoff_coords[1]},{dropoff_coords[0]}?overview=full&geometries=geojson"
response = requests.get(url)
data = response.json()

# Extract and reformat route coordinates
route = data['routes'][0]['geometry']['coordinates']
route = [(lat, lon) for lon, lat in route]  # Convert to (lat, lon)

# Estimate trip duration (replace with actual value if available)
total_trip_duration = 1800  # Example: 30 minutes = 1800 seconds

# Compute segment durations and colors
num_segments = len(route) - 1
segment_durations = total_trip_duration / num_segments
segment_colors = []

for i in range(num_segments):
    start = route[i]
    end = route[i + 1]
    distance_km = geodesic(start, end).kilometers
    speed_kmh = distance_km / (segment_durations / 3600)

    # Assign colors based on speed
    if speed_kmh < 20:
        segment_colors.append('red')     # Congested
    elif speed_kmh < 40:
        segment_colors.append('yellow')  # Moderate
    else:
        segment_colors.append('green')   # Free flow

# Create folium map
m = folium.Map(location=pickup_coords, zoom_start=13)

# Add pickup/dropoff markers
folium.Marker(pickup_coords, tooltip='Pickup', icon=folium.Icon(color='green')).add_to(m)
folium.Marker(dropoff_coords, tooltip='Dropoff', icon=folium.Icon(color='red')).add_to(m)

# Draw colored segments
for i in range(num_segments):
    folium.PolyLine(
        [route[i], route[i + 1]],
        color=segment_colors[i],
        weight=4.5,
        opacity=0.8
    ).add_to(m)

# Save and display the map
m.save("congestion_route_map.html")
m


IndentationError: unexpected indent (1393520787.py, line 39)

In [2]:
import pandas as pd
import requests
from geopy.distance import geodesic
import folium

# Load coordinates from CSV
coord_df = pd.read_csv('location_coordinates_10000.csv')

# Build the location_coords dictionary
location_coords = {
    int(row['LocationID']): (row['Latitude'], row['Longitude'])
    for _, row in coord_df.iterrows()
}

# Simulated new trip data
new_trip = pd.DataFrame({
    'PULocationID': [132],
    'DOLocationID': [148]
})

# Optionally filter from a larger dataset
filtered_routes = pd.read_csv("yellow-tripdata-2025-01 (1).csv")  # Replace with actual file if needed

# Print the first few rows to see what columns are available
print("Available columns in the dataset:")
print(filtered_routes.columns.tolist())
print("\nFirst few rows of data:")
print(filtered_routes.head(5))

# Extract pickup and dropoff LocationIDs
pickup_id = int(new_trip['PULocationID'].iloc[0])
dropoff_id = int(new_trip['DOLocationID'].iloc[0])

# Ensure valid location IDs
if pickup_id not in location_coords or dropoff_id not in location_coords:
    raise KeyError(f"Missing coordinates for LocationID(s): "
                   f"{pickup_id if pickup_id not in location_coords else ''} "
                   f"{dropoff_id if dropoff_id not in location_coords else ''}")

pickup_coords = location_coords[pickup_id]
dropoff_coords = location_coords[dropoff_id]

# Query OSRM for route
url = f"http://router.project-osrm.org/route/v1/driving/{pickup_coords[1]},{pickup_coords[0]};{dropoff_coords[1]},{dropoff_coords[0]}?overview=full&geometries=geojson"
response = requests.get(url)

# Check if the response is successful
if response.status_code == 200:
    data = response.json()
else:
    raise Exception(f"Failed to fetch route from OSRM API. Status code: {response.status_code}, Response: {response.text}")

# Extract route
route = data['routes'][0]['geometry']['coordinates']
route = [(lat, lon) for lon, lat in route]

# Estimate duration (adjust as needed)
total_trip_duration = 1800  # 30 minutes
num_segments = len(route) - 1
print(route)
segment_durations = total_trip_duration / num_segments

segment_colors = []
for i in range(num_segments):
    start = route[i]
    end = route[i + 1]
    distance_km = geodesic(start, end).kilometers
    speed_kmh = distance_km / (segment_durations / 3600)

    if speed_kmh < 20:
        segment_colors.append('red')     # Congested
    elif speed_kmh < 40:
        segment_colors.append('yellow')  # Moderate
    else:
        segment_colors.append('green')   # Free flow

# Create map
m = folium.Map(location=pickup_coords, zoom_start=13)

# Add markers
folium.Marker(pickup_coords, tooltip='Pickup', icon=folium.Icon(color='green')).add_to(m)
folium.Marker(dropoff_coords, tooltip='Dropoff', icon=folium.Icon(color='red')).add_to(m)

# Draw route with congestion colors
for i in range(num_segments):
    folium.PolyLine(
        [route[i], route[i + 1]],
        color=segment_colors[i],
        weight=4.5,
        opacity=0.8
    ).add_to(m)

# Save map
m.save("congestion_route_map.html")
m


  filtered_routes = pd.read_csv("yellow-tripdata-2025-01 (1).csv")  # Replace with actual file if needed


Available columns in the dataset:
['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'Airport_fee']

First few rows of data:
   VendorID        tpep_pickup_datetime       tpep_dropoff_datetime  \
0         2  2025-01-12 18:12:29.000000  2025-01-12 18:32:07.000000   
1         2  2025-01-12 18:32:40.000000  2025-01-12 18:41:11.000000   
2         1  2025-01-12 18:01:34.000000  2025-01-12 18:15:35.000000   
3         1  2025-01-12 18:36:05.000000  2025-01-12 18:49:45.000000   
4         1  2025-01-12 18:51:39.000000  2025-01-12 19:14:16.000000   

  passenger_count  trip_distance RatecodeID store_and_fwd_flag  PULocationID  \
0               2           1.07          1                  N           100   
1               1          

In [16]:
# import pandas as pd
# import joblib
# import xgboost as xgb

# # === Load Encoders and Model ===
# xgb_model = joblib.load('best_xgb_model.pkl')
# label_encoders = joblib.load('label_encoders.pkl')
# target_encoder = joblib.load('target_encoder.pkl')

# # === Load Your Input Data ===
# # Replace with your actual CSV or DataFrame
# data = pd.read_csv('yellow-tripdata-2025-01 (1).csv')
# # === Preprocess Data ===
# # Convert datetime columns to datetime type and extract features
# if 'tpep_pickup_datetime' in data.columns:
#     data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'], errors='coerce')
#     data['pickup_hour'] = data['tpep_pickup_datetime'].dt.hour
#     data['pickup_dayofweek'] = data['tpep_pickup_datetime'].dt.dayofweek

# if 'tpep_dropoff_datetime' in data.columns:
#     data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'], errors='coerce')

# # Drop original datetime columns
# data = data.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'], errors='ignore')

# # Convert object columns to category or numeric
# for col in data.select_dtypes(include=['object']).columns:
#     if col in label_encoders:
#         data[col] = data[col].map(lambda x: label_encoders[col].transform([x])[0] if x in label_encoders[col].classes_ else -1)
#     else:
#         data[col] = pd.to_numeric(data[col], errors='coerce').fillna(-1)

# # === Make Predictions ===
# preds_encoded = xgb_model.predict(data)

# # === Decode Predictions to Original Labels (if applicable) ===
# preds_decoded = target_encoder.inverse_transform(preds_encoded)

# # === Append and Display Results ===
# data['Predicted_Peak'] = preds_decoded
# print(data[['Predicted_Peak']])

# # Optionally save output
# data.to_csv('predicted_peakhours.csv', index=False)
