### Set Up

In [65]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans


In [11]:
rides = pd.read_parquet("output/rides/part-00000-bbe7aca0-61f1-4f7b-bdf9-a7efdd66c11f-c000.snappy.parquet")
print(rides.columns)
rides.head()

Index(['event_id', 'ride_id', 'event_type', 'timestamp', 'user_id',
       'driver_id', 'pickup_latitude', 'pickup_longitude', 'pickup_address',
       'pickup_city', 'dropoff_latitude', 'dropoff_longitude',
       'dropoff_address', 'dropoff_city', 'distance_km',
       'estimated_duration_minutes', 'actual_duration_minutes', 'vehicle_type',
       'base_fare', 'surge_multiplier', 'total_fare', 'payment_method',
       'payment_status', 'payment_id', 'user_to_driver_rating',
       'driver_to_user_rating', 'user_comment', 'driver_comment',
       'canceled_by', 'cancellation_reason', 'cancellation_fee',
       'traffic_level', 'estimated_delay_minutes', 'driver_latitude',
       'driver_longitude', 'driver_heading', 'driver_speed_kmh', 'app_version',
       'platform', 'session_id'],
      dtype='object')


Unnamed: 0,event_id,ride_id,event_type,timestamp,user_id,driver_id,pickup_latitude,pickup_longitude,pickup_address,pickup_city,...,cancellation_fee,traffic_level,estimated_delay_minutes,driver_latitude,driver_longitude,driver_heading,driver_speed_kmh,app_version,platform,session_id
0,580c1c25ec24_5,R-7ab1d1b6fd,RIDE_COMPLETED,2025-01-01 08:11:15.778,U000205,D000090,40.40846,-3.70237,6528 Calle de Argumosa,Madrid,...,,HIGH,26,40.498333,-3.567498,309.957733,0.0,4.7.0,iOS,S-884574d4
1,a6f88d2bd2a9_3,R-8f0317bd51,DRIVER_ARRIVED,2025-01-01 07:22:13.399,U000071,D000071,40.461591,-3.676449,1314 Calle de Padre Damián,Madrid,...,,SEVERE,31,40.461591,-3.676449,310.453857,0.0,4.6.0,iOS,S-0db1fb52
2,9e531ddbb954_1,R-9f0cfbb2b9,RIDE_REQUESTED,2025-01-01 07:37:28.079,U000097,,40.416757,-3.703749,5935 Calle Gran Via,Madrid,...,,SEVERE,46,,,,,4.6.0,Android,S-c892e1bb
3,9e531ddbb954_2,R-9f0cfbb2b9,DRIVER_ASSIGNED,2025-01-01 07:37:39.756,U000097,D000078,40.416757,-3.703749,5935 Calle Gran Via,Madrid,...,,SEVERE,46,40.400801,-3.700461,247.237595,38.600246,4.6.0,Android,S-c892e1bb
4,9e531ddbb954_5,R-9f0cfbb2b9,RIDE_COMPLETED,2025-01-01 07:51:05.103,U000097,D000078,40.416757,-3.703749,5935 Calle Gran Via,Madrid,...,,SEVERE,46,40.416664,-3.703755,325.456879,0.0,4.6.0,Android,S-c892e1bb


In [10]:
special = pd.read_parquet("output\specials\part-00001-ddcf2c16-9f59-43db-93aa-d942715d5770-c000.snappy.parquet")
print(special.columns)
special.head()

Index(['event_type', 'event_name', 'venue_zone', 'venue_latitude',
       'venue_longitude', 'venue_address', 'venue_city', 'event_start',
       'event_end', 'arrivals_start', 'arrivals_end', 'departures_start',
       'departures_end', 'arrival_rides', 'departure_rides',
       'estimated_attendees'],
      dtype='object')


  special = pd.read_parquet("output\specials\part-00001-ddcf2c16-9f59-43db-93aa-d942715d5770-c000.snappy.parquet")


Unnamed: 0,event_type,event_name,venue_zone,venue_latitude,venue_longitude,venue_address,venue_city,event_start,event_end,arrivals_start,arrivals_end,departures_start,departures_end,arrival_rides,departure_rides,estimated_attendees
0,concert,Concert Event 1,retiro,40.4146,-3.6868,795 Retiro Calle,Madrid,2025-01-01 19:00:00,2025-01-01 22:00:00,2025-01-01 16:00:00,2025-01-01 19:15:00,2025-01-01 21:30:00,2025-01-02 00:00:00,1079,1214,5200
1,concert,Concert Event 2,airport,40.498299,-3.5676,598 Airport Calle,Madrid,2025-01-01 19:00:00,2025-01-01 22:00:00,2025-01-01 16:00:00,2025-01-01 19:15:00,2025-01-01 21:30:00,2025-01-02 00:00:00,2090,2351,8004
2,concert,Concert Event 1,retiro,40.4146,-3.6868,795 Retiro Calle,Madrid,2025-01-01 19:00:00,2025-01-01 22:00:00,2025-01-01 16:00:00,2025-01-01 19:15:00,2025-01-01 21:30:00,2025-01-02 00:00:00,1079,1214,5200
3,concert,Concert Event 2,airport,40.498299,-3.5676,598 Airport Calle,Madrid,2025-01-01 19:00:00,2025-01-01 22:00:00,2025-01-01 16:00:00,2025-01-01 19:15:00,2025-01-01 21:30:00,2025-01-02 00:00:00,2090,2351,8004
4,sports,Atlético de Madrid vs Sevilla FC Game,malasana,40.426498,-3.7025,301 Sports Stadium,Madrid,2025-01-01 16:00:00,2025-01-01 19:00:00,2025-01-01 14:00:00,2025-01-01 16:15:00,2025-01-01 18:45:00,2025-01-01 20:30:00,3113,4003,37080


### Creating User Profiles

#### Determining if Special Events' Rides

In [35]:
latitude_adjustment = 0.00449 # Equivalent to 0.5km
longitude_adjustment = 0.0059 # Equivalent to 0.5km
loc_spec_lat = special.iloc[0]["venue_latitude"]
loc_spec_lon = special.iloc[0]["venue_longitude"]

lat_interval = [loc_spec_lat - latitude_adjustment, loc_spec_lat + latitude_adjustment]
lon_interval = [loc_spec_lon - longitude_adjustment, loc_spec_lon + longitude_adjustment]


for row in rides.iterrows():
    if row[1]["pickup_latitude"] >= lat_interval[0] and row[1]["pickup_latitude"] <= lat_interval[1] and row[1]["pickup_longitude"] >= lon_interval[0] and row[1]["pickup_longitude"] <= lon_interval[1]:
        print("Leaving -> ride id: ", row[0])
    elif row[1]["dropoff_latitude"] >= lat_interval[0] and row[1]["dropoff_latitude"] <= lat_interval[1] and row[1]["dropoff_longitude"] >= lon_interval[0] and row[1]["dropoff_longitude"] <= lon_interval[1]:
        print("Coming -> ride id: ", row[0])



Coming -> ride id:  8
Coming -> ride id:  9
Coming -> ride id:  10
Coming -> ride id:  23
Coming -> ride id:  24
Coming -> ride id:  25
Coming -> ride id:  28
Coming -> ride id:  38
Coming -> ride id:  39
Coming -> ride id:  40
Leaving -> ride id:  44
Leaving -> ride id:  45
Leaving -> ride id:  46
Leaving -> ride id:  47
Coming -> ride id:  71
Coming -> ride id:  72
Leaving -> ride id:  73
Leaving -> ride id:  87
Leaving -> ride id:  92
Leaving -> ride id:  93
Leaving -> ride id:  94
Leaving -> ride id:  103
Leaving -> ride id:  115
Leaving -> ride id:  143
Coming -> ride id:  152
Coming -> ride id:  153
Coming -> ride id:  154
Coming -> ride id:  157


In [40]:
# Function to check if a point is within the defined area
def is_within_area(lat, lon):
    if pd.isna(lat) or pd.isna(lon):
        return False
    return (lat_interval[0] <= lat <= lat_interval[1] and 
            lon_interval[0] <= lon <= lon_interval[1])

def determine_ride_direction(row, venue_lat, venue_lon, distance_km=0.5):
    """
    Determines if a ride is coming to, leaving from, or unrelated to a venue.
    
    Parameters:
    - row: A row from the rides dataframe
    - venue_lat: Latitude of the venue
    - venue_lon: Longitude of the venue
    - distance_km: Distance threshold in kilometers
    
    Returns:
    - 'coming', 'leaving', or None
    """
    # Constants for latitude/longitude adjustments
    latitude_adjustment = 0.009898 * distance_km
    longitude_adjustment = 0.00118 * distance_km
    
    # Define intervals around the venue
    lat_interval = [venue_lat - latitude_adjustment, venue_lat + latitude_adjustment]
    lon_interval = [venue_lon - longitude_adjustment, venue_lon + longitude_adjustment]
    
    # Check if pickup/dropoff locations are within the area
    def is_within_area(lat, lon):
        if pd.isna(lat) or pd.isna(lon):
            return False
        return (lat_interval[0] <= lat <= lat_interval[1] and 
                lon_interval[0] <= lon <= lon_interval[1])
    
    pickup_in_area = is_within_area(row["pickup_latitude"], row["pickup_longitude"])
    dropoff_in_area = is_within_area(row["dropoff_latitude"], row["dropoff_longitude"])
    
    if pickup_in_area and not dropoff_in_area:
        return 'leaving'
    elif not pickup_in_area and dropoff_in_area:
        return 'coming'
    else:
        return None


In [39]:
a = determine_ride_direction(rides.iloc[72], loc_spec_lat, loc_spec_lon)
print(a)

coming


In [41]:
def analyze_rides_for_all_events(rides_df, special_df, distance_km=0.5):
    """
    Creates two new columns in the rides dataframe:
    1. 'event_direction': indicates if a ride was coming to, leaving from, or unrelated to any event
    2. 'event_name': indicates which event the ride was related to, if any
    
    Parameters:
    - rides_df: DataFrame containing ride data
    - special_df: DataFrame containing special event data
    - distance_km: Distance threshold in kilometers
    
    Returns:
    - rides_df with two new columns
    """
    # Create a dictionary of unique events with their coordinates
    # Use event_name as key and (lat, lon) as value
    events_dict = {}
    
    # Drop duplicates to get unique events (based on name, latitude, and longitude)
    unique_events = special_df.drop_duplicates(subset=['event_name', 'venue_latitude', 'venue_longitude', 'event_type'])
    
    for _, event in unique_events.iterrows():
        event_type = event['event_type']
        lat = event['venue_latitude']
        lon = event['venue_longitude']
        events_dict[event_type] = (lat, lon)
    
    # Initialize new columns
    rides_df['event_direction'] = None
    rides_df['event_name'] = None
    
    # Process each ride
    for idx, ride in rides_df.iterrows():
        # Check each event for this ride
        for event_name, (lat, lon) in events_dict.items():
            direction = determine_ride_direction(ride, lat, lon, distance_km)
            
            # If we found a relationship with this event, update the columns and break
            if direction is not None:
                rides_df.at[idx, 'event_direction'] = direction
                rides_df.at[idx, 'event_name'] = event_name
                break
    
    return rides_df

In [42]:
rides_with_event_info = analyze_rides_for_all_events(rides, special)

#### Creating User Vectors

In [46]:
# Subset of the data to be used for user profile creation
rides_subset = rides_with_event_info[["timestamp","user_id","pickup_latitude",
                       "pickup_longitude","dropoff_latitude","dropoff_longitude",
                       "event_direction","event_name",
                       "distance_km"]]
rides_subset["month"] = rides_subset["timestamp"].dt.month
rides_subset["day"] = rides_subset["timestamp"].dt.day
rides_subset["hour"] = rides_subset["timestamp"].dt.hour
rides_subset["day_of_week"] = rides_subset["timestamp"].dt.dayofweek
rides_subset["day_of_year"] = rides_subset["timestamp"].dt.dayofyear

rides_subset.drop(columns=["timestamp"], inplace=True)
rides_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_subset["month"] = rides_subset["timestamp"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_subset["day"] = rides_subset["timestamp"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_subset["hour"] = rides_subset["timestamp"].dt.hour
A value is trying to be set on a c

Unnamed: 0,user_id,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,event_direction,event_name,distance_km,month,day,hour,day_of_week,day_of_year
0,U000205,40.40846,-3.70237,40.498333,-3.567498,coming,concert,15.168999,1,1,8,2,1
1,U000071,40.461591,-3.676449,40.461646,-3.676667,,,0.019388,1,1,7,2,1
2,U000097,40.416757,-3.703749,40.416664,-3.703755,,,0.010333,1,1,7,2,1
3,U000097,40.416757,-3.703749,40.416664,-3.703755,,,0.010333,1,1,7,2,1
4,U000097,40.416757,-3.703749,40.416664,-3.703755,,,0.010333,1,1,7,2,1


In [52]:
# Create user profile features
user_features = pd.DataFrame()

# Group by user_id to create aggregated features
user_groups = rides_subset.groupby('user_id')

# Basic stats features
user_features['user_id'] = user_groups['user_id'].first()
user_features['total_rides'] = user_groups.size()
user_features['avg_distance_km'] = user_groups['distance_km'].mean()
user_features['max_distance_km'] = user_groups['distance_km'].max()


In [53]:
# Location features - average pickup and dropoff coordinates
user_features['avg_pickup_latitude'] = user_groups['pickup_latitude'].mean()
user_features['avg_pickup_longitude'] = user_groups['pickup_longitude'].mean()
user_features['avg_dropoff_latitude'] = user_groups['dropoff_latitude'].mean()
user_features['avg_dropoff_longitude'] = user_groups['dropoff_longitude'].mean()

In [54]:
# Time-based features
user_features['most_common_hour'] = user_groups['hour'].agg(lambda x: x.value_counts().index[0])
user_features['most_common_day_of_week'] = user_groups['day_of_week'].agg(lambda x: x.value_counts().index[0])
user_features['weekend_ride_ratio'] = user_groups['day_of_week'].apply(
    lambda x: sum(x.isin([5, 6])) / len(x) if len(x) > 0 else 0
)

In [57]:
# Event-related features - corrected with your naming convention
user_features['to_event_ratio'] = user_groups['event_direction'].apply(
    lambda x: sum(x == 'comming') / len(x) if len(x) > 0 else 0
)
user_features['from_event_ratio'] = user_groups['event_direction'].apply(
    lambda x: sum(x == 'leaving') / len(x) if len(x) > 0 else 0
)

# Get most common event attended per user - fixed to handle empty value_counts
user_features['most_common_event'] = user_groups['event_name'].agg(
    lambda x: x.value_counts().index[0] if len(x.value_counts()) > 0 else 'no_event'
)

In [59]:
# Calculate variance in ride patterns
user_features['distance_variance'] = user_groups['distance_km'].var()
user_features['hour_variance'] = user_groups['hour'].var()

# Reset index for the final dataframe
user_features = user_features.reset_index(drop=True)

# Display the features
print(user_features.shape)
print(user_features.columns)

(64, 16)
Index(['user_id', 'total_rides', 'avg_distance_km', 'max_distance_km',
       'avg_pickup_latitude', 'avg_pickup_longitude', 'avg_dropoff_latitude',
       'avg_dropoff_longitude', 'most_common_hour', 'most_common_day_of_week',
       'weekend_ride_ratio', 'to_event_ratio', 'from_event_ratio',
       'most_common_event', 'distance_variance', 'hour_variance'],
      dtype='object')


In [64]:
# Encoding the most common event
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
user_features["most_common_event"] = le.fit_transform(user_features["most_common_event"])
user_features["most_common_event"] = user_features["most_common_event"].astype(int)
user_features.head()

Unnamed: 0,user_id,total_rides,avg_distance_km,max_distance_km,avg_pickup_latitude,avg_pickup_longitude,avg_dropoff_latitude,avg_dropoff_longitude,most_common_hour,most_common_day_of_week,weekend_ride_ratio,to_event_ratio,from_event_ratio,most_common_event,distance_variance,hour_variance
0,U000001,3,5.264387,5.264387,40.461351,-3.676616,40.414646,-3.686804,7,2,0.0,0.0,0.0,1,0.0,0.333333
1,U000009,1,14.684679,14.684679,40.498452,-3.567522,40.416779,-3.703914,8,2,0.0,0.0,1.0,0,,
2,U000011,4,0.030949,0.030949,40.498093,-3.567495,40.498248,-3.567799,7,2,0.0,0.0,0.0,1,0.0,1.333333
3,U000014,3,0.015342,0.015342,40.416778,-3.703706,40.416882,-3.703825,8,2,0.0,0.0,0.0,1,0.0,0.333333
4,U000017,3,5.468232,5.468232,40.461546,-3.676575,40.416934,-3.703761,7,2,0.0,0.0,0.0,1,0.0,0.333333


### Creating Outlier Detector

#### Preparing Data

In [76]:
# Function to map levels to indices
def map_level(level, dictionary):
    if level in dictionary:
        return dictionary[level]
    else:
        # Return the next index after known categories as the "other" category
        return len(dictionary)

In [78]:
rides_outlier_subset = rides_with_event_info[["timestamp","pickup_latitude",
                       "pickup_longitude","dropoff_latitude","dropoff_longitude",
                       "traffic_level","distance_km","estimated_duration_minutes",
                       "actual_duration_minutes","estimated_delay_minutes",
                       "driver_speed_kmh"]]
rides_outlier_subset["month"] = rides_outlier_subset["timestamp"].dt.month
rides_outlier_subset["day"] = rides_outlier_subset["timestamp"].dt.day
rides_outlier_subset["hour"] = rides_outlier_subset["timestamp"].dt.hour
rides_outlier_subset["day_of_week"] = rides_outlier_subset["timestamp"].dt.dayofweek
rides_outlier_subset.drop(columns=["timestamp"], inplace=True)
rides_outlier_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_outlier_subset["month"] = rides_outlier_subset["timestamp"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_outlier_subset["day"] = rides_outlier_subset["timestamp"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_outlier_subset["hour"] = rides_outlier_subset["timest

Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,traffic_level,distance_km,estimated_duration_minutes,actual_duration_minutes,estimated_delay_minutes,driver_speed_kmh,month,day,hour,day_of_week
0,40.40846,-3.70237,40.498333,-3.567498,HIGH,15.168999,37,70.0,26,0.0,1,1,8,2
1,40.461591,-3.676449,40.461646,-3.676667,SEVERE,0.019388,12,,31,0.0,1,1,7,2
2,40.416757,-3.703749,40.416664,-3.703755,SEVERE,0.010333,10,,46,,1,1,7,2
3,40.416757,-3.703749,40.416664,-3.703755,SEVERE,0.010333,10,,46,38.600246,1,1,7,2
4,40.416757,-3.703749,40.416664,-3.703755,SEVERE,0.010333,10,0.0,46,0.0,1,1,7,2


In [80]:
# Get unique traffic levels
unique_traffic_levels = rides_outlier_subset["traffic_level"].unique()

# Create a mapping dictionary with an index for each unique traffic level
traffic_levels = {}
for i, level in enumerate(unique_traffic_levels):
    traffic_levels[level] = i

# Apply the mapping function
rides_outlier_subset["traffic_level"] = rides_outlier_subset["traffic_level"].apply(map_level, args=(traffic_levels,))

# Alternatively, you can use the map function with a default value
# rides_outlier_subset["traffic_level"] = rides_outlier_subset["traffic_level"].map(
#     traffic_levels).fillna(len(traffic_levels)).astype(int)

rides_outlier_subset.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_outlier_subset["traffic_level"] = rides_outlier_subset["traffic_level"].apply(map_level, args=(traffic_levels,))


Unnamed: 0,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,traffic_level,distance_km,estimated_duration_minutes,actual_duration_minutes,estimated_delay_minutes,driver_speed_kmh,month,day,hour,day_of_week
0,40.40846,-3.70237,40.498333,-3.567498,0,15.168999,37,70.0,26,0.0,1,1,8,2
1,40.461591,-3.676449,40.461646,-3.676667,1,0.019388,12,,31,0.0,1,1,7,2
2,40.416757,-3.703749,40.416664,-3.703755,1,0.010333,10,,46,,1,1,7,2
3,40.416757,-3.703749,40.416664,-3.703755,1,0.010333,10,,46,38.600246,1,1,7,2
4,40.416757,-3.703749,40.416664,-3.703755,1,0.010333,10,0.0,46,0.0,1,1,7,2


#### Fitting & Testing

In [None]:
from sklearn.svm import OneClassSVM
from joblib import dump, load

one_svm = OneClassSVM(nu=0.01, kernel="rbf", gamma=0.01)
one_svm.fit(user_features)

In [None]:
scores = one_svm.decision_function(user_features)

In [None]:
dump(one_svm, 'oneclass_svm_model.joblib')

loaded_model = load('oneclass_svm_model.joblib')

predictions = loaded_model.predict(user_features)