In [None]:
import pandas as pd
import requests
import pymysql
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import NearestNeighbors
from geopy.distance import geodesic
import folium

In [None]:

def fetch_communiyt_taxi_comapny_location(url, timeout=30):
    """Download one JSON resource and flatten it into a DataFrame.

    Used for the community, taxi, company and location endpoints.

    Parameters
    ----------
    url : str
        Full URL of the endpoint to query.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        API cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The decoded JSON payload flattened with ``pd.json_normalize``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of normalizing an error payload.
    response.raise_for_status()
    return pd.json_normalize(response.json())

# Endpoint URLs of the local API, keyed by resource name
# (community, taxi, company, location).
urls = dict(
    community="http://127.0.0.1:5000/community",
    taxi="http://127.0.0.1:5000/taxi",
    company="http://127.0.0.1:5000/company",
    location="http://127.0.0.1:5000/location",
)


def fetch_all_pages(base_url, start_page, end_page, timeout=30):
    """Download a range of pages from a paginated endpoint into one DataFrame.

    Each page is requested as ``{base_url}/?page={page}`` and its ``data``
    entry is flattened with ``pd.json_normalize``. Downloading stops at the
    first page that fails; pages fetched before the failure are kept.

    Parameters
    ----------
    base_url : str
        Endpoint URL without the page query string.
    start_page, end_page : int
        Inclusive page range to download.
    timeout : float, optional
        Seconds to wait for each request before treating it as failed.

    Returns
    -------
    pandas.DataFrame
        All fetched pages concatenated with a fresh index. An empty
        DataFrame is returned when no page could be fetched (or the range
        is empty) instead of letting ``pd.concat`` raise on an empty list.
    """
    all_data_frames = []
    for page in range(start_page, end_page + 1):
        try:
            response = requests.get(f"{base_url}/?page={page}", timeout=timeout)
            response.raise_for_status()  # Raises an error for non-200 responses
            page_data = response.json()
            page_frame = pd.json_normalize(page_data['data'])
        except (requests.RequestException, ValueError, KeyError) as e:
            # ValueError: body is not valid JSON; KeyError: payload has no 'data'.
            print(f"An error occurred on page {page}: {e}")
            break  # or 'continue' to skip this page and move to the next
        all_data_frames.append(page_frame)

    if not all_data_frames:
        return pd.DataFrame()
    return pd.concat(all_data_frames, ignore_index=True)

# Paginated trips endpoint and the inclusive page range to download
base_url = "http://127.0.0.1:5000/trips/4"
start_page, end_page = 1, 95



In [None]:
# Download the four lookup tables in a fixed order, then the paginated trips.
community, taxi, company, location = (
    fetch_communiyt_taxi_comapny_location(urls[resource])
    for resource in ("community", "taxi", "company", "location")
)
all_trips = fetch_all_pages(base_url, start_page, end_page)



In [None]:
# Work on an independent (deep) copy so the downloaded frame stays untouched.
trips = all_trips.copy(deep=True)

In [None]:
# Preview the first rows of every downloaded table.
for table in (trips, community, taxi, company, location):
    display(table.head())


In [None]:
# Show the (rows, columns) shape of every downloaded table.
for table in (trips, community, taxi, company, location):
    display(table.shape)


In [None]:
# Summarize dtypes and non-null counts of every downloaded table.
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" per table.
for table in (trips, community, taxi, company, location):
    table.info()

In [None]:
# Attach company details to each trip; left joins keep trips without a match.
trips = pd.merge(trips, company, left_on='company_id', right_on='company_id', how='left')
# Join the community table twice: first for the drop-off area, then for the pickup area.
# The second join collides with the columns added by the first, so pandas suffixes them:
# `_x` = drop-off community columns, `_y` = pickup community columns.
trips = pd.merge(trips, community, left_on='dropoff_community_area', right_on='community_number', how='left')
trips = pd.merge(trips, community, left_on='pickup_community_area', right_on='community_number', how='left')
# Join the location table twice in the same drop-off-then-pickup order.
# NOTE(review): the drop/rename below imply an `address` column already exists before these
# joins (presumably from the company table), so the drop-off join yields address_x (earlier
# column) / address_y (drop-off) and the pickup join adds an unsuffixed `address` — confirm
# against the company/location schemas.
trips = pd.merge(trips, location, left_on='dropoff_location', right_on='location_coordinates', how='left')
trips = pd.merge(trips, location, left_on='pickup_location', right_on='location_coordinates', how='left')

# Drop join keys, duplicated lookup columns and contact fields that are not used afterwards.
# The numeric community-area columns are dropped here so their names can be reused below.
trips.drop(columns=['population_y', 'community_number_y', 'population_x', 'community_number_x', 'zip', 
                        'dispatch_phone', 'email', 'taxi_exterior_color', 'city_state', 'business_phone',
                   'dropoff_community_area','pickup_community_area', 'address_x','location_coordinates_x','location_coordinates_y'], inplace=True)


# Give the surviving suffixed columns descriptive names (suffix meaning follows the merge order above).
trips.rename(columns={'community_name_x': 'dropoff_community_area', 'community_name_y': 'pickup_community_area','address_y': 'dropoff_address','address': 'pickup_address',}, inplace=True)




In [None]:
# List the columns of the merged trips frame (rendered as the cell output).
trips.columns

In [None]:
# Count missing values per column.
trips.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): this runs before the column subset is selected in a later cell, so nulls in
# columns that are discarded afterwards also remove rows — confirm this is intended.
trips = trips.dropna()

In [None]:
# Re-check the available column names after dropping incomplete rows.
trips.columns

In [None]:
# Keep only the columns used by the analysis, in presentation order.
selected_columns = [
    'unique_key',
    'taxi_id',
    'company',
    'trip_start_timestamp',
    'trip_end_timestamp',
    'trip_seconds',
    'trip_miles',
    'trip_total',
    'payment_type',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_community_area',
    'dropoff_community_area',
    'pickup_address',
    'dropoff_address',
]
trips = trips[selected_columns]

In [None]:
# Summarize columns, dtypes and non-null counts before the type conversions.
trips.info()

In [None]:
# Parse the HTTP-date timestamps as UTC datetimes and cast the fare and
# coordinate columns to float64.
# `trips` was produced by a column selection, so assigning into it column by
# column triggers SettingWithCopyWarning; build a new frame instead.
timestamp_columns = ['trip_start_timestamp', 'trip_end_timestamp']
float_columns = ['trip_total', 'pickup_latitude', 'pickup_longitude',
                 'dropoff_latitude', 'dropoff_longitude']

trips = trips.assign(**{
    column: pd.to_datetime(trips[column], format='%a, %d %b %Y %H:%M:%S GMT', utc=True)
    for column in timestamp_columns
}).astype({column: 'float64' for column in float_columns})

In [None]:
# Confirm the dtypes after the timestamp and float conversions.
trips.info()

In [None]:
# Draw a 1% sample of the trips; the fixed seed makes the sample (and every
# plot and cluster derived from it) reproducible across runs.
trips_sampled = trips.sample(frac=0.01, random_state=0)

In [None]:
# Save the sample to CSV (without the index column); a later cell reloads it from this file.
trips_sampled.to_csv('trips_sampled.csv', index=False)

In [2]:
# Re-import pandas so this cell can run on its own — presumably a restart point
# after a fresh kernel; verify.
import pandas as pd
# Reload the saved sample. No date parsing is requested here, so the timestamp
# columns come back as strings and are re-parsed in later cells.
trips_sampled = pd.read_csv('trips_sampled.csv')

In [None]:
# Parse the trip start times and tag every trip with its calendar month.
trips_sampled['trip_start_timestamp'] = pd.to_datetime(trips_sampled['trip_start_timestamp'])
trips_sampled['year_month'] = trips_sampled['trip_start_timestamp'].dt.to_period('M')

# Number of distinct trips per month.
monthly_counts = trips_sampled.groupby('year_month')['unique_key'].nunique()

# Line chart of the monthly totals, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
monthly_counts.plot(kind='line', ax=ax)
ax.set_title('Monthly Total Counts of Trips')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Trips')
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Setting the aesthetic style of the plots
sns.set(style="whitegrid")

# EDA: Demand Analysis Over Time

# The 'hour' and 'weekday' columns are only added by the feature-engineering
# cell further down, so grouping by them here fails on a fresh
# Restart & Run All. Derive both directly from the start timestamp instead.
pickup_times = pd.to_datetime(trips_sampled['trip_start_timestamp'])

# Analyzing demand patterns across different hours of the day
hourly_demand = (
    trips_sampled.groupby(pickup_times.dt.hour.rename('hour'))
    .size()
    .reset_index(name='count')
)
plt.figure(figsize=(12, 6))
sns.barplot(x='hour', y='count', data=hourly_demand)
plt.title('Taxi Demand by Hour of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Pickups')
plt.show()

# Analyzing demand patterns across different days of the week
daily_demand = (
    trips_sampled.groupby(pickup_times.dt.weekday.rename('weekday'))
    .size()
    .reset_index(name='count')
)
plt.figure(figsize=(12, 6))
sns.barplot(x='weekday', y='count', data=daily_demand)
plt.title('Taxi Demand by Day of Week')
plt.xlabel('Day of Week (0=Monday, 6=Sunday)')
plt.ylabel('Number of Pickups')
plt.show()


# Modeling Process
- **Feature Engineering**: Extracts the weekday and hour of day from `trip_start_timestamp`.
- **Counting Pickups**: Groups the data by pickup latitude and longitude to count pickups at each location.
- **Clustering**: Applies KMeans clustering to identify popular areas.
- **Cluster Centers**: Calculates the center of each cluster for the Nearest Neighbors model.
- **Nearest Neighbors Model**: Trains the model to find the nearest clusters based on a given location.
- **Recommendation Function**: Takes the current location, hour of day, and weekday, and recommends up to five nearby pickup clusters, ranked by how many historical pickups each had at that hour and weekday.

In [3]:
# Feature engineering: parse the start time once, then derive the weekday
# (0=Monday) and hour-of-day columns from it.
start_times = pd.to_datetime(trips_sampled['trip_start_timestamp'])
trips_sampled['trip_start_timestamp'] = start_times
trips_sampled['weekday'] = start_times.dt.weekday
trips_sampled['hour'] = start_times.dt.hour

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Count the pickups at each distinct (latitude, longitude) pair.
coordinate_columns = ['pickup_latitude', 'pickup_longitude']
pickup_counts = (
    trips_sampled.groupby(coordinate_columns)
    .size()
    .reset_index(name='pickup_count')
)

# Cluster the distinct pickup coordinates into 10 groups.
# n_init is set explicitly to the value used so far; leaving it unset emits
# scikit-learn's FutureWarning about the changing default.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
pickup_counts['cluster'] = kmeans.fit_predict(pickup_counts[coordinate_columns])

# Attach each trip's cluster label. Any 'cluster' column left over from an
# earlier run of this cell is dropped first so re-running it does not produce
# cluster_x / cluster_y duplicates.
trips_sampled = (
    trips_sampled.drop(columns='cluster', errors='ignore')
    .merge(pickup_counts[coordinate_columns + ['cluster']],
           on=coordinate_columns,
           how='left')
)


In [None]:
# Cluster centers: mean coordinate of the distinct pickup points in each
# cluster, indexed by cluster label.
cluster_centers = pickup_counts.groupby('cluster')[['pickup_latitude', 'pickup_longitude']].mean()

# Nearest-neighbours index over the cluster centers.
# It is fitted on the raw array rather than the DataFrame: fitting on a frame
# records feature names, and later queries with a plain (lat, lon) tuple then
# emit scikit-learn's "X does not have valid feature names" warning.
# Row i of the fitted array corresponds to row i of `cluster_centers`.
nearest_neighbors_model = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nearest_neighbors_model.fit(cluster_centers.to_numpy())

In [None]:
# Function to get location details using Nominatim API
def get_location_details(lat, lon):
    """Reverse-geocode a coordinate with the Nominatim API.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the point to look up.

    Returns
    -------
    pandas.Series
        Always a one-element Series: the ``display_name`` reported by
        Nominatim (``None`` if the response has no such field), or the
        string ``"Error"`` when the request fails, times out, returns a
        non-200 status or is not valid JSON. A constant length keeps the
        result safe to assign to a single column.
    """
    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/reverse",
            params={"lat": lat, "lon": lon, "format": "json"},
            # Nominatim's usage policy asks clients to identify themselves;
            # generic HTTP-library user agents may be rejected.
            headers={"User-Agent": "taxi-pickup-recommender-notebook"},
            timeout=10,
        )
        if response.status_code != 200:
            return pd.Series(["Error"])
        location_name = response.json().get('display_name')
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or a body that is not valid JSON.
        return pd.Series(["Error"])
    return pd.Series([location_name])

In [None]:
# Function to Recommend Pickup Locations
def recommend_pickup_locations(current_location, current_time, current_date, model, cluster_data, historical_data):
    # Finding the nearest clusters to the current location
    distances, indices = model.kneighbors([current_location])
    nearest_clusters = indices[0]

    # Filtering historical data for the given time and date
    filtered_data = historical_data[(historical_data['hour'] == current_time) & (historical_data['weekday'] == current_date)]

    # Ranking the nearest clusters based on their historical popularity
    ranked_clusters = filtered_data[filtered_data['cluster'].isin(nearest_clusters)].groupby('cluster').size().reset_index(name='count')
    ranked_clusters = ranked_clusters.sort_values(by='count', ascending=False).head(5)

    # Merging with cluster_data to get the location details
    recommended_locations = ranked_clusters.merge(cluster_data, on='cluster')

    # Adding address and distance information
    recommended_locations[['address']] = recommended_locations.apply(lambda row: get_location_details(row['pickup_latitude'], row['pickup_longitude']), axis=1)
    recommended_locations['distance_km'] = recommended_locations.apply(lambda row: geodesic(current_location, (row['pickup_latitude'], row['pickup_longitude'])).kilometers, axis=1)

    return recommended_locations

In [11]:
def create_map(current_location, recommended_locations):
    """Build a folium map of the current location and the recommended pickups.

    Parameters
    ----------
    current_location : tuple of float
        ``(latitude, longitude)`` used as the map center and red marker.
    recommended_locations : pandas.DataFrame
        Rows with ``pickup_latitude``, ``pickup_longitude``, ``address`` and
        ``distance_km`` columns; one blue marker is added per row.

    Returns
    -------
    folium.Map
        The map with all markers added.
    """
    # Named `location_map` so the builtin `map` is not shadowed.
    location_map = folium.Map(location=current_location, zoom_start=12)

    # Add a marker for the current location
    folium.Marker(
        current_location,
        popup='Current Location',
        icon=folium.Icon(color='red', icon='info-sign')
    ).add_to(location_map)

    # Add markers for the recommended pickup locations. Ranks come from the
    # row order, so they stay 1..n even if the frame's index is not 0-based.
    for rank, (_, row) in enumerate(recommended_locations.iterrows(), start=1):
        popup_info = f"{rank}. {row['address']} (Distance: {row['distance_km']:.2f} km)"
        folium.Marker(
            [row['pickup_latitude'], row['pickup_longitude']],
            popup=popup_info,
            icon=folium.Icon(color='blue', icon='star')
        ).add_to(location_map)

    return location_map


In [14]:
# Demo: recommendations for a sample location on a Friday afternoon.
# Requires nearest_neighbors_model, cluster_centers and trips_sampled from the cells above.
example_location = (41.90, -87.65)  # (latitude, longitude)
example_time = 15  # hour of day (3 PM)
example_date = 4  # weekday number (Friday)

recommended_pickups = recommend_pickup_locations(
    current_location=example_location,
    current_time=example_time,
    current_date=example_date,
    model=nearest_neighbors_model,
    cluster_data=cluster_centers,
    historical_data=trips_sampled,
)
pickup_map = create_map(example_location, recommended_pickups)

# Render the recommendation table, then the interactive map.
display(recommended_pickups, pickup_map)



Unnamed: 0,cluster,count,pickup_latitude,pickup_longitude,address,distance_km
0,8,44,41.894928,-87.63454,"206-208, West Huron Street, River North, Chica...",1.401163
1,4,7,41.907498,-87.684663,"1423, North Oakley Boulevard, Wicker Park, Wes...",2.9943
2,5,3,41.849678,-87.646896,"2316-2330, South Halsted Street, Lower West Si...",5.595254
3,6,3,41.972026,-87.670577,"1618-1624, West Ainslie Street, Winnemac, Upto...",8.180048
4,0,1,41.937757,-87.653603,"3049, North Sheffield Avenue, Northalsted, Lak...",4.204364
