<a href="https://colab.research.google.com/github/felixkemboi/de_zoomcamp_homework/blob/main/Week_One_Homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=38dbab84c15f48f31e3d2677f6fe18c9c174d9d8952b574e2a23c29f1c1be715
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [3]:
import pandas as pd
import wget
import gzip
import shutil
import os
from datetime import datetime

def download_and_prepare_data():
    """Fetch the input files into the current working directory if they are missing.

    Two files are ensured:

    * ``green_tripdata_2019-10.csv`` - downloaded as a ``.csv.gz`` archive and
      decompressed locally (the archive is left on disk).
    * ``taxi_zone_lookup.csv`` - downloaded as-is.

    A file that already exists is never downloaded again, so re-running the
    cell is cheap.

    Returns:
        None. The function works purely through side effects (files on disk,
        progress messages on stdout).
    """
    trips_csv = 'green_tripdata_2019-10.csv'
    zones_csv = 'taxi_zone_lookup.csv'

    if not os.path.exists(trips_csv):
        print("Downloading green taxi data...")
        # Use the path wget reports back instead of assuming the URL's basename:
        # when a file with that name already exists (e.g. a leftover archive from
        # an interrupted run) wget saves under a different name, and opening the
        # assumed name would decompress the stale file.
        archive_path = wget.download('https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz')

        # Decompress into a scratch file and rename only on success, so a failed
        # or interrupted decompression never leaves a truncated CSV that the
        # existence check above would accept on the next run.
        partial_csv = trips_csv + '.part'
        try:
            with gzip.open(archive_path, 'rb') as f_in, open(partial_csv, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.replace(partial_csv, trips_csv)
        finally:
            # After a successful rename the scratch file is gone; this only
            # cleans up after a failure.
            if os.path.exists(partial_csv):
                os.remove(partial_csv)

    if not os.path.exists(zones_csv):
        print("\nDownloading zone lookup data...")
        wget.download('https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv')

def question3_trip_segments(df):
    """Count trips per distance bracket for pickups in October 2019.

    A row is kept when its ``lpep_pickup_datetime`` value compares
    ``>= '2019-10-01'`` and ``< '2019-11-01'``. The comparison is made against
    those string literals directly; no datetime conversion happens here.

    Args:
        df: Trip DataFrame with ``lpep_pickup_datetime`` and ``trip_distance``
            columns.

    Returns:
        dict with the keys ``'up_to_1'``, ``'between_1_3'``, ``'between_3_7'``,
        ``'between_7_10'`` and ``'over_10'`` (in that order), each mapped to the
        number of kept rows whose distance falls in that bracket. Every
        bracket excludes its lower bound and includes its upper bound.
    """
    pickup = df['lpep_pickup_datetime']
    in_october = (pickup >= '2019-10-01') & (pickup < '2019-11-01')
    distance = df[in_october]['trip_distance']

    # (label, exclusive lower bound, inclusive upper bound); None = unbounded.
    brackets = (
        ('up_to_1', None, 1),
        ('between_1_3', 1, 3),
        ('between_3_7', 3, 7),
        ('between_7_10', 7, 10),
        ('over_10', 10, None),
    )

    counts = {}
    for label, lower, upper in brackets:
        if lower is None:
            in_bracket = distance <= upper
        elif upper is None:
            in_bracket = distance > lower
        else:
            in_bracket = (distance > lower) & (distance <= upper)
        counts[label] = in_bracket.sum()
    return counts

def question4_longest_trip(df):
    """Return the pickup date of the day that contains the longest single trip.

    The per-day maximum of ``trip_distance`` is computed and the day with the
    largest maximum is returned.

    Args:
        df: Trip DataFrame with ``lpep_pickup_datetime`` (anything
            ``pd.to_datetime`` can parse) and ``trip_distance`` columns. The
            frame is not modified.

    Returns:
        datetime.date: the pickup date with the largest ``trip_distance``.

    Raises:
        ValueError: raised by ``idxmax`` when there are no rows to rank.
    """
    pickup_date = pd.to_datetime(df['lpep_pickup_datetime']).dt.date
    # Group by the derived Series instead of storing it as a 'pickup_date'
    # column on df: the caller's frame is shared with the other question
    # functions and should not silently gain columns.
    longest_per_day = df['trip_distance'].groupby(pickup_date).max()
    return longest_per_day.idxmax()

def question5_top_pickup_zones(df, zones_df, pickup_date='2019-10-18', min_total=13000):
    """Total ``total_amount`` per pickup zone for one day, keeping the large ones.

    Args:
        df: Trip DataFrame with ``lpep_pickup_datetime``, ``PULocationID`` and
            ``total_amount`` columns.
        zones_df: Zone lookup DataFrame with ``LocationID`` and ``Zone`` columns.
        pickup_date: Calendar day to analyse; anything ``pd.to_datetime``
            accepts. Defaults to the day the homework asks about.
        min_total: Zones are kept only when their summed ``total_amount`` is
            strictly greater than this value.

    Returns:
        pandas.Series indexed by ``Zone`` holding the summed ``total_amount``,
        sorted from largest to smallest. Trips whose ``PULocationID`` has no
        match in ``zones_df`` do not contribute (the merge is an inner join).
    """
    target_day = pd.to_datetime(pickup_date).date()
    pickup_day = pd.to_datetime(df['lpep_pickup_datetime']).dt.date
    day_trips = df[pickup_day == target_day]

    with_zone = day_trips.merge(zones_df, left_on='PULocationID', right_on='LocationID')
    zone_totals = with_zone.groupby('Zone')['total_amount'].sum()
    return zone_totals[zone_totals > min_total].sort_values(ascending=False)

def question6_largest_tip(df, zones_df, pickup_zone='East Harlem North', month='2019-10'):
    """Find the drop-off zone of the trip with the largest tip from one pickup zone.

    Args:
        df: Trip DataFrame with ``lpep_pickup_datetime``, ``PULocationID``,
            ``DOLocationID`` and ``tip_amount`` columns.
        zones_df: Zone lookup DataFrame with ``LocationID`` and ``Zone`` columns.
        pickup_zone: Name of the pickup zone to restrict to. Defaults to the
            zone the homework asks about.
        month: Calendar month of the pickup, as a ``'YYYY-MM'`` string.
            Defaults to the month the homework asks about.

    Returns:
        The ``Zone`` value of the drop-off location for the single trip with
        the highest ``tip_amount`` among the selected trips.

    Raises:
        ValueError: if no trip in ``month`` was picked up in ``pickup_zone``.
    """
    pickup_month = pd.to_datetime(df['lpep_pickup_datetime']).dt.to_period('M')
    month_trips = df[pickup_month == pd.Period(month, freq='M')]

    # zones_df is joined twice: once for the pickup location, once for the
    # drop-off location. The columns it contributes collide on the second
    # join, so the suffixes tell them apart (Zone_pickup / Zone_dropoff).
    with_zones = month_trips.merge(
        zones_df,
        left_on='PULocationID',
        right_on='LocationID'
    ).merge(
        zones_df,
        left_on='DOLocationID',
        right_on='LocationID',
        suffixes=('_pickup', '_dropoff')
    )

    from_zone = with_zones[with_zones['Zone_pickup'] == pickup_zone]
    if from_zone.empty:
        # idxmax on an empty selection also raises ValueError, but with a
        # message that says nothing about which filter matched no rows.
        raise ValueError(f"No trips picked up in {pickup_zone!r} during {month}")

    best_row = from_zone['tip_amount'].idxmax()
    return from_zone.loc[best_row, 'Zone_dropoff']

def main():
    """Run the whole homework: fetch the inputs, load them, and print each answer.

    Steps, in order: ensure the CSV inputs exist locally, read both into
    DataFrames, then call the question 3-6 helpers and print their results.

    Returns:
        None. All results are written to stdout.
    """
    download_and_prepare_data()

    print("Loading taxi trip data...")
    # The recorded run of this cell shows a warning raised from this read_csv
    # call - presumably pandas' mixed-dtype warning from chunked type
    # inference (TODO confirm). low_memory=False makes pandas infer each
    # column's dtype from the whole file instead of chunk by chunk.
    df = pd.read_csv('green_tripdata_2019-10.csv', low_memory=False)
    print("Loading zone lookup data...")
    zones_df = pd.read_csv('taxi_zone_lookup.csv')

    print("\nQuestion 3 - Trip Segments:")
    segments = question3_trip_segments(df)
    print(f"Up to 1 mile: {segments['up_to_1']}")
    print(f"1-3 miles: {segments['between_1_3']}")
    print(f"3-7 miles: {segments['between_3_7']}")
    print(f"7-10 miles: {segments['between_7_10']}")
    print(f"Over 10 miles: {segments['over_10']}")

    print("\nQuestion 4 - Longest Trip Day:")
    longest_day = question4_longest_trip(df)
    print(f"Date with longest trip: {longest_day}")

    print("\nQuestion 5 - Top Pickup Zones:")
    top_zones = question5_top_pickup_zones(df, zones_df)
    print("Zones with total_amount > $13,000:")
    print(top_zones)

    print("\nQuestion 6 - Largest Tip:")
    max_tip_zone = question6_largest_tip(df, zones_df)
    print(f"Zone with largest tip from East Harlem North: {max_tip_zone}")

# Entry point: run the full pipeline only when this code is executed directly,
# not when it is imported from another module.
if __name__ == "__main__":
    main()

Loading taxi trip data...


  df = pd.read_csv('green_tripdata_2019-10.csv')


Loading zone lookup data...

Question 3 - Trip Segments:
Up to 1 mile: 104830
1-3 miles: 198995
3-7 miles: 109642
7-10 miles: 27686
Over 10 miles: 35201

Question 4 - Longest Trip Day:
Date with longest trip: 2019-10-31

Question 5 - Top Pickup Zones:
Zones with total_amount > $13,000:
Zone
East Harlem North      18686.68
East Harlem South      16797.26
Morningside Heights    13029.79
Name: total_amount, dtype: float64

Question 6 - Largest Tip:
Zone with largest tip from East Harlem North: JFK Airport
