In [1]:
pip install dlt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import requests

# Endpoint queried by paginated_getter(); the page to fetch is sent as a
# `page` query parameter on every request.
BASE_API_URL = "https://us-central1-dlthub-analytics.cloudfunctions.net/data_engineering_zoomcamp_api"

# I call this a paginated getter
# as it's a function that gets data
# and also paginates until there is no more data
# by yielding pages, we "microbatch", which speeds up downstream processing

def paginated_getter(timeout=30):
    """Yield the API's records one page at a time until an empty page arrives.

    Pages are requested from ``BASE_API_URL`` with an incrementing ``page``
    query parameter, starting at 1. Every non-empty JSON payload is yielded
    unchanged, so consumers can work on page-sized batches instead of waiting
    for the complete data set. A progress line is printed for every request.

    Args:
        timeout: Seconds handed to ``requests.get`` as its ``timeout``. Without
            a timeout an unresponsive server would block the generator forever.

    Yields:
        The decoded JSON body of each non-empty page.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    page_number = 1

    while True:
        # Ask for the current page; the page number is the only query parameter.
        response = requests.get(
            BASE_API_URL,
            params={'page': page_number},
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an HTTPError for bad responses
        page_json = response.json()
        print(f'got page number {page_number} with {len(page_json)} records')

        # An empty (falsy) payload marks the end of the data.
        if not page_json:
            break

        yield page_json
        page_number += 1

if __name__ == '__main__':
    # Drain the paginated generator, echoing every page as soon as it arrives.
    for page in paginated_getter():
        print(page)

got page number 1 with 1000 records
[{'End_Lat': 40.742963, 'End_Lon': -73.980072, 'Fare_Amt': 45.0, 'Passenger_Count': 1, 'Payment_Type': 'Credit', 'Rate_Code': None, 'Start_Lat': 40.641525, 'Start_Lon': -73.787442, 'Tip_Amt': 9.0, 'Tolls_Amt': 4.15, 'Total_Amt': 58.15, 'Trip_Distance': 17.52, 'Trip_Dropoff_DateTime': '2009-06-14 23:48:00', 'Trip_Pickup_DateTime': '2009-06-14 23:23:00', 'mta_tax': None, 'store_and_forward': None, 'surcharge': 0.0, 'vendor_name': 'VTS'}, {'End_Lat': 40.740187, 'End_Lon': -74.005698, 'Fare_Amt': 6.5, 'Passenger_Count': 1, 'Payment_Type': 'Credit', 'Rate_Code': None, 'Start_Lat': 40.722065, 'Start_Lon': -74.009767, 'Tip_Amt': 1.0, 'Tolls_Amt': 0.0, 'Total_Amt': 8.5, 'Trip_Distance': 1.56, 'Trip_Dropoff_DateTime': '2009-06-18 17:43:00', 'Trip_Pickup_DateTime': '2009-06-18 17:35:00', 'mta_tax': None, 'store_and_forward': None, 'surcharge': 1.0, 'vendor_name': 'VTS'}, {'End_Lat': 40.718043, 'End_Lon': -74.004745, 'Fare_Amt': 12.5, 'Passenger_Count': 5, 'Pay

In [3]:
import requests
import json

# Location of the JSON Lines file that is passed to download_and_read_jsonl() below.
url = "https://storage.googleapis.com/dtc_zoomcamp_api/yellow_tripdata_2009-06.jsonl"

def download_and_read_jsonl(url, timeout=30):
    """Download a JSON Lines document and return all of its records as a list.

    The whole response body is held in memory before it is parsed.

    Args:
        url: Address of the JSONL resource to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so an
            unresponsive server cannot block the call forever.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an HTTPError for bad responses

    # JSONL records are separated by "\n" only. str.splitlines() would also
    # break on characters such as U+2028, which may appear unescaped inside a
    # JSON string, and would therefore cut such a record in half.
    raw_lines = response.text.split("\n")

    # Skip blank lines (including the empty piece after a trailing newline);
    # json.loads itself tolerates a leftover "\r" from "\r\n" line endings.
    return [json.loads(line) for line in raw_lines if line.strip()]
   

downloaded_data = download_and_read_jsonl(url)

# Show a five-record preview, but only when the download produced any records.
if len(downloaded_data) > 0:
    preview = downloaded_data[:5]
    print(preview)


[{'vendor_name': 'VTS', 'Trip_Pickup_DateTime': '2009-06-14 23:23:00', 'Trip_Dropoff_DateTime': '2009-06-14 23:48:00', 'Passenger_Count': 1, 'Trip_Distance': 17.52, 'Start_Lon': -73.787442, 'Start_Lat': 40.641525, 'Rate_Code': None, 'store_and_forward': None, 'End_Lon': -73.980072, 'End_Lat': 40.742963, 'Payment_Type': 'Credit', 'Fare_Amt': 45.0, 'surcharge': 0.0, 'mta_tax': None, 'Tip_Amt': 9.0, 'Tolls_Amt': 4.15, 'Total_Amt': 58.15}, {'vendor_name': 'VTS', 'Trip_Pickup_DateTime': '2009-06-18 17:35:00', 'Trip_Dropoff_DateTime': '2009-06-18 17:43:00', 'Passenger_Count': 1, 'Trip_Distance': 1.56, 'Start_Lon': -74.009767, 'Start_Lat': 40.722065, 'Rate_Code': None, 'store_and_forward': None, 'End_Lon': -74.005698, 'End_Lat': 40.740187, 'Payment_Type': 'Credit', 'Fare_Amt': 6.5, 'surcharge': 1.0, 'mta_tax': None, 'Tip_Amt': 1.0, 'Tolls_Amt': 0.0, 'Total_Amt': 8.5}, {'vendor_name': 'VTS', 'Trip_Pickup_DateTime': '2009-06-10 18:08:00', 'Trip_Dropoff_DateTime': '2009-06-10 18:27:00', 'Passeng

In [4]:
import requests
import json

def download_and_yield_rows(url, timeout=30):
    """Stream a JSON Lines resource and yield its records one at a time.

    The response is consumed line by line, so only the current record has to
    be held in memory.

    Args:
        url: Address of the JSONL resource to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so an
            unresponsive server cannot block the generator forever.

    Yields:
        The decoded JSON value of each non-empty line, in file order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    # The context manager closes the streamed response even when the consumer
    # stops iterating early or a line fails to parse.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()  # Raise an HTTPError for bad responses

        for line in response.iter_lines():
            if line:  # ignore empty lines
                yield json.loads(line)

# Point this at the JSONL file you want to read.
url = "https://storage.googleapis.com/dtc_zoomcamp_api/yellow_tripdata_2009-06.jsonl"

# Rows are pulled from the generator one at a time and echoed as they arrive,
# so the whole file is never collected into a list here.
row_stream = download_and_yield_rows(url)
for record in row_stream:
    print(record)

{'vendor_name': 'VTS', 'Trip_Pickup_DateTime': '2009-06-14 23:23:00', 'Trip_Dropoff_DateTime': '2009-06-14 23:48:00', 'Passenger_Count': 1, 'Trip_Distance': 17.52, 'Start_Lon': -73.787442, 'Start_Lat': 40.641525, 'Rate_Code': None, 'store_and_forward': None, 'End_Lon': -73.980072, 'End_Lat': 40.742963, 'Payment_Type': 'Credit', 'Fare_Amt': 45.0, 'surcharge': 0.0, 'mta_tax': None, 'Tip_Amt': 9.0, 'Tolls_Amt': 4.15, 'Total_Amt': 58.15}
{'vendor_name': 'VTS', 'Trip_Pickup_DateTime': '2009-06-18 17:35:00', 'Trip_Dropoff_DateTime': '2009-06-18 17:43:00', 'Passenger_Count': 1, 'Trip_Distance': 1.56, 'Start_Lon': -74.009767, 'Start_Lat': 40.722065, 'Rate_Code': None, 'store_and_forward': None, 'End_Lon': -74.005698, 'End_Lat': 40.740187, 'Payment_Type': 'Credit', 'Fare_Amt': 6.5, 'surcharge': 1.0, 'mta_tax': None, 'Tip_Amt': 1.0, 'Tolls_Amt': 0.0, 'Total_Amt': 8.5}
{'vendor_name': 'VTS', 'Trip_Pickup_DateTime': '2009-06-10 18:08:00', 'Trip_Dropoff_DateTime': '2009-06-10 18:27:00', 'Passenger_

                                                                    ASSIGNMENT





In [12]:
# Question 1

def square_root_generator(limit):
    """Lazily produce the square roots of 1, 2, 3, ... up to and including ``limit``.

    Nothing is yielded when ``limit`` is below 1.
    """
    current = 1
    while True:
        # Stop as soon as the counter has moved past the limit.
        if not current <= limit:
            return
        yield current ** 0.5
        current += 1

# Demo: stream the square roots of 1..5, printing one value per line.
limit = 5
generator = square_root_generator(limit)

for root in generator:
    print(root)

1.0
1.4142135623730951
1.7320508075688772
2.0
2.23606797749979


In [13]:
# Question 1

def square_root_generator(limit):
    """Yield sqrt(1), sqrt(2), ... in order, stopping after sqrt(``limit``).

    The generator is empty when ``limit`` is smaller than 1.
    """
    number = 0
    # `number` trails the value about to be emitted by one.
    while number + 1 <= limit:
        number += 1
        yield number ** 0.5

# Example usage: build a fresh generator for the first five square roots.
limit = 5
generator = square_root_generator(limit)

# Summing up the outputs. sum() consumes the generator, so `generator` is
# exhausted after this line and cannot be iterated a second time.
# NOTE(review): keep the built-in sum() here. On Python 3.12+ its float result
# may differ in the last digit from a hand-written `+=` loop; confirm before changing.
sum_of_outputs = sum(generator)

print("Sum of the outputs:", sum_of_outputs)



Sum of the outputs: 8.382332347441762


In [14]:
# Question 2

def square_root_generator(limit):
    """Generate the square root of every whole number from 1 through ``limit``.

    Values are computed on demand; a ``limit`` below 1 yields nothing.
    """
    value = 1
    while value <= limit:
        root = pow(value, 0.5)
        yield root
        value = value + 1

# Build a generator that can produce thirteen values.
limit = 13
generator = square_root_generator(limit)

# Pull exactly thirteen values with next(); the last one pulled is the answer.
first_thirteen = [next(generator) for _ in range(13)]
thirteenth_number = first_thirteen[-1]

print("The 13th number yielded by the generator:", thirteenth_number)


The 13th number yielded by the generator: 3.605551275463989


In [15]:
# Question 3

def people_1():
    """Yield five person records (IDs 1-5, ages 26-30), all in City_A."""
    yield from (
        {"ID": i, "Name": f"Person_{i}", "Age": 25 + i, "City": "City_A"}
        for i in range(1, 6)
    )

# Echo every record produced by the first generator, one per line.
for record in people_1():
    print(record)

{'ID': 1, 'Name': 'Person_1', 'Age': 26, 'City': 'City_A'}
{'ID': 2, 'Name': 'Person_2', 'Age': 27, 'City': 'City_A'}
{'ID': 3, 'Name': 'Person_3', 'Age': 28, 'City': 'City_A'}
{'ID': 4, 'Name': 'Person_4', 'Age': 29, 'City': 'City_A'}
{'ID': 5, 'Name': 'Person_5', 'Age': 30, 'City': 'City_A'}


In [16]:
# Question 3
def people_1():
    """Produce the City_A records one by one: IDs 1 through 5, age = 25 + ID."""
    i = 1
    while i < 6:
        yield {"ID": i, "Name": f"Person_{i}", "Age": 25 + i, "City": "City_A"}
        i += 1

# Add up the "Age" field of every record the generator produces.
sum_of_ages = sum(person["Age"] for person in people_1())

print("Sum of ages of all people:", sum_of_ages)


Sum of ages of all people: 140


In [17]:
# Question 3

def people_1():
    """Yield the first data set: five City_A people with IDs 1-5 and ages 26-30."""
    for i in range(1, 6):
        record = {"ID": i, "Name": f"Person_{i}"}
        record["Age"] = 25 + i
        record["City"] = "City_A"
        yield record

def people_2():
    """Yield the second data set: six City_B people with IDs 3-8, ages 33-38, and an occupation."""
    yield from (
        {"ID": i, "Name": f"Person_{i}", "Age": 30 + i, "City": "City_B", "Occupation": f"Job_{i}"}
        for i in range(3, 9)
    )

# Concatenate both generators into one list: every people_1 record first,
# then every people_2 record. Records sharing an ID are all kept.
all_people = [*people_1(), *people_2()]

# Total the ages across the combined list.
sum_of_ages = 0
for person in all_people:
    sum_of_ages += person["Age"]

print("Sum of ages of all people:", sum_of_ages)


Sum of ages of all people: 353


In [18]:
# Question 4


def people_1():
    """Generate the City_A people: one record per ID from 1 to 5, aged 25 + ID."""
    ids = range(1, 6)
    yield from (
        {"ID": i, "Name": f"Person_{i}", "Age": 25 + i, "City": "City_A"}
        for i in ids
    )

def people_2():
    """Generate the City_B people: one record per ID from 3 to 8, aged 30 + ID, each with a job."""
    i = 3
    while i < 9:
        yield {"ID": i, "Name": f"Person_{i}", "Age": 30 + i, "City": "City_B", "Occupation": f"Job_{i}"}
        i += 1

# Build an ID-keyed table from both sources. Records are inserted in generator
# order, so a people_2 record replaces the people_1 record with the same ID.
records = {
    person["ID"]: person
    for source in (people_1(), people_2())
    for person in source
}

# Total the ages of the records that remain after the overwrite.
sum_of_ages = 0
for person in records.values():
    sum_of_ages += person["Age"]

print("Sum of ages of all people loaded:", sum_of_ages)


Sum of ages of all people loaded: 266
