In [1]:
from pathlib import Path
import requests

def download_one_file_of_raw_data(year: int, month: int) -> Path:
    URL = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month:02d}.parquet'
    response = requests.get(URL)

    if response.status_code == 200:
        path = f'../data/raw/rides_{year}-{month:02d}.parquet'
        open(path, "wb").write(response.content)
        return path
    else:
        raise Exception(f'{URL} is not available')

In [2]:
download_one_file_of_raw_data(year=2022, month=1)

'../data/raw/rides_2022-01.parquet'

In [1]:
import pandas as pd

rides = pd.read_parquet('../data/raw/rides_2022-01.parquet')

rides.head(20)

ModuleNotFoundError: No module named 'pandas'

In [10]:
# Only need two columns 

rides = rides[['tpep_pickup_datetime', 'PULocationID']]

In [11]:
# rename columns 

rides.rename(columns={
    'tpep_pickup_datetime': 'pickup_datetime',
    'PULocationID': 'pickup_location_id'
},inplace=True)

In [12]:
rides.head(20)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68
5,2022-01-01 00:40:15,138
6,2022-01-01 00:20:50,233
7,2022-01-01 00:13:04,238
8,2022-01-01 00:30:02,166
9,2022-01-01 00:48:52,236


In [13]:
# Validate data

rides['pickup_datetime'].describe()

  rides['pickup_datetime'].describe()


count                 2463931
unique                1423522
top       2022-01-26 07:57:00
freq                       12
first     2008-12-31 22:23:09
last      2022-05-18 20:41:57
Name: pickup_datetime, dtype: object

In [15]:
# Clean data because this dataset is supposed to be only 01/2022
# Keep days after 01/01/2022 and before 02/01/2022

rides = rides[rides.pickup_datetime >='2022-01-01']
rides = rides[rides.pickup_datetime <'2022-02-01']
rides['pickup_datetime'].describe()

  rides['pickup_datetime'].describe()


count                 2463879
unique                1423471
top       2022-01-26 07:57:00
freq                       12
first     2022-01-01 00:00:08
last      2022-01-31 23:59:58
Name: pickup_datetime, dtype: object

In [16]:
# store data

rides.to_parquet('../data/transformed/validated_rides_2022_01.parquet')