In [None]:
# Yellow taxi data for Feb. 2019 and March 2019

In [3]:
import pandas as pd
import json

In [2]:
color="yellow"
year = 2019
month = 2
dataset_file = f"{color}_tripdata_{year}-{month:02}"
    
dataset_url = f"https://github.com/DataTalksClub/nyc-tlc-data/releases/download/{color}/{dataset_file}.csv.gz"

In [3]:
head = pd.read_csv(dataset_url, nrows=1)

In [6]:
head.dtypes.to_dict()

{'VendorID': dtype('int64'),
 'tpep_pickup_datetime': dtype('O'),
 'tpep_dropoff_datetime': dtype('O'),
 'passenger_count': dtype('int64'),
 'trip_distance': dtype('float64'),
 'RatecodeID': dtype('int64'),
 'store_and_fwd_flag': dtype('O'),
 'PULocationID': dtype('int64'),
 'DOLocationID': dtype('int64'),
 'payment_type': dtype('int64'),
 'fare_amount': dtype('int64'),
 'extra': dtype('float64'),
 'mta_tax': dtype('float64'),
 'tip_amount': dtype('int64'),
 'tolls_amount': dtype('int64'),
 'improvement_surcharge': dtype('float64'),
 'total_amount': dtype('float64'),
 'congestion_surcharge': dtype('int64')}

In [None]:
YELLOW_SCHEMA = {
    "VendorID": int,
    "tpep_pickup_datetime": str,
    "tpep_dropoff_datetime": str,
    "passenger_count": int,
    "trip_distance": float,
    "RatecodeID": int,
    "store_and_fwd_flag": str,
    "PULocationID": float,
    "DOLocationID": float,
    "payment_type": int,
    "fare_amount": float,
    "extra": float,
    "mta_tax": float,
    "tip_amount": float,
    "tolls_amount": float,
    "improvement_surcharge": float,
    "total_amount": float,
    "congestion_surcharge": float
}

YELLOW_DATE_COLS = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

In [7]:
import json

In [4]:
with open("taxi_schemas.json", "r") as f:
    data = json.load(f)

In [7]:
yellow = data["yellow"]
green = data["green"]

In [8]:
[i for i in yellow.keys() if i not in green.keys()]

['tpep_pickup_datetime', 'tpep_dropoff_datetime']

In [10]:
[i for i in green.keys() if i not in yellow.keys()]

['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'ehail_fee', 'trip_type']

In [14]:
combined = list(green.keys())
combined.extend(list(yellow.keys()))
combined = list(set(combined))

In [21]:
combined_schema = dict()
for col in combined:
    green_type = None
    yellow_type = None
    green_only=False
    yellow_only=False
    
    if col in green.keys():
        green_type = green[col]
    else:
        #print(f"{col} not in green")
        yellow_only = True
    
    if col in yellow.keys():
        yellow_type = yellow[col]
    else:
        #print(f"{col} not in yellow")
        green_only = True
    
    if yellow_only:
        combined_schema[col] = yellow_type
    if green_only:
        combined_schema[col] = green_type
    
    if (green_type is not None) and (yellow_type is not None):
        if green_type != yellow_type:
            print(f"{col}: green: {green_type}; yellow: {yellow_type}")
        else:
            combined_schema[col] = yellow_type 

payment_type: green: str; yellow: int
passenger_count: green: float; yellow: int
PULocationID: green: str; yellow: float
VendorID: green: float; yellow: int
RatecodeID: green: str; yellow: int
DOLocationID: green: str; yellow: float


In [26]:
combined_schema["color"] = str
combined_schema["payment_type"] = int
combined_schema["passenger_count"] = int
combined_schema["PULocationID"] = str
combined_schema["VendorID"] = int
combined_schema["RatecodeID"] = int
combined_schema["DOLocationID"] = int

combined_schema["pep_pickup_datetime"] = "datetime"
combined_schema["pep_dropoff_datetime"] = "datetime"

In [24]:
combined_schema.pop("lpep_dropoff_datetime")
combined_schema.pop("lpep_pickup_datetime")
combined_schema.pop("tpep_dropoff_datetime")
combined_schema.pop("tpep_pickup_datetime")

'datetime'

In [27]:
combined_schema

{'fare_amount': 'float',
 'trip_type': 'str',
 'store_and_fwd_flag': 'str',
 'tolls_amount': 'float',
 'trip_distance': 'float',
 'congestion_surcharge': 'float',
 'mta_tax': 'float',
 'extra': 'float',
 'ehail_fee': 'float',
 'improvement_surcharge': 'float',
 'tip_amount': 'float',
 'total_amount': 'float',
 'color': str,
 'payment_type': int,
 'passenger_count': int,
 'PULocationID': str,
 'VendorID': int,
 'RatecodeID': int,
 'DOLocationID': int,
 'pep_pickup_datetime': 'datetime',
 'pep_dropoff_datetime': 'datetime'}

In [28]:

for k in green.keys():
    if k in combined_schema.keys():
        green[k] = combined_schema[k]

In [29]:
for k in yellow.keys():
    if k in combined_schema.keys():
        yellow[k] = combined_schema[k]

In [31]:
green

{'VendorID': int,
 'lpep_pickup_datetime': 'datetime',
 'lpep_dropoff_datetime': 'datetime',
 'store_and_fwd_flag': 'str',
 'RatecodeID': int,
 'PULocationID': str,
 'DOLocationID': int,
 'passenger_count': int,
 'trip_distance': 'float',
 'fare_amount': 'float',
 'extra': 'float',
 'mta_tax': 'float',
 'tip_amount': 'float',
 'tolls_amount': 'float',
 'ehail_fee': 'float',
 'improvement_surcharge': 'float',
 'total_amount': 'float',
 'payment_type': int,
 'trip_type': 'str',
 'congestion_surcharge': 'float'}

In [32]:
combined_schema

{'fare_amount': 'float',
 'trip_type': 'str',
 'store_and_fwd_flag': 'str',
 'tolls_amount': 'float',
 'trip_distance': 'float',
 'congestion_surcharge': 'float',
 'mta_tax': 'float',
 'extra': 'float',
 'ehail_fee': 'float',
 'improvement_surcharge': 'float',
 'tip_amount': 'float',
 'total_amount': 'float',
 'color': str,
 'payment_type': int,
 'passenger_count': int,
 'PULocationID': str,
 'VendorID': int,
 'RatecodeID': int,
 'DOLocationID': int,
 'pep_pickup_datetime': 'datetime',
 'pep_dropoff_datetime': 'datetime'}