# NORMALIZE AND CLEAN DATA

In [1]:
import pandas as pd
import sys
import os

# Add project root to Python path to allow module imports
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom utility functions for data processing
from src.utils.normalizing import normalize          # Standardize or scale data
from src.utils.qa_rules import run_quality_check, summarize_qa_flags  # Apply and summarize QA rules
from src.utils.cleaning import clean                 # Perform data cleaning

## Normalize Data of a month using built functions
**Objective:** Load the raw January data and apply the `normalize` function (from `src.utils.normalizing`).
* **Enrich data:** Add `PU/DO_Borough`, `payment_type_name`, etc.
* **Feature Engineering:** Create derived columns like `trip_duration`, `avg_speed`, and `pickup_day_of_week`.
* **Feature Selection:** Drop irrelevant columns identified in Notebook 1, specifically 'airport_fee' since it has only 5 nonnull values; 'store_and_fwd_flag', 'VendorID' because they are irrelevant to analysis; 'mta_tax','improvement_surcharge' due to their low variance.

In [2]:
df1 = pd.read_parquet("../raw/yellow_tripdata_2021-01.parquet")
df1_normalized = normalize(df1)
print("Successfully normalized January data")

Successfully normalized January data


In [3]:
print("First 10 rows of data before normalized: ")
df1.head(10)

First 10 rows of data before normalized: 


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,
5,1,2021-01-01 00:16:29,2021-01-01 00:24:30,1.0,1.6,1.0,N,224,68,1,8.0,3.0,0.5,2.35,0.0,0.3,14.15,2.5,
6,1,2021-01-01 00:00:28,2021-01-01 00:17:28,1.0,4.1,1.0,N,95,157,2,16.0,0.5,0.5,0.0,0.0,0.3,17.3,0.0,
7,1,2021-01-01 00:12:29,2021-01-01 00:30:34,1.0,5.7,1.0,N,90,40,2,18.0,3.0,0.5,0.0,0.0,0.3,21.8,2.5,
8,1,2021-01-01 00:39:16,2021-01-01 01:00:13,1.0,9.1,1.0,N,97,129,4,27.5,0.5,0.5,0.0,0.0,0.3,28.8,0.0,
9,1,2021-01-01 00:26:12,2021-01-01 00:39:46,2.0,2.7,1.0,N,263,142,1,12.0,3.0,0.5,3.15,0.0,0.3,18.95,2.5,


In [4]:
print("First 10 rows of data after normalized with new columns at the end: ")
df1_normalized.head(10)

First 10 rows of data after normalized with new columns at the end: 


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,...,PU_Borough,PU_Zone,DO_Borough,DO_Zone,trip_duration_seconds,trip_duration_minutes,avg_speed_mph,pickup_day_of_week,is_weekend,computed_total_amount
0,2021-01-01 00:30:10-05:00,2021-01-01 00:36:12-05:00,1.0,2.1,1.0,142,43,2,8.0,3.0,...,Manhattan,Lincoln Square East,Manhattan,Central Park,362.0,6.0,20.88,Friday,False,14.3
1,2021-01-01 00:51:20-05:00,2021-01-01 00:52:19-05:00,1.0,0.2,1.0,238,151,2,3.0,0.5,...,Manhattan,Upper West Side North,Manhattan,Manhattan Valley,59.0,1.0,12.2,Friday,False,4.3
2,2021-01-01 00:43:30-05:00,2021-01-01 01:11:06-05:00,1.0,14.7,1.0,132,165,1,42.0,0.5,...,Queens,JFK Airport,Brooklyn,Midwood,1656.0,28.0,31.96,Friday,False,51.95
3,2021-01-01 00:15:48-05:00,2021-01-01 00:31:01-05:00,0.0,10.6,1.0,138,132,1,29.0,0.5,...,Queens,LaGuardia Airport,Queens,JFK Airport,913.0,15.0,41.8,Friday,False,36.35
4,2021-01-01 00:31:49-05:00,2021-01-01 00:48:21-05:00,1.0,4.94,1.0,68,33,1,16.5,0.5,...,Manhattan,East Chelsea,Brooklyn,Brooklyn Heights,992.0,17.0,17.93,Friday,False,24.36
5,2021-01-01 00:16:29-05:00,2021-01-01 00:24:30-05:00,1.0,1.6,1.0,224,68,1,8.0,3.0,...,Manhattan,Stuy Town/Peter Cooper Village,Manhattan,East Chelsea,481.0,8.0,11.98,Friday,False,16.65
6,2021-01-01 00:00:28-05:00,2021-01-01 00:17:28-05:00,1.0,4.1,1.0,95,157,2,16.0,0.5,...,Queens,Forest Hills,Queens,Maspeth,1020.0,17.0,14.47,Friday,False,17.3
7,2021-01-01 00:12:29-05:00,2021-01-01 00:30:34-05:00,1.0,5.7,1.0,90,40,2,18.0,3.0,...,Manhattan,Flatiron,Brooklyn,Carroll Gardens,1085.0,18.0,18.91,Friday,False,24.3
8,2021-01-01 00:39:16-05:00,2021-01-01 01:00:13-05:00,1.0,9.1,1.0,97,129,4,27.5,0.5,...,Brooklyn,Fort Greene,Queens,Jackson Heights,1257.0,21.0,26.06,Friday,False,28.8
9,2021-01-01 00:26:12-05:00,2021-01-01 00:39:46-05:00,2.0,2.7,1.0,263,142,1,12.0,3.0,...,Manhattan,Yorkville West,Manhattan,Lincoln Square East,814.0,14.0,11.94,Friday,False,21.45


In [5]:
print("Data after normalized info: ")
df1_normalized.info()

Data after normalized info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1369769 entries, 0 to 1369768
Data columns (total 26 columns):
 #   Column                 Non-Null Count    Dtype                           
---  ------                 --------------    -----                           
 0   tpep_pickup_datetime   1369769 non-null  datetime64[ns, America/New_York]
 1   tpep_dropoff_datetime  1369769 non-null  datetime64[ns, America/New_York]
 2   passenger_count        1271417 non-null  float64                         
 3   trip_distance          1369769 non-null  float64                         
 4   RatecodeID             1271417 non-null  float64                         
 5   PULocationID           1369769 non-null  int64                           
 6   DOLocationID           1369769 non-null  int64                           
 7   payment_type           1369769 non-null  int64                           
 8   fare_amount            1369769 non-null  float64               

## Applying QA steps 
**Objective:** Apply the `run_quality_check` function (from `src.utils.qa_rules`) to the normalized data.
* This will return `df1_flag`, a DataFrame containing 11 boolean flag columns based on 11 rules.
* Then, use `summarize_qa_flags` to generate the summary string ("count/pct%") for the report.

In [6]:
df1_flag = run_quality_check(df1_normalized, 1)
january = summarize_qa_flags(df1_flag)
print("Successfully run quality check!")

Successfully run quality check!


In [7]:
print("First 10 rows of January's flag: ")
df1_flag.head(10)

First 10 rows of January's flag: 


Unnamed: 0,is_duplicate,missing_datetime,invalid_time_order,invalid_month,invalid_duration,invalid_distance,invalid_speed,suspicious_zero_fare,short_duration_long_distance,excessive_speed,...,invalid_tip_amount,invalid_extra,invalid_tolls_amount,invalid_total_amount,fare_total_mismatch,invalid_payment_type,invalid_ratecode,unusual_passenger_count,invalid_zone,total_violations
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,1
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1


In [8]:
print("Summary of January's flag, 0-10 indicates rule ID, row 22 is the sum of trips that has violations: ")
january

Summary of January's flag, 0-10 indicates rule ID, row 22 is the sum of trips that has violations: 


(0             0/0.0%
 1             0/0.0%
 2         5642/0.41%
 3            24/0.0%
 4        16119/1.18%
 5        19952/1.46%
 6        24893/1.82%
 7          300/0.02%
 8         6338/0.46%
 9         1745/0.13%
 10            5/0.0%
 11        7411/0.54%
 12           59/0.0%
 13        2292/0.17%
 14         139/0.01%
 15        7114/0.52%
 16     387093/28.26%
 17            0/0.0%
 18       98352/7.18%
 19        52095/3.8%
 20       27137/1.98%
 21    528620/38.592%
 dtype: object,
 2)

## Clean Data of a month using built function
**Objective:** Apply the `clean` function (from `src.utils.cleaning`).
* This function will take `df1_normalized` and `df1_flag` as input.
* It will filter and remove rows that violate the rules according to our defined strategy.

In [9]:
df1_cleaned, df1_standard = clean(df1_normalized, df1_flag)
print("Successfully cleaned data!")
print("Cleaned data has shape: ", df1_cleaned.shape)

Successfully cleaned data!
Cleaned data has shape:  (1337724, 26)


In [10]:
print("Data after cleaned info: ")
df1_cleaned.info()

Data after cleaned info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337724 entries, 0 to 1369768
Data columns (total 26 columns):
 #   Column                 Non-Null Count    Dtype                           
---  ------                 --------------    -----                           
 0   tpep_pickup_datetime   1337724 non-null  datetime64[ns, America/New_York]
 1   tpep_dropoff_datetime  1337724 non-null  datetime64[ns, America/New_York]
 2   passenger_count        1247243 non-null  float64                         
 3   trip_distance          1337724 non-null  float64                         
 4   RatecodeID             1247243 non-null  float64                         
 5   PULocationID           1337724 non-null  int64                           
 6   DOLocationID           1337724 non-null  int64                           
 7   payment_type           1337724 non-null  int64                           
 8   fare_amount            1337724 non-null  float64                  

In [11]:
print("More information about cleaned data (numerical values):")
df1_cleaned.describe()

More information about cleaned data (numerical values):


Unnamed: 0,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,tip_amount,tolls_amount,total_amount,congestion_surcharge,trip_duration_seconds,trip_duration_minutes,avg_speed_mph,computed_total_amount
count,1247243.0,1337724.0,1247243.0,1337724.0,1337724.0,1337724.0,1337724.0,1337724.0,1337724.0,1337724.0,1337724.0,1247243.0,1337724.0,1337724.0,1337724.0,1337724.0
mean,1.415983,4.642777,1.021859,165.0282,161.7914,1.186402,11.96457,0.9883682,1.943221,0.2436578,17.41679,2.277203,845.4224,14.08998,18.87452,18.05787
std,1.064052,391.7903,0.3903941,67.5061,71.92654,0.5588281,12.458,1.233038,2.532931,1.509896,14.24043,0.7332955,3758.1,62.63579,1399.749,14.19914
min,0.0,0.01,1.0,1.0,1.0,0.0,-150.0,-2.75,0.0,0.0,-152.8,-2.5,31.0,1.0,0.01,-152.8
25%,1.0,1.02,1.0,125.0,107.0,1.0,6.0,0.0,0.0,0.0,10.8,2.5,341.0,6.0,9.6,11.44
50%,1.0,1.7,1.0,162.0,162.0,1.0,8.5,0.5,1.86,0.0,13.8,2.5,548.0,9.0,11.89,14.76
75%,1.0,3.05,1.0,236.0,236.0,1.0,13.0,2.5,2.75,0.0,18.96,2.5,877.0,15.0,15.29,19.8
max,8.0,263163.3,99.0,265.0,265.0,4.0,6960.5,8.25,1140.44,811.75,7661.28,2.5,1729062.0,28818.0,1054122.0,7661.28


## Clean Data of 12 months and save it to processed


In [12]:
from pathlib import Path

raw_dir = Path("../raw")
raw_files = list(raw_dir.glob("yellow_tripdata_2021-*.parquet"))

cleaned_dir = Path("../processed/cleaned_data")
flag_dir = Path("../processed/flags_for_analysis")

reports_dir = Path("../reports")

### Create quality assurance summary

In [13]:
id = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,'NaN']

rule = ['is_duplicate', 'missing_datetime', 'invalid_time_order', 'invalid_month',
        'invalid_duration', 'invalid_distance', 'invalid_speed', 'suspicious_zero_fare',
        'short_duration_long_distance', 'excessive_speed', 'excessive_duration',
        'invalid_fare_amount', 'invalid_tip_amount', 'invalid_extra', 'invalid_tolls_amount',
        'invalid_total_amount', 'fare_total_mismatch',
        'invalid_payment_type', 'invalid_ratecode', 'unusual_passenger_count', 'invalid_zone', 'NaN']


action = ['Exclude', 'Exclude', 'Exclude', 'Exclude', 'Exclude', 'Exclude', 'Exclude', 
          'Flag', 'Flag', 'Flag', 'Flag', 'Flag', 'Flag', 'Flag', 'Flag', 'Flag', 
          'Flag', 'Flag', 'Flag', 'Flag', 'Flag', 'Total:']

rule_definition = {'ID': id, 'Rule': rule, 'Action': action}
final_qa_report_df = pd.DataFrame(data = rule_definition, columns=['ID', 'Rule', 'Action'])
final_qa_report_df


Unnamed: 0,ID,Rule,Action
0,1.0,is_duplicate,Exclude
1,2.0,missing_datetime,Exclude
2,3.0,invalid_time_order,Exclude
3,4.0,invalid_month,Exclude
4,5.0,invalid_duration,Exclude
5,6.0,invalid_distance,Exclude
6,7.0,invalid_speed,Exclude
7,8.0,suspicious_zero_fare,Flag
8,9.0,short_duration_long_distance,Flag
9,10.0,excessive_speed,Flag


In [14]:
for file in raw_files:
    # Check if cleaned file already exists:
    cleaned_out = cleaned_dir / f"cleaned_{file.name}"
    if cleaned_out.exists():
        print(f"Skipped (already exists): {cleaned_out.name}")
    
    # Take month's name:
    month_str = file.name.split('-')[1].split('.')[0]
    month_col_name = pd.to_datetime(f'2021-{month_str}-01').strftime('%B') 
    month_int = pd.to_datetime(f"2021-{month_str}-01").month

    print(f"--- Processing: {month_col_name}: {file.name} ---")
    df_file = pd.read_parquet(file)
    
    # Normalize data:
    normalized_file = normalize(df_file)

    # Applying QA rules, add QA summary to final QA report:
    file_flag = run_quality_check(normalized_file, month_int)
    flag_summary, threshold = summarize_qa_flags(file_flag)
    final_qa_report_df[month_col_name] = flag_summary

    # Clean data and take flag of not removed rows:
    file_cleaned, file_standard = clean(normalized_file, file_flag, threshold)

    # Save cleaned file to folder processed/cleaned_data
    file_cleaned.to_parquet(cleaned_out, index=False, engine="pyarrow")
    print(f"Saved cleaned file: {cleaned_out.name}")
    
    # Save flags for later analysis to folder processed/flags_for_analysis
    flag_out = flag_dir / f"flag_{file.name}"
    if flag_out.exists():
        print(f"Skipped (already exists): {flag_out.name}")
    else:
        file_standard.to_parquet(flag_out, index=False, engine="pyarrow")
        print(f"Saved flag file for upcoming analysis: {flag_out.name}")

# Save QA summary to folder reports
qa_path = reports_dir / "qa_summary.csv"
final_qa_report_df.to_csv(qa_path, index=False)
print(f"QA report saved to {qa_path}")
print("Data is ready for analysis!")


--- Processing: January: yellow_tripdata_2021-01.parquet ---
Saved cleaned file: cleaned_yellow_tripdata_2021-01.parquet
Saved flag file for upcoming analysis: flag_yellow_tripdata_2021-01.parquet
--- Processing: February: yellow_tripdata_2021-02.parquet ---
Saved cleaned file: cleaned_yellow_tripdata_2021-02.parquet
Saved flag file for upcoming analysis: flag_yellow_tripdata_2021-02.parquet
--- Processing: March: yellow_tripdata_2021-03.parquet ---
Saved cleaned file: cleaned_yellow_tripdata_2021-03.parquet
Saved flag file for upcoming analysis: flag_yellow_tripdata_2021-03.parquet
--- Processing: April: yellow_tripdata_2021-04.parquet ---
Saved cleaned file: cleaned_yellow_tripdata_2021-04.parquet
Saved flag file for upcoming analysis: flag_yellow_tripdata_2021-04.parquet
--- Processing: May: yellow_tripdata_2021-05.parquet ---
Saved cleaned file: cleaned_yellow_tripdata_2021-05.parquet
Saved flag file for upcoming analysis: flag_yellow_tripdata_2021-05.parquet
--- Processing: June: 

### View May's data after cleaned

In [15]:
df_may = pd.read_parquet("../processed/cleaned_data/cleaned_yellow_tripdata_2021-05.parquet")
df_may.head(10)

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,...,PU_Borough,PU_Zone,DO_Borough,DO_Zone,trip_duration_seconds,trip_duration_minutes,avg_speed_mph,pickup_day_of_week,is_weekend,computed_total_amount
0,2021-05-01 00:37:18-04:00,2021-05-01 00:41:07-04:00,2.0,0.7,1.0,141,263,1,5.0,3.0,...,Manhattan,Lenox Hill West,Manhattan,Yorkville West,229.0,4.0,11.0,Saturday,True,13.5
1,2021-05-01 00:43:01-04:00,2021-05-01 00:49:19-04:00,1.0,1.4,1.0,263,75,2,6.5,3.0,...,Manhattan,Yorkville West,Manhattan,East Harlem South,378.0,6.0,13.33,Saturday,True,12.8
2,2021-05-01 00:05:54-04:00,2021-05-01 00:31:46-04:00,1.0,5.7,1.0,142,129,2,21.5,3.0,...,Manhattan,Lincoln Square East,Queens,Jackson Heights,1552.0,26.0,13.22,Saturday,True,27.8
3,2021-05-01 00:08:21-04:00,2021-05-01 00:19:20-04:00,1.0,3.04,1.0,231,97,1,11.5,0.5,...,Manhattan,TriBeCa/Civic Center,Brooklyn,Fort Greene,659.0,11.0,16.61,Saturday,True,18.36
4,2021-05-01 00:32:44-04:00,2021-05-01 00:48:44-04:00,1.0,4.04,1.0,148,17,1,15.5,0.5,...,Manhattan,Lower East Side,Brooklyn,Bedford,960.0,16.0,15.15,Saturday,True,25.09
5,2021-05-01 00:25:53-04:00,2021-05-01 00:36:21-04:00,1.0,3.6,1.0,148,112,1,13.0,3.0,...,Manhattan,Lower East Side,Brooklyn,Greenpoint,628.0,10.0,20.64,Saturday,True,22.65
6,2021-05-01 00:49:19-04:00,2021-05-01 01:01:18-04:00,2.0,3.1,1.0,148,229,1,12.0,3.0,...,Manhattan,Lower East Side,Manhattan,Sutton Place/Turtle Bay North,719.0,12.0,15.52,Saturday,True,18.3
7,2021-05-01 00:06:23-04:00,2021-05-01 00:18:50-04:00,1.0,2.9,1.0,68,143,2,11.5,3.0,...,Manhattan,East Chelsea,Manhattan,Lincoln Square West,747.0,12.0,13.98,Saturday,True,17.8
8,2021-05-01 00:15:20-04:00,2021-05-01 00:40:17-04:00,1.0,5.77,1.0,239,7,2,21.5,0.5,...,Manhattan,Upper West Side South,Queens,Astoria,1497.0,25.0,13.88,Saturday,True,25.3
9,2021-05-01 00:39:48-04:00,2021-05-01 00:50:09-04:00,2.0,1.87,1.0,148,249,1,9.5,0.5,...,Manhattan,Lower East Side,Manhattan,West Village,621.0,10.0,10.84,Saturday,True,14.8


In [16]:
df_may.columns

Index(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count',
       'trip_distance', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'payment_type', 'fare_amount', 'extra', 'tip_amount', 'tolls_amount',
       'total_amount', 'congestion_surcharge', 'ratecodeID_name',
       'payment_type_name', 'PU_Borough', 'PU_Zone', 'DO_Borough', 'DO_Zone',
       'trip_duration_seconds', 'trip_duration_minutes', 'avg_speed_mph',
       'pickup_day_of_week', 'is_weekend', 'computed_total_amount'],
      dtype='object')

## View QA summary 

In [17]:
qa_summary = pd.read_csv("../reports/qa_summary.csv")
qa_summary

Unnamed: 0,ID,Rule,Action,January,February,March,April,May,June,July,August,September,October,November,December
0,1.0,is_duplicate,Exclude,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,12/0.0%,0/0.0%
1,2.0,missing_datetime,Exclude,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,0/0.0%,10118/0.29%,0/0.0%
2,3.0,invalid_time_order,Exclude,5642/0.41%,6358/0.46%,6430/0.33%,2965/0.14%,2810/0.11%,2117/0.07%,2017/0.07%,2566/0.09%,2446/0.08%,1710/0.05%,955/0.03%,1174/0.04%
3,4.0,invalid_month,Exclude,24/0.0%,31/0.0%,33/0.0%,42/0.0%,94/0.0%,123/0.0%,130/0.0%,526/0.02%,105/0.0%,111/0.0%,9137/0.26%,108/0.0%
4,5.0,invalid_duration,Exclude,16119/1.18%,17437/1.27%,20400/1.06%,19510/0.9%,22499/0.9%,24740/0.87%,27180/0.96%,28167/1.01%,28976/0.98%,30122/0.87%,29908/0.86%,30878/0.96%
5,6.0,invalid_distance,Exclude,19952/1.46%,18866/1.38%,24430/1.27%,31459/1.45%,32875/1.31%,34150/1.2%,37311/1.32%,37514/1.35%,56141/1.89%,42778/1.24%,35326/1.02%,37010/1.15%
6,7.0,invalid_speed,Exclude,24893/1.82%,24629/1.8%,29918/1.55%,33271/1.53%,34053/1.36%,34211/1.21%,36791/1.3%,37802/1.36%,56284/1.9%,42464/1.23%,34536/0.99%,36783/1.14%
7,8.0,suspicious_zero_fare,Flag,300/0.02%,335/0.02%,365/0.02%,368/0.02%,376/0.01%,494/0.02%,554/0.02%,644/0.02%,607/0.02%,650/0.02%,740/0.02%,704/0.02%
8,9.0,short_duration_long_distance,Flag,6338/0.46%,7797/0.57%,7911/0.41%,5679/0.26%,5574/0.22%,4632/0.16%,4572/0.16%,5170/0.19%,5168/0.17%,3964/0.11%,2935/0.08%,3091/0.1%
9,10.0,excessive_speed,Flag,1745/0.13%,2609/0.19%,2872/0.15%,4680/0.22%,5534/0.22%,5122/0.18%,5137/0.18%,5375/0.19%,5486/0.19%,4868/0.14%,4455/0.13%,4445/0.14%


### QA summary intepretation
- **fare–total mismatches**, **invalid distance/speed**, and **unusual passenger counts** contribute the largest share of flags across all months, likely caused by sensor (taximeter or GPS) inaccuracies, fare calculation inconsistencies.
- **missing datetime**, **invalid payment type**, and **invalid tip amount** occur very rarely, showing that well recorded and reliable transactional fields.