# NORMALIZE AND CLEAN DATA

In [None]:
import pandas as pd
import sys
import os

# Make the parent of the current working directory importable so that the
# project's `src` package can be resolved from inside this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local helpers used throughout this notebook (resolved via the project root added to sys.path)
from src.utils.normalizing import normalize          # Enrich the raw data, derive feature columns, drop unused columns
from src.utils.qa_rules import run_quality_check, summarize_qa_flags  # Build boolean QA flag columns and summarize them
from src.utils.cleaning import clean                 # Filter out rows that violate the QA rules

## Normalize one month of data using the prebuilt functions
**Objective:** Load the raw January data and apply the `normalize` function (from `src.utils.normalizing`).
* **Enrich data:** Add `PU/DO_Borough`, `payment_type_name`, etc.
* **Feature Engineering:** Create derived columns like `trip_duration`, `avg_speed`, and `pickup_day_of_week`.
* **Feature Selection:** Drop the irrelevant columns identified in Notebook 1: `airport_fee` (only 5 non-null values); `store_and_fwd_flag` and `VendorID` (irrelevant to the analysis); and `mta_tax` and `improvement_surcharge` (low variance).

In [None]:
# Read the raw January 2021 parquet file, then run it through the project's
# normalize() step; the raw frame is kept in df1 for later comparison.
df1 = pd.read_parquet(path="../raw/yellow_tripdata_2021-01.parquet")
df1_normalized = normalize(df1)
print("Successfully normalized January data")

In [None]:
# Preview the frame as loaded from disk, for comparison with the normalized output.
# NOTE(review): this assumes normalize() did not modify df1 in place — confirm.
print("First 10 rows of data before normalized: ")
df1.head(n=10)

In [None]:
# Preview the normalized frame; the last expression is rendered as the cell output.
print("First 10 rows of data after normalized with new columns at the end: ")
df1_normalized.head(n=10)

In [None]:
# Report the column overview of the normalized frame via its info() method.
print("Data after normalized info: ")
df1_normalized.info()

## Applying QA steps 
**Objective:** Apply the `run_quality_check` function (from `src.utils.qa_rules`) to the normalized data.
* This will return `df1_flag`, a DataFrame containing 11 boolean flag columns based on 11 rules.
* Then, use `summarize_qa_flags` to generate the summary string ("count/pct%") for the report.

In [None]:
# Evaluate the QA rules on the normalized data; the result holds the boolean flag columns.
df1_flag = run_quality_check(df1_normalized)
# Condense the flags into the per-rule summary used for the report.
january = summarize_qa_flags(df1_flag)
print("Successfully run quality check!")

In [None]:
# Preview the QA flags computed for January; the last expression is rendered as the cell output.
print("First 10 rows of January's flag: ")
df1_flag.head(n=10)

In [None]:
# Display the summary produced by summarize_qa_flags; a bare name on the
# last line of a cell is rendered as the cell output.
print("Summary of January's flag, 0-10 indicates rule ID, row 11 is the sum of trips that has violations: ")
january

## Clean one month of data using the prebuilt function
**Objective:** Apply the `clean` function (from `src.utils.cleaning`).
* This function will take `df1_normalized` and `df1_flag` as input.
* It will filter and remove rows that violate the rules according to our defined strategy.

In [None]:
# clean() receives the normalized data together with its QA flags and returns two objects.
# NOTE(review): df1_standard is not used in the cells visible here — confirm what it holds and where it is consumed.
df1_cleaned, df1_standard = clean(df1_normalized, df1_flag)
print("Successfully cleaned data!")
# Report the size of the cleaned frame.
print("Cleaned data has shape: ", df1_cleaned.shape)

In [None]:
# Report the column overview of the cleaned frame via its info() method.
print("Data after cleaned info: ")
df1_cleaned.info()

In [None]:
# Show summary statistics of the cleaned frame; the describe() result is
# rendered as the cell output because it is the last expression.
print("More information about cleaned data (numerical values):")
df1_cleaned.describe()

## Clean all 12 months of data and save the results to `processed`
