# AGGREGATE KPI AND VISUALIZE DATA

In [None]:
import pandas as pd
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package imported below can be resolved from this notebook.
parent_of_cwd = os.path.join(os.getcwd(), '..')
project_root = os.path.abspath(parent_of_cwd)
# Only register the path once, so re-running the cell does not add duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.utils.kpi import aggregate_kpis
from src.utils.visualization import visualize_summary, visualize_customer_segments, visualize_temporal_trends, visualize_trip_characteristics, visualize_geographical_analysis

# KPI

## Daily

In [None]:
# January 2021 serves as the sample month for the single-month cells below.
jan_trips_path = '../processed/cleaned_data/cleaned_yellow_tripdata_2021-01.parquet'
jan_flags_path = '../processed/flags_for_analysis/flag_yellow_tripdata_2021-01.parquet'
df1 = pd.read_parquet(jan_trips_path)
df1_flag = pd.read_parquet(jan_flags_path)

# The KPI result is indexed by granularity; this notebook reads the
# 'Daily', 'Weekly' and 'Monthly' entries.
df1_kpi = aggregate_kpis(df1, df1_flag)

# Keep the daily table under its own name (the summary visualization reuses it)
# and preview its first rows.
df1_kpi_Daily = df1_kpi['Daily']
df1_kpi_Daily.head()

## Weekly

In [None]:
# Preview the weekly KPI table computed for January 2021 in the Daily cell
# (head() with no argument returns the first five rows).
df1_kpi['Weekly'].head()

## Monthly

In [None]:
# Preview the monthly KPI table computed for January 2021 in the Daily cell
# (head() with no argument returns the first five rows).
df1_kpi['Monthly'].head()

# KPI 12 months

In [None]:
# Build the 2021 KPI report: aggregate every month on its own, stack the
# per-month tables, and export one CSV per granularity.

# Make sure the output folder is there before any heavy work starts.
os.makedirs("../processed/kpi_report/", exist_ok=True)

# Zero-padded month tokens "01" .. "12", matching the parquet file names.
months = [f"{i:02d}" for i in range(1, 13)]

# One accumulator per KPI granularity.
daily_list, weekly_list, monthly_list = [], [], []

for month in months:
    cleaned_path = f"../processed/cleaned_data/cleaned_yellow_tripdata_2021-{month}.parquet"
    flags_path = f"../processed/flags_for_analysis/flag_yellow_tripdata_2021-{month}.parquet"

    df = pd.read_parquet(cleaned_path)
    df_flags = pd.read_parquet(flags_path)
    df_kpi = aggregate_kpis(df, df_flags)

    # File each granularity's table of this month into its accumulator.
    for bucket, granularity in (
        (daily_list, 'Daily'),
        (weekly_list, 'Weekly'),
        (monthly_list, 'Monthly'),
    ):
        bucket.append(df_kpi[granularity])

# Stack the twelve monthly tables; the original row indexes are discarded.
# NOTE(review): months are aggregated independently, so a week that straddles
# a month boundary may show up as two rows in the weekly table — confirm
# against aggregate_kpis.
daily_all, weekly_all, monthly_all = (
    pd.concat(frames, ignore_index=True)
    for frames in (daily_list, weekly_list, monthly_list)
)

# Write each yearly table to its CSV file, without the index column.
for table, csv_path in (
    (daily_all, "../processed/kpi_report/kpi_daily_2021.csv"),
    (weekly_all, "../processed/kpi_report/kpi_weekly_2021.csv"),
    (monthly_all, "../processed/kpi_report/kpi_monthly_2021.csv"),
):
    table.to_csv(csv_path, index=False)

# Visualization

In [None]:
# Summary charts for January 2021, built from the cleaned trips and the daily
# KPI table. The return value is kept in df1_visuals1.
# NOTE(review): the batch-export cell at the end of this notebook names the
# figures from this call revenue_per_day, trips_per_day and
# trips_per_week_heatmap — confirm against visualize_summary.
df1_visuals1 = visualize_summary(df1, df1_kpi_Daily)

In [None]:
# Customer-segment charts for January 2021, built from the cleaned trips and
# their analysis flags. The return value is kept in df1_visuals2.
# NOTE(review): the batch-export cell at the end of this notebook names the
# figures from this call payment_type_distribution,
# passenger_count_distribution and tip_correlation_matrix — confirm against
# visualize_customer_segments.
df1_visuals2 = visualize_customer_segments(df1, df1_flag)

In [None]:
# Temporal-trend charts for January 2021, built from the cleaned trips and
# their analysis flags. The return value is kept in df1_visuals3.
# NOTE(review): the batch-export cell at the end of this notebook names the
# figures from this call avg_speed_per_hour, trips_per_hour and
# revenue_per_hour — confirm against visualize_temporal_trends.
df1_visuals3 = visualize_temporal_trends(df1, df1_flag)

In [None]:
# Trip-characteristic charts for January 2021, built from the cleaned trips
# and their analysis flags. The return value is kept in df1_visuals4.
# NOTE(review): the batch-export cell at the end of this notebook names the
# figures from this call trip_distance_distribution and
# trip_duration_distribution — confirm against visualize_trip_characteristics.
df1_visuals4 = visualize_trip_characteristics(df1, df1_flag)

In [None]:
# Geographical charts for January 2021, built from the cleaned trips and
# their analysis flags. The return value is kept in df1_visuals5.
# NOTE(review): the batch-export cell at the end of this notebook names the
# figures from this call top10_pickup_zones and top10_dropoff_zones — confirm
# against visualize_geographical_analysis.
df1_visuals5 = visualize_geographical_analysis(df1, df1_flag)

# Visualize 12 months

In [None]:
import matplotlib.pyplot as plt


def _save_open_figures(folder, filenames):
    """Save the currently open matplotlib figures into *folder*, then close them.

    Open figures are taken in the order reported by ``plt.get_fignums()`` and
    paired positionally with *filenames*. Figures beyond the supplied names
    are closed without being written, so nothing carries over into the next
    visualization call.

    Args:
        folder: Directory the PNG files are written to (must already exist).
        filenames: File names to assign, in figure order.
    """
    fig_nums = plt.get_fignums()
    if len(fig_nums) < len(filenames):
        # Surface missing outputs instead of silently writing fewer files.
        # NOTE(review): a likely cause is the visualization helper calling
        # plt.show() under a backend that closes figures on show — confirm.
        print(
            f"Warning: expected {len(filenames)} figure(s) for {folder} "
            f"but only {len(fig_nums)} are open; missing files were not written."
        )
    for fig_num, filename in zip(fig_nums, filenames):
        fig = plt.figure(fig_num)
        fig.savefig(os.path.join(folder, filename), dpi=300, bbox_inches='tight')
    plt.close('all')


months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
month_names = {
    '01': 'January', '02': 'February', '03': 'March', '04': 'April',
    '05': 'May', '06': 'June', '07': 'July', '08': 'August',
    '09': 'September', '10': 'October', '11': 'November', '12': 'December'
}

# File names are assigned by figure position, so any figure still open from
# an earlier cell would shift every name. Start from a clean slate.
plt.close('all')

for month in months:
    print(f"Processing month: {month}")

    # Load the month's cleaned trips and analysis flags, then derive the KPIs.
    df = pd.read_parquet(f'../processed/cleaned_data/cleaned_yellow_tripdata_2021-{month}.parquet')
    df_flag = pd.read_parquet(f'../processed/flags_for_analysis/flag_yellow_tripdata_2021-{month}.parquet')
    df_kpi = aggregate_kpis(df, df_flag)
    df_kpi_Daily = df_kpi['Daily']

    # Create the output folder for this month (named after the month).
    month_folder = os.path.join('../figures', month_names[month])
    os.makedirs(month_folder, exist_ok=True)

    # Run each visualization helper and immediately save the figures it left
    # open, so figures from different helpers never mix.
    visualize_summary(df, df_kpi_Daily)
    _save_open_figures(month_folder, (
        'revenue_per_day.png',
        'trips_per_day.png',
        'trips_per_week_heatmap.png',
    ))

    visualize_customer_segments(df, df_flag)
    _save_open_figures(month_folder, (
        'payment_type_distribution.png',
        'passenger_count_distribution.png',
        'tip_correlation_matrix.png',
    ))

    visualize_temporal_trends(df, df_flag)
    _save_open_figures(month_folder, (
        'avg_speed_per_hour.png',
        'trips_per_hour.png',
        'revenue_per_hour.png',
    ))

    visualize_trip_characteristics(df, df_flag)
    _save_open_figures(month_folder, (
        'trip_distance_distribution.png',
        'trip_duration_distribution.png',
    ))

    visualize_geographical_analysis(df, df_flag)
    _save_open_figures(month_folder, (
        'top10_pickup_zones.png',
        'top10_dropoff_zones.png',
    ))

    print(f"Saved all figures for {month_names[month]} to {month_folder}")
    print("-" * 50)