In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the per-class metrics CSV.
df = pd.read_csv('../output/pa-as-yo-go-ap.csv')

# Strip leading/trailing spaces from the column names *before* any column is
# looked up by name, so a padded CSV header cannot cause a KeyError below.
df.columns = df.columns.str.strip()

# Filter out rows whose file column contains src/test/java. The pattern is
# matched literally (regex=False), and rows with a missing file value are kept
# (na=False), which also keeps NaN out of the boolean mask so `~` cannot fail.
df = df[~df['file'].str.contains('src/test/java', regex=False, na=False)]
# Filter out rows whose type column is equal to enum.
df = df[df['type'] != 'enum']

# Keep only the tag/class identifiers and the six metric columns. An explicit
# copy makes the assignments below act on an independent frame instead of a
# slice of the filtered one (avoids pandas' chained-assignment warning).
df = df[['tag_date', 'tag_name', 'class', 'wmc', 'dit', 'noc', 'cbo', 'rfc', 'lcom']].copy()
df.columns = ['TagDate', 'TagName', 'ClassName', 'WMC', 'DIT', 'NOC', 'CBO', 'RFC', 'LCOM']
df['TagDate'] = pd.to_datetime(df['TagDate'])

# Generate a histogram with a KDE overlay for each metric, laid out as a grid
# of subplots: one row per tag date, one column per metric.
metrics = ['WMC', 'DIT', 'NOC', 'CBO', 'RFC', 'LCOM']

# Group rows by TagDate; each group becomes one row of the figure.
dfs = {tag_date: tag_df for tag_date, tag_df in df.groupby('TagDate')}

num_repos = len(dfs)
num_metrics = len(metrics)

if num_repos == 0:
    # plt.subplots rejects nrows=0 with an opaque message; fail clearly instead.
    raise ValueError('No rows left to plot after filtering: nothing to group by TagDate')

# squeeze=False keeps `axes` two-dimensional even when there is a single tag
# date, so the axes[row, col] indexing below is always valid.
fig, axes = plt.subplots(
    nrows=num_repos,
    ncols=num_metrics,
    figsize=(5 * num_metrics, 5 * num_repos),
    sharey=False,
    sharex=False,
    squeeze=False,
)

# Use dedicated loop names so the module-level `df` is not overwritten by the
# last group once the loop finishes.
for row, (tag_date, tag_df) in enumerate(dfs.items()):
    for col, metric in enumerate(metrics):
        ax = axes[row, col]
        values = tag_df[metric]
        lowest = values.min()

        sns.histplot(values, bins=50, kde=True, ax=ax, edgecolor='black')
        ax.set_title(f'{metric} - {tag_date}', fontsize=14, fontweight='bold')
        ax.set_xlabel(metric, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.grid(True, linestyle='--', alpha=0.7)

        # Summary statistics shown in the top-right corner of the subplot.
        summary = (
            f'Classes: {len(values)}\n\n'
            f'Mean: {values.mean():.2f}\n'
            f'Median: {values.median():.2f}\n'
            f'Max: {values.max():.2f}'
        )
        ax.text(0.95, 0.95, summary,
                transform=ax.transAxes, verticalalignment='top', horizontalalignment='right')

        # Start the x axis at the smallest observed value. Skip this when the
        # minimum is NaN (no valid values), since NaN axis limits are rejected.
        if pd.notna(lowest):
            ax.set_xlim(left=lowest)

plt.tight_layout()
plt.show()

