# Exploratory Data Analysis — Video Game Sales

This notebook uses the `project_games` package to demonstrate the modular analysis pipeline.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from project_games.config import load_config
from project_games.data.loader import load_processed_data
from project_games.analysis.temporal import games_per_year, filter_relevant_period, significant_years
from project_games.analysis.platform import platform_total_sales, platform_growth_analysis
from project_games.analysis.genre import genre_sales_summary, classify_genres
from project_games.analysis.regional import top_platforms_by_region, top_genres_by_region
from project_games.analysis.hypothesis import run_configured_tests
from project_games.visualization.plots_matplotlib import (
    plot_games_per_year,
    plot_platform_evolution,
    plot_boxplot_by_group,
    plot_regional_bars,
    plot_hypothesis_result,
)

# Apply seaborn's "whitegrid" theme before any figure is drawn.
sns.set_theme(style="whitegrid")

# Load the configuration and the processed dataset, then derive the subset
# that filter_relevant_period returns for that configuration.
cfg = load_config()
df = load_processed_data()
df_rel = filter_relevant_period(df, cfg)

# Report how many rows each frame holds.
n_total, n_relevant = len(df), len(df_rel)
print(f"Full dataset: {n_total:,} rows | Relevant period: {n_relevant:,} rows")

## 1. Temporal Analysis

In [None]:
# Both helpers here receive the full dataset `df`, not the filtered `df_rel`.
gpy = games_per_year(df)
plot_games_per_year(gpy)

sig = significant_years(df)

# Label and value of the maximum entry in the per-year result.
peak_year, peak_count = gpy.idxmax(), gpy.max()
print(f"Peak year: {peak_year} ({peak_count} games)")

# Smallest and largest index labels of the significant-years result.
first_sig, last_sig = sig.index.min(), sig.index.max()
print(f"Significant years (>= mean): {first_sig}-{last_sig} ({len(sig)} years)")

## 2. Platform Analysis

In [None]:
ps = platform_total_sales(df_rel)

# Take the first ten rows once so the bar chart and `top_platforms` are built
# from the same slice.
first_ten = ps.head(10)
top_platforms = first_ten.index.tolist()

# Horizontal bar chart of those ten rows.
fig, ax = plt.subplots(figsize=(10, 5))
first_ten.plot.barh(ax=ax)
ax.set(title="Top 10 Platforms by Total Sales", xlabel="Sales ($M)")
plt.tight_layout()
plt.show()

# Kept as the final expression so the notebook renders the first ten rows.
growth = platform_growth_analysis(df_rel)
growth.head(10)

In [None]:
# Restrict the box plot to the first five entries of `top_platforms`.
first_five_platforms = top_platforms[:5]
plot_boxplot_by_group(df_rel, "platform", groups=first_five_platforms)

## 3. Genre Analysis

In [None]:
# Show the genre summary with the notebook's rich display.
gs = genre_sales_summary(df_rel)
display(gs)

# Print the members of each tier as a comma-separated line. The label padding
# keeps the two lists aligned in the output.
tiers = classify_genres(df_rel)
for tier_label, tier_key in (
    ("High-sales genres: ", "high_sales"),
    ("Low-sales genres:  ", "low_sales"),
):
    print(f"{tier_label}{', '.join(tiers[tier_key])}")

## 4. Regional Analysis

In [None]:
# Chart titles, named up front so the plotting calls below stay short.
platform_chart_title = "Top 5 Platforms by Region"
genre_chart_title = "Top 5 Genres by Region"

# Build both regional tables from the filtered dataset before plotting.
plat_by_region = top_platforms_by_region(df_rel)
genre_by_region = top_genres_by_region(df_rel)

# One chart per table, platforms first.
plot_regional_bars(plat_by_region, title=platform_chart_title)
plot_regional_bars(genre_by_region, title=genre_chart_title)

## 5. Hypothesis Tests

In [None]:
# Run the tests selected by the configuration on the filtered dataset.
results = run_configured_tests(df_rel, cfg)

# For each result: print its text summary, draw its plot, then emit a blank
# line to separate it from the next result.
for r in results:
    summary_text = r.summary()
    print(summary_text)
    plot_hypothesis_result(df_rel, r)
    print()