# Cricket Match Data Exploration

This notebook explores the Cricsheet data to understand patterns and features for our prediction model.

## Setup


In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from datetime import datetime

from config import DATABASE_PATH

# Set style
# Apply a matplotlib style sheet and a seaborn colour palette once, up front,
# so the figures drawn later in the notebook share the same look.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Echo the configured path so the reader can see which database file is used.
print(f"Database path: {DATABASE_PATH}")


In [None]:
# Connect to database
# One notebook-level connection is opened here; query_df reads through it and
# the "Close Connection" section at the end of the notebook closes it.
conn = sqlite3.connect(DATABASE_PATH)
# Rows fetched through this connection's cursors are returned as sqlite3.Row
# objects instead of plain tuples.
conn.row_factory = sqlite3.Row

# Helper function to run queries
def query_df(sql, params=None):
    """Execute a SQL statement on the notebook connection and return a DataFrame.

    Args:
        sql: SQL text to run.
        params: Optional bind parameters, forwarded to pandas unchanged.

    Returns:
        pandas.DataFrame containing the rows produced by the statement.
    """
    frame = pd.read_sql_query(sql, con=conn, params=params)
    return frame


## 1. Database Overview


In [None]:
# Get table counts
# Table names come from this fixed list only; SQL identifiers cannot be passed
# as bind parameters, which is why they are interpolated into the statement.
tables = ['teams', 'players', 'venues', 'matches', 'innings', 'deliveries', 'player_match_stats']

counts = {}
count_errors = {}  # table name -> reason the count query failed
for table in tables:
    try:
        result = query_df(f"SELECT COUNT(*) as count FROM {table}")
        counts[table] = result['count'].iloc[0]
    except Exception as exc:
        # Keep the overview best-effort (a table that cannot be read still
        # shows as 0), but remember why so it is not mistaken for a table
        # that is merely empty. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt / SystemExit propagate.
        counts[table] = 0
        count_errors[table] = exc

print("Database Summary")
print("=" * 40)
for table, count in counts.items():
    print(f"{table:25} {count:>10,} rows")

# Surface any failures after the summary so they are visible to the reader.
if count_errors:
    print("\nTables that could not be counted (shown as 0 above):")
    for table, exc in count_errors.items():
        print(f"  {table}: {exc}")


## 2. Match Analysis


In [None]:
# Load matches data
# Every team reference (team1, team2, winner, toss winner) and the venue are
# resolved to readable names with LEFT JOINs, so a match row is kept even
# when one of those ids has no matching row.
matches_df = query_df("""
    SELECT 
        m.*,
        t1.name as team1_name,
        t2.name as team2_name,
        w.name as winner_name,
        tw.name as toss_winner_name,
        v.name as venue_name,
        v.city as venue_city
    FROM matches m
    LEFT JOIN teams t1 ON m.team1_id = t1.team_id
    LEFT JOIN teams t2 ON m.team2_id = t2.team_id
    LEFT JOIN teams w ON m.winner_id = w.team_id
    LEFT JOIN teams tw ON m.toss_winner_id = tw.team_id
    LEFT JOIN venues v ON m.venue_id = v.venue_id
""")

# Parse the date column first, then derive the calendar parts from the parsed values.
matches_df = matches_df.assign(date=pd.to_datetime(matches_df['date']))
matches_df = matches_df.assign(
    year=matches_df['date'].dt.year,
    month=matches_df['date'].dt.month,
)

first_match = matches_df['date'].min()
last_match = matches_df['date'].max()
print(f"Total matches: {len(matches_df):,}")
print(f"Date range: {first_match} to {last_match}")
matches_df.head()


In [None]:
# Match type distribution
# Left panel: share of each match type overall. Right panel: matches per year, split by type.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
share_ax, yearly_ax = axes

# Pie chart for match types
type_counts = matches_df['match_type'].value_counts()
share_ax.pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%')
share_ax.set(title='Match Type Distribution')

# Matches per year by type: one row per year, one column per match type
per_year_and_type = matches_df.groupby(['year', 'match_type']).size()
yearly_matches = per_year_and_type.unstack(fill_value=0)
yearly_matches.plot.bar(ax=yearly_ax)
yearly_ax.set(
    title='Matches per Year by Type',
    xlabel='Year',
    ylabel='Number of Matches',
)

plt.tight_layout()
plt.show()


In [None]:
# Toss decision impact
# Only matches with a recorded winner are considered.
has_winner = matches_df['winner_id'].notna()
toss_df = matches_df.loc[has_winner].copy()
toss_df['toss_winner_won'] = toss_df['toss_winner_id'].eq(toss_df['winner_id'])

# Mean of the boolean flag is the fraction of matches won by the toss winner;
# scale it to a percentage per (match type, toss decision) pair.
toss_win_rate = (
    toss_df
    .groupby(['match_type', 'toss_decision'])['toss_winner_won']
    .mean()
    .mul(100)
)

print("\nWin Rate When Winning Toss (%)")
print(toss_win_rate.unstack())


## 3. Score Analysis


In [None]:
# Load innings data
# Each innings row is joined to its match to pick up the match type and date.
innings_df = query_df("""
    SELECT 
        i.*,
        m.match_type,
        m.date
    FROM innings i
    JOIN matches m ON i.match_id = m.match_id
""")


def plot_score_distribution(ax, scores, match_type, color=None):
    """Draw a histogram of innings totals with a dashed marker at the mean.

    Args:
        ax: Matplotlib axes to draw on.
        scores: pandas Series of innings totals with missing values removed.
        match_type: Match type label used in the panel title (e.g. 'T20').
        color: Optional bar colour; None keeps matplotlib's default colour.

    Returns:
        None. The axes are modified in place.
    """
    ax.set_title(f'{match_type} Score Distribution')
    ax.set_xlabel('Runs')
    if scores.empty:
        # Without any innings the mean is NaN, so skip the histogram and the
        # mean line rather than showing an empty plot with a "Mean: nan" legend.
        ax.text(0.5, 0.5, 'No data', transform=ax.transAxes, ha='center', va='center')
        return
    mean_score = scores.mean()
    ax.hist(scores, bins=30, edgecolor='black', alpha=0.7, color=color)
    ax.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.0f}')
    ax.legend()


def scores_for(match_type):
    """Return the non-missing total_runs values of innings_df for one match type."""
    of_type = innings_df['match_type'] == match_type
    # Drop missing totals explicitly so the histogram and the printed
    # statistics are computed over the same values.
    return innings_df.loc[of_type, 'total_runs'].dropna()


# Score distributions by match type
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

t20_scores = scores_for('T20')
odi_scores = scores_for('ODI')

plot_score_distribution(axes[0], t20_scores, 'T20')
plot_score_distribution(axes[1], odi_scores, 'ODI', color='orange')

plt.tight_layout()
plt.show()

print(f"\nT20 Score Statistics: Mean={t20_scores.mean():.1f}, Std={t20_scores.std():.1f}")
print(f"ODI Score Statistics: Mean={odi_scores.mean():.1f}, Std={odi_scores.std():.1f}")


## 4. ELO Analysis (if available)


In [None]:
# Check if ELO data exists
# Any failure in the queries below is reported as "not available" rather than
# stopping the notebook.
# NOTE(review): presence is checked on team_elo_history but the rankings are
# read from team_current_elo; confirm the two are always populated together.
try:
    elo_check = query_df("SELECT COUNT(*) as count FROM team_elo_history")
    has_elo_history = elo_check['count'].iloc[0] > 0
    if not has_elo_history:
        print("ELO data not yet calculated. Run: python -m src.elo.calculator")
    else:
        # Load and display team rankings
        rankings = query_df("""
            SELECT t.name, e.elo_t20, e.elo_odi
            FROM team_current_elo e
            JOIN teams t ON e.team_id = t.team_id
            ORDER BY e.elo_t20 DESC
            LIMIT 15
        """)
        print("Top 15 Teams by T20 ELO:")
        print(rankings.to_string(index=False))
except Exception as err:
    print(f"ELO data not available: {err}")


## 5. Close Connection


In [None]:
# Release the SQLite connection opened in the setup section. query_df uses
# this same connection, so it cannot be called after this cell has run.
conn.close()
print("Database connection closed.")
