# PVP Leaderboard - Descriptive Analysis
**Dataset**: 1,500 anonymized player entries  
**Category**: PVP (Player vs Player)  
**Focus**: Combat stats, kill/death patterns, arrow accuracy

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

# Load data
df = pd.read_parquet('../data/processed/leaderboard_full.parquet')
pvp = df[df['category'] == 'pvp'].copy()

# Calculate derived metrics
pvp['kd_ratio'] = (pvp['kills'] / pvp['deaths']).replace([np.inf, -np.inf], np.nan)
pvp['arrow_accuracy'] = (pvp['arrows_hit'] / pvp['arrows_shot'] * 100).replace([np.inf, -np.inf], np.nan)

print(f"Loaded {len(pvp):,} players")
pvp.head()

## 1. Distribution of Kills

In [None]:
# Histogram of total kills per player, with mean and median marked as dashed lines.
kills = pvp['kills']
kills_mean = kills.mean()
kills_median = kills.median()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(kills, bins=50, alpha=0.7, color='crimson', edgecolor='black')
ax.axvline(kills_mean, color='blue', linestyle='--', linewidth=2,
           label=f'Mean: {kills_mean:.0f}')
ax.axvline(kills_median, color='orange', linestyle='--', linewidth=2,
           label=f'Median: {kills_median:.0f}')
ax.set_title('Distribution of Total Kills', fontsize=16, fontweight='bold')
ax.set_xlabel('Kills', fontsize=12)
ax.set_ylabel('Number of Players', fontsize=12)
ax.legend()
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Headline numbers for the same column
print(f"Mean Kills: {kills_mean:.2f}")
print(f"Median Kills: {kills_median:.2f}")
print(f"Max Kills: {kills.max():,}")

## 2. K/D Ratio Distribution

In [None]:
# Histogram of K/D ratios. Only ratios below KD_PLOT_CAP are plotted so that a few
# extreme values do not flatten the rest of the distribution; NaN ratios are dropped.
KD_PLOT_CAP = 5
kd_clean = pvp['kd_ratio'][pvp['kd_ratio'] < KD_PLOT_CAP].dropna()

plt.figure(figsize=(12, 6))
plt.hist(kd_clean, bins=50, alpha=0.7, color='darkgreen', edgecolor='black')
plt.axvline(1.0, color='red', linestyle='--', linewidth=2, label='K/D = 1.0 (Balanced)')
plt.axvline(kd_clean.median(), color='orange', linestyle='--', linewidth=2, label=f'Median: {kd_clean.median():.2f}')
plt.title('K/D Ratio Distribution', fontsize=16, fontweight='bold')
plt.xlabel('K/D Ratio', fontsize=12)
plt.ylabel('Number of Players', fontsize=12)
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# The printed statistics use every PVP player, not just the plotted subset.
print(f"Average K/D: {pvp['kd_ratio'].mean():.3f}")

# Compare kills with deaths directly instead of testing kd_ratio > 1.0.
# kd_ratio is NaN for zero-death players (the infinite ratio was replaced when the
# column was derived) and NaN > 1.0 is False, so players with kills but no deaths
# would otherwise be counted as NOT exceeding 1.0.
above_one = pvp['kills'] > pvp['deaths']
print(f"Players with K/D > 1.0: {above_one.sum()} ({above_one.mean()*100:.1f}%)")

## 3. Kills vs Deaths Scatter + Regression

In [None]:
# Scatter of kills against deaths with a least-squares trend line and a
# K/D = 1.0 reference diagonal.
plt.figure(figsize=(12, 8))
plt.scatter(pvp['deaths'], pvp['kills'], alpha=0.5, s=40, c='purple')

# Linear regression. np.polyfit cannot cope with NaN, so fit on complete rows only.
fit_data = pvp[['deaths', 'kills']].dropna()
z = np.polyfit(fit_data['deaths'], fit_data['kills'], 1)
p = np.poly1d(z)
# A straight line is fully described by its two end points. Drawing it through every
# (unsorted) player makes the path double back on itself, which smears the dash pattern.
x_line = np.array([fit_data['deaths'].min(), fit_data['deaths'].max()], dtype=float)
plt.plot(x_line, p(x_line), "r--", linewidth=3, label=f'Trend: y = {z[0]:.2f}x + {z[1]:.0f}')

# 1:1 line: points above it have more kills than deaths
max_val = max(pvp['deaths'].max(), pvp['kills'].max())
plt.plot([0, max_val], [0, max_val], 'g--', linewidth=2, alpha=0.5, label='K/D = 1.0')

plt.title('Kills vs Deaths with Linear Regression', fontsize=16, fontweight='bold')
plt.xlabel('Deaths', fontsize=12)
plt.ylabel('Kills', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Pearson correlation between the two columns (off-diagonal entry of the 2x2 matrix)
corr = pvp[['kills', 'deaths']].corr().iloc[0, 1]
print(f"Correlation: {corr:.3f}")

## 4. Arrow Accuracy Distribution

In [None]:
# Histogram of arrow accuracy. Values above 100% (more hits than shots) and
# missing values are left out of both the plot and the printed statistics.
accuracy = pvp['arrow_accuracy']
acc_clean = accuracy.loc[accuracy <= 100].dropna()
acc_mean = acc_clean.mean()
acc_median = acc_clean.median()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(acc_clean, bins=50, alpha=0.7, color='teal', edgecolor='black')
ax.axvline(acc_mean, color='red', linestyle='--', linewidth=2,
           label=f'Mean: {acc_mean:.1f}%')
ax.axvline(acc_median, color='orange', linestyle='--', linewidth=2,
           label=f'Median: {acc_median:.1f}%')
ax.set_title('Arrow Accuracy Distribution', fontsize=16, fontweight='bold')
ax.set_xlabel('Accuracy (%)', fontsize=12)
ax.set_ylabel('Number of Players', fontsize=12)
ax.legend()
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

print(f"Average Accuracy: {acc_mean:.2f}%")
print(f"Players with >50% accuracy: {(acc_clean > 50).sum()}")

## 5. Arrows Shot vs Arrows Hit

In [None]:
# Scatter of arrows hit against arrows shot with a least-squares trend line.
plt.figure(figsize=(12, 8))
plt.scatter(pvp['arrows_shot'], pvp['arrows_hit'], alpha=0.5, s=40, c='navy')

# Regression. np.polyfit cannot cope with NaN, so fit on complete rows only.
fit_data = pvp[['arrows_shot', 'arrows_hit']].dropna()
z = np.polyfit(fit_data['arrows_shot'], fit_data['arrows_hit'], 1)
p = np.poly1d(z)
# A straight line is fully described by its two end points. Drawing it through every
# (unsorted) player makes the path double back on itself, which smears the dash pattern.
x_line = np.array([fit_data['arrows_shot'].min(), fit_data['arrows_shot'].max()], dtype=float)
plt.plot(x_line, p(x_line), "r--", linewidth=3, label=f'Trend: y = {z[0]:.3f}x + {z[1]:.0f}')

plt.title('Arrows Shot vs Arrows Hit', fontsize=16, fontweight='bold')
plt.xlabel('Arrows Shot', fontsize=12)
plt.ylabel('Arrows Hit', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Pearson correlation between the two columns (off-diagonal entry of the 2x2 matrix)
corr = pvp[['arrows_shot', 'arrows_hit']].corr().iloc[0, 1]
print(f"Correlation: {corr:.3f}")
# NOTE(review): the fit includes an intercept, so the slope is the extra hits per
# extra arrow shot rather than a per-player average accuracy - confirm the wording.
print(f"Average accuracy from slope: {z[0]*100:.1f}%")

## 6. Damage Dealt Distribution

In [None]:
# Histogram of damage dealt per player, with mean and median marked as dashed lines.
damage = pvp['damage_dealt']
damage_mean = damage.mean()
damage_median = damage.median()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(damage, bins=50, alpha=0.7, color='orange', edgecolor='black')
ax.axvline(damage_mean, color='blue', linestyle='--', linewidth=2,
           label=f'Mean: {damage_mean:.1f} ♥')
ax.axvline(damage_median, color='red', linestyle='--', linewidth=2,
           label=f'Median: {damage_median:.1f} ♥')
ax.set_title('Damage Dealt Distribution', fontsize=16, fontweight='bold')
ax.set_xlabel('Damage (Hearts)', fontsize=12)
ax.set_ylabel('Number of Players', fontsize=12)
ax.legend()
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Sum over all PVP players
print(f"Total Damage Dealt: {damage.sum():,.0f} ♥")

## 7. Correlation Heatmap

In [None]:
# Columns that enter the pairwise correlation matrix
numeric_cols = ['kills', 'deaths', 'kd_ratio', 'killstreak', 'damage_dealt', 'arrows_shot', 'arrows_hit', 'arrow_accuracy']
corr_matrix = pvp[numeric_cols].corr()

# Annotated heatmap, colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.3f',
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Heatmap - PVP Data', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

print("\nStrongest Correlations:")
# Walk the strict upper triangle so each unordered pair of variables appears once
# and the diagonal (self-correlation) is skipped.
labels = corr_matrix.columns
n_vars = len(labels)
corr_pairs = [
    {
        'var1': labels[row],
        'var2': labels[col],
        'correlation': corr_matrix.iloc[row, col],
    }
    for row in range(n_vars)
    for col in range(row + 1, n_vars)
]
# Rank by absolute value so strong negative correlations are listed as well
corr_df = pd.DataFrame(corr_pairs).sort_values('correlation', ascending=False, key=abs)
print(corr_df.head(5))

## 8. Top 20 Players by Kills

In [None]:
# The twenty rows with the highest kill counts, reduced to the columns printed below
leaderboard_cols = ['player_id', 'kills', 'deaths', 'kd_ratio', 'killstreak']
top_20 = pvp.nlargest(20, 'kills')[leaderboard_cols]

# Bars are labelled by rank (P1, P2, ...) rather than by player id
x = range(len(top_20))
rank_labels = [f"P{rank}" for rank in range(1, len(top_20) + 1)]

fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(x, top_20['kills'], alpha=0.7, color='crimson', edgecolor='black')
ax.set_xticks(x)
ax.set_xticklabels(rank_labels, rotation=0)
ax.set_title('Top 20 Players by Total Kills', fontsize=16, fontweight='bold')
ax.set_xlabel('Player Rank', fontsize=12)
ax.set_ylabel('Total Kills', fontsize=12)
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Plain-text table of the same rows, without the DataFrame index
print(top_20.to_string(index=False))

## 9. Killstreak Distribution

In [None]:
# Histogram of the killstreak column, with mean and median marked as dashed lines.
killstreak = pvp['killstreak']
streak_mean = killstreak.mean()
streak_median = killstreak.median()

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(killstreak, bins=50, alpha=0.7, color='gold', edgecolor='black')
ax.axvline(streak_mean, color='red', linestyle='--', linewidth=2,
           label=f'Mean: {streak_mean:.1f}')
ax.axvline(streak_median, color='blue', linestyle='--', linewidth=2,
           label=f'Median: {streak_median:.1f}')
ax.set_title('Best Killstreak Distribution', fontsize=16, fontweight='bold')
ax.set_xlabel('Killstreak', fontsize=12)
ax.set_ylabel('Number of Players', fontsize=12)
ax.legend()
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

print(f"Max Killstreak: {killstreak.max():.0f}")
print(f"Players with killstreak >50: {(killstreak > 50).sum()}")

## 10. K/D Ratio vs Arrow Accuracy

In [None]:
# Scatter of K/D ratio against arrow accuracy with a least-squares trend line.
# Clean data: keep K/D below 5 and accuracy at most 100%, and drop rows where
# either derived metric is missing.
mask = (pvp['kd_ratio'] < 5) & (pvp['arrow_accuracy'] <= 100)
plot_data = pvp[mask].dropna(subset=['kd_ratio', 'arrow_accuracy'])

plt.figure(figsize=(12, 8))
plt.scatter(plot_data['arrow_accuracy'], plot_data['kd_ratio'], alpha=0.5, s=40, c='forestgreen')

# Trend line
z = np.polyfit(plot_data['arrow_accuracy'], plot_data['kd_ratio'], 1)
p = np.poly1d(z)
# A straight line is fully described by its two end points. Drawing it through every
# (unsorted) player makes the path double back on itself, which smears the dash pattern.
x_line = np.array([plot_data['arrow_accuracy'].min(), plot_data['arrow_accuracy'].max()], dtype=float)
plt.plot(x_line, p(x_line), "r--", linewidth=3,
         label=f'Trend: y = {z[0]:.3f}x + {z[1]:.2f}')

plt.title('Skill Correlation: Arrow Accuracy vs K/D Ratio', fontsize=16, fontweight='bold')
plt.xlabel('Arrow Accuracy (%)', fontsize=12)
plt.ylabel('K/D Ratio', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Pearson correlation on the cleaned subset (off-diagonal entry of the 2x2 matrix)
corr = plot_data[['arrow_accuracy', 'kd_ratio']].corr().iloc[0, 1]
print(f"Correlation: {corr:.3f}")

# Word the interpretation to match the sign. A negative correlation means higher
# accuracy goes with a lower K/D; zero or NaN must not be reported as negative.
if corr > 0:
    interpretation = "Positive correlation - better accuracy → better K/D"
elif corr < 0:
    interpretation = "Negative correlation - better accuracy → worse K/D"
else:
    interpretation = "No linear correlation detected between accuracy and K/D"
print(f"Interpretation: {interpretation}")

## Summary Statistics

In [None]:
# Console report: totals, headline averages, then describe() for the key metrics.
# The report is assembled as a list of lines and written with a single print call.
banner = "=" * 80
detail_cols = ['kills', 'deaths', 'kd_ratio', 'killstreak', 'damage_dealt', 'arrow_accuracy']

report_lines = [
    banner,
    "COMPREHENSIVE PVP SUMMARY",
    banner,
    # Totals across all PVP players
    f"\nTotal Players: {len(pvp):,}",
    f"Total Kills: {pvp['kills'].sum():,}",
    f"Total Deaths: {pvp['deaths'].sum():,}",
    f"Total Arrows Shot: {pvp['arrows_shot'].sum():,}",
    f"Total Arrows Hit: {pvp['arrows_hit'].sum():,}",
    f"Total Damage Dealt: {pvp['damage_dealt'].sum():,.0f} ♥",
    # Per-player averages.
    # NOTE(review): the accuracy mean here is not restricted to values <= 100, unlike
    # the "Average Accuracy" printed in section 4 - confirm which one is intended.
    f"\nAverage K/D Ratio: {pvp['kd_ratio'].mean():.3f}",
    f"Average Arrow Accuracy: {pvp['arrow_accuracy'].mean():.2f}%",
    f"Average Killstreak: {pvp['killstreak'].mean():.1f}",
    "\nDetailed Statistics:",
    str(pvp[detail_cols].describe()),
    banner,
]
print("\n".join(report_lines))