# Volunteer Performance Dataset Generator & Analyzer

This notebook provides an interactive interface for generating and analyzing volunteer performance datasets. It combines the functionality of both the dataset generator and analyzer tools into a single, easy-to-use interface.

## Features
- **Dataset Generation**: Create realistic volunteer datasets with customizable parameters
- **Statistical Analysis**: Analyze performance metrics and correlations
- **Promotion Criteria**: Get data-driven suggestions for volunteer promotion criteria
- **Interactive Visualizations**: Explore data with charts and graphs
- **Filtering Options**: Focus analysis on specific score ranges or performance levels


## Setup and Imports


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): with no category or module given, this filter hides every
# warning for the rest of the session, including deprecation warnings raised
# by the plotting and data libraries. Narrow it if warnings are needed while
# debugging.
warnings.filterwarnings('ignore')

# Set up plotting style: use matplotlib's 'default' style sheet and make
# "husl" the active seaborn color palette.
plt.style.use('default')
sns.set_palette("husl")

# Import our custom classes (project-local modules; they must be importable
# from the notebook's working directory or the Python path).
from volunteer_dataset_generator import VolunteerDatasetGenerator, Task, Volunteer
from volunteer_analyzer import VolunteerAnalyzer

# Reaching these lines means every import above succeeded.
print("✅ All libraries imported successfully!")
print("📊 Ready to generate and analyze volunteer datasets")


## Part 1: Dataset Generation

### Configure Dataset Parameters

Customize the dataset generation parameters in the cell below. The defaults reproduce the originally requested configuration; change any of them to suit your own scenario.


In [None]:
# Dataset Configuration (you may change these values)
DATASET_SIZE = 734
HIGH_PERFORMERS = 0.45  # 45% high performers
LOW_PERFORMERS = 0.35   # 35% low performers
# Average performers take whatever share is left (20% with the defaults).
# The subtraction is rounded because plain float arithmetic leaves a residue
# (1.0 - 0.45 - 0.35 evaluates to 0.20000000000000007), which would make the
# three shares add up to slightly more than 1.0.
AVERAGE_PERFORMERS = round(1.0 - HIGH_PERFORMERS - LOW_PERFORMERS, 10)
MIN_TASKS = 2
MAX_TASKS = 20
RANDOM_SEED = 69  # For reproducible results
OUTPUT_FILENAME = 'volunteers.csv'

# Fail fast on an impossible configuration instead of generating a skewed
# dataset. The shares sum to 1.0 by construction, so a negative share is the
# only way any of them can fall outside the 0..1 range.
if min(HIGH_PERFORMERS, AVERAGE_PERFORMERS, LOW_PERFORMERS) < 0:
    raise ValueError(
        "HIGH_PERFORMERS and LOW_PERFORMERS must each be >= 0 and together "
        "must not exceed 1.0"
    )
if MIN_TASKS > MAX_TASKS:
    raise ValueError("MIN_TASKS must not be greater than MAX_TASKS")

# Echo the effective configuration so it is recorded with the cell output.
print("📋 Dataset Configuration:")
print(f"   Size: {DATASET_SIZE} volunteers")
print(f"   High Performers: {HIGH_PERFORMERS*100:.0f}%")
print(f"   Average Performers: {AVERAGE_PERFORMERS*100:.0f}%")
print(f"   Low Performers: {LOW_PERFORMERS*100:.0f}%")
print(f"   Tasks per volunteer: {MIN_TASKS}-{MAX_TASKS}")
print(f"   Output file: {OUTPUT_FILENAME}")
print(f"   Random seed: {RANDOM_SEED}")


### Generate the Dataset


In [None]:
# Seeded generator, so repeated runs with the same RANDOM_SEED are meant to
# produce the same dataset.
generator = VolunteerDatasetGenerator(seed=RANDOM_SEED)

# Share of each performance tier, keyed by the tier names passed to the
# generator.
distribution = dict(
    high_performer=HIGH_PERFORMERS,
    average_performer=AVERAGE_PERFORMERS,
    low_performer=LOW_PERFORMERS,
)

print("🔄 Generating dataset...")

# Build the volunteers, then write them to OUTPUT_FILENAME.
task_bounds = (MIN_TASKS, MAX_TASKS)
volunteers = generator.generate_dataset(
    dataset_size=DATASET_SIZE,
    distribution=distribution,
    task_range=task_bounds,
)
generator.save_to_csv(volunteers, OUTPUT_FILENAME)

# Completion report.
print(
    "✅ Dataset generated successfully!",
    f"📁 Saved to: {OUTPUT_FILENAME}",
    f"👥 Total volunteers: {len(volunteers)}",
    sep="\n",
)


### Dataset Statistics Overview


In [None]:
# Let the generator report its own summary of the generated volunteers.
generator.print_statistics(volunteers)

# Mirror the volunteers into a DataFrame: one row per volunteer, one column
# per attribute listed here (column name -> attribute read from the object).
column_sources = {
    'Name': 'name',
    'Total_Score': 'total_score',
    'Tasks_Completed': 'tasks_completed',
    'Average_Mark': 'average_mark',
    'Average_Rating': 'average_rating',
}
records = [
    {column: getattr(v, attribute) for column, attribute in column_sources.items()}
    for v in volunteers
]
df = pd.DataFrame(records)

# First ten rows, followed by pandas' descriptive statistics.
print("\n📊 Dataset Preview:")
display(df.head(10))

print("\n📈 Quick Statistics:")
display(df.describe())


### Visualize Dataset Distribution


In [None]:
# 2x2 grid of histograms, one per numeric column of df.
# Each entry: (df column, trace name, bin count, bar color, grid row, grid col)
histogram_panels = [
    ('Total_Score', 'Total Score', 30, 'lightblue', 1, 1),
    ('Tasks_Completed', 'Tasks Completed', 20, 'lightgreen', 1, 2),
    ('Average_Mark', 'Average Mark', 25, 'lightcoral', 2, 1),
    ('Average_Rating', 'Average Rating', 25, 'lightyellow', 2, 2),
]

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        'Total Score Distribution',
        'Tasks Completed Distribution',
        'Average Mark Distribution',
        'Average Rating Distribution',
    ),
    # One spec dict per cell; none of the panels uses a secondary y-axis.
    specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(2)],
)

# Add the traces in the same order as the panel list above.
for column, trace_name, bin_count, bar_color, grid_row, grid_col in histogram_panels:
    histogram = go.Histogram(
        x=df[column],
        name=trace_name,
        nbinsx=bin_count,
        marker_color=bar_color,
        opacity=0.7,
    )
    fig.add_trace(histogram, row=grid_row, col=grid_col)

# The subplot titles already identify each panel, so the legend is hidden.
fig.update_layout(
    height=600,
    showlegend=False,
    title_text="Dataset Distribution Overview",
)
fig.show()

print("📊 Dataset generation complete! Moving to analysis...")


## Part 2: Dataset Analysis

### Load and Initialize Analyzer


In [None]:
# Load the CSV written by the generation step into an analyzer.
print("🔍 Initializing analyzer...")

# The 'above' filter with min_score=0 is intended to keep essentially the
# whole dataset.
# NOTE(review): whether a volunteer scoring exactly 0 is kept depends on how
# VolunteerAnalyzer compares against min_score (strict or inclusive) - confirm.
analyzer_options = {
    'csv_file': OUTPUT_FILENAME,
    'score_filter': 'above',
    'min_score': 0,
}
analyzer = VolunteerAnalyzer(**analyzer_options)

print("✅ Analyzer loaded successfully!")
print(f"📊 Dataset: {len(analyzer.df)} volunteers loaded for analysis")


### Basic Statistical Analysis


In [None]:
# Run the analyzer's basic statistics report on the loaded dataset.
# The return value is not captured, so the report is presumably printed by
# the method itself - see VolunteerAnalyzer.basic_statistics.
analyzer.basic_statistics()

### Percentile Analysis


In [None]:
# Run the analyzer's percentile report on the loaded dataset.
# The return value is not captured, so the report is presumably printed by
# the method itself - see VolunteerAnalyzer.percentile_analysis.
analyzer.percentile_analysis()

### Score-Only Promotion Criteria

The analyzer now uses a simplified, score-only approach for promotion criteria. This makes decisions clearer and easier to implement.


In [None]:
# Get promotion criteria suggestions. No arguments are passed, so the
# analyzer's own defaults apply.
# NOTE(review): the defaults are reportedly "top 10%, percentile method" -
# confirm against VolunteerAnalyzer.suggest_promotion_criteria.
analyzer.suggest_promotion_criteria()

### Flexible Promotion Threshold Analysis

Test different percentile targets to find the right promotion threshold for your organization!


In [None]:
# Show the promotion thresholds for three target percentiles, from the most
# selective to the most inclusive.
print("🎯 TESTING DIFFERENT PERCENTILE TARGETS")
print("=" * 50)

# (heading printed before the report, target percentile passed to the analyzer)
percentile_scenarios = [
    ("\n1️⃣ TOP 5% PROMOTION CRITERIA:", 5.0),    # very selective
    ("\n2️⃣ TOP 15% PROMOTION CRITERIA:", 15.0),  # moderate
    ("\n3️⃣ TOP 25% PROMOTION CRITERIA:", 25.0),  # more inclusive
]

for heading, percentile in percentile_scenarios:
    print(heading)
    analyzer.show_promotion_thresholds(target_percentile=percentile)

### Threshold Comparison Analysis

Compare your chosen threshold with common industry standards:


In [None]:
# Compare one chosen target (top 15%) against a set of reference percentiles.
print("🔍 COMPARING DIFFERENT PROMOTION THRESHOLDS")
print("=" * 60)

print("Comparing top 15% with other common promotion thresholds:")
reference_percentiles = [5, 10, 25, 50]
analyzer.compare_promotion_percentiles(
    target_percentile=15.0,
    common_percentiles=reference_percentiles,
)

# Static reading guide printed after the comparison output.
insight_lines = (
    "\n📊 Key Insights from Method Comparison:",
    "   • Percentile method: Uses relative ranking (most common approach)",
    "   • Median method: Uses middle values as benchmarks (stable, outlier-resistant)",
    "   • Mean method: Uses average values (can be affected by extreme performers)",
    "   • Choose based on your organization's philosophy and data distribution",
)
print(*insight_lines, sep="\n")


### Test Specific Method + Percentile Combinations

Try different percentile targets and comparison sets to find what works best for your organization:


In [None]:
# Walk through two single-threshold reports and one comparison report.
print("🧪 TESTING SPECIFIC SCORE THRESHOLDS")
print("=" * 60)

# Examples 1 and 2: promotion thresholds for a single target percentile each.
threshold_examples = [
    ("\n1️⃣ TOP 20% PROMOTION ANALYSIS:", 20.0),
    ("\n2️⃣ TOP 10% PROMOTION ANALYSIS:", 10.0),
]
for heading, percentile in threshold_examples:
    print(heading)
    analyzer.show_promotion_thresholds(target_percentile=percentile)

# Example 3: a 12% target compared with a list of reference percentiles.
print("\n3️⃣ COMPARING 12% WITH COMMON THRESHOLDS:")
comparison_percentiles = [5, 10, 15, 25]
analyzer.compare_promotion_percentiles(
    target_percentile=12.0,
    common_percentiles=comparison_percentiles,
)

# Static summary of why the score-only approach is used.
benefit_lines = (
    "\n💡 SIMPLIFIED APPROACH BENEFITS:",
    "   🎯 Score-only criteria are easy to understand and implement",
    "   📊 Clear threshold comparisons help make informed decisions",
    "   📈 Focus on total performance rather than individual metrics",
    "   🔄 Quick comparison with industry standards (10%, 25%, 50%)",
)
print(*benefit_lines, sep="\n")


## Part 3: Interactive Analysis

### Custom Criteria Testing

Use the cell below to test your own custom promotion criteria:


In [None]:
# Interactive custom analysis - edit the two settings below and re-run.
TARGET_PERCENTILE = 15.0  # Share of volunteers to target (5, 10, 15, 20, 25, 50, etc.)
CUSTOM_MIN_SCORE = 80     # Manual score threshold to test in the second half

narrow_rule = "=" * 50
wide_rule = "=" * 60

# Part 1: score-only promotion criteria for the chosen percentile.
print(
    "🎛️ INTERACTIVE CUSTOM ANALYSIS",
    narrow_rule,
    f"Target: Top {TARGET_PERCENTILE}% of volunteers",
    narrow_rule,
    sep="\n",
)
print("📊 Score-Only Promotion Analysis:")
analyzer.suggest_promotion_criteria(target_percentile=TARGET_PERCENTILE)

# Part 2: test a fixed, manually chosen score threshold.
print("\n" + wide_rule)
print("MANUAL SCORE THRESHOLD TEST")
print(wide_rule)

print("🧪 Testing Manual Score Threshold:")
print(f"   Minimum Score: {CUSTOM_MIN_SCORE}")

# show_volunteers=False is passed so that only the aggregate result is
# requested (presumably suppressing the per-volunteer listing).
analyzer.test_score_threshold(min_score=CUSTOM_MIN_SCORE, show_volunteers=False)


### Performance Distribution Visualization


In [None]:
# Build a 2x2 matplotlib dashboard from the analyzer's DataFrame:
#   (0, 0) total score vs. tasks completed, colored by average mark
#   (0, 1) average mark vs. average rating, colored by total score
#   (1, 0) total score histogram with 75th/90th/95th percentile markers
#   (1, 1) box plots of all four metrics on one rescaled axis
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Volunteer Performance Analysis Dashboard', fontsize=16, y=1.02)

# Total Score vs Tasks Completed scatter plot
mark_scatter = axes[0, 0].scatter(analyzer.df['Tasks_Completed'], analyzer.df['Total_Score'],
                                  alpha=0.6, c=analyzer.df['Average_Mark'], cmap='viridis')
axes[0, 0].set_xlabel('Tasks Completed')
axes[0, 0].set_ylabel('Total Score')
axes[0, 0].set_title('Total Score vs Tasks Completed\n(Color = Average Mark)')
axes[0, 0].grid(True, alpha=0.3)
# The title says color encodes the average mark, so show the scale for it,
# matching the colorbar on the neighbouring panel.
plt.colorbar(mark_scatter, ax=axes[0, 0], label='Average Mark')

# Average Mark vs Average Rating scatter plot
scatter = axes[0, 1].scatter(analyzer.df['Average_Rating'], analyzer.df['Average_Mark'],
                             alpha=0.6, c=analyzer.df['Total_Score'], cmap='plasma')
axes[0, 1].set_xlabel('Average Rating')
axes[0, 1].set_ylabel('Average Mark')
axes[0, 1].set_title('Average Mark vs Average Rating\n(Color = Total Score)')
axes[0, 1].grid(True, alpha=0.3)
plt.colorbar(scatter, ax=axes[0, 1], label='Total Score')

# Total Score distribution with percentile lines
axes[1, 0].hist(analyzer.df['Total_Score'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
# Add percentile lines
p75 = analyzer.df['Total_Score'].quantile(0.75)
p90 = analyzer.df['Total_Score'].quantile(0.90)
p95 = analyzer.df['Total_Score'].quantile(0.95)
axes[1, 0].axvline(p75, color='orange', linestyle='--', label=f'75th percentile ({p75:.0f})')
axes[1, 0].axvline(p90, color='red', linestyle='--', label=f'90th percentile ({p90:.0f})')
axes[1, 0].axvline(p95, color='darkred', linestyle='--', label=f'95th percentile ({p95:.0f})')
axes[1, 0].set_xlabel('Total Score')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Total Score Distribution with Percentiles')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Box plot of all metrics, rescaled so they can share one axis
metrics_data = [
    analyzer.df['Total_Score'] / 10,    # Scale down for comparison
    analyzer.df['Average_Mark'],
    analyzer.df['Average_Rating'] + 2,  # Shifted by +2 (presumably so negative ratings plot above zero)
    analyzer.df['Tasks_Completed']
]
axes[1, 1].boxplot(metrics_data)
# Label the boxes through the axis instead of boxplot's `labels=` keyword:
# that keyword is deprecated in Matplotlib 3.9 (renamed `tick_labels`), and
# setting the tick labels directly works on old and new versions alike.
axes[1, 1].set_xticklabels(['Total Score\n(÷10)', 'Avg Mark', 'Avg Rating\n(+2)', 'Tasks'])
axes[1, 1].set_title('Performance Metrics Distribution')
axes[1, 1].set_ylabel('Scaled Values')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("📊 Performance dashboard complete!")


### Top Performers Analysis


In [None]:
def _print_group_summary(group):
    """Print size, score range and per-metric means for a slice of volunteers."""
    scores = group['Total_Score']
    print(f"Number of volunteers: {len(group)}")
    print(f"Score range: {scores.min():.0f} - {scores.max():.0f}")
    print(f"Average score: {scores.mean():.1f}")
    print(f"Average tasks: {group['Tasks_Completed'].mean():.1f}")
    print(f"Average mark: {group['Average_Mark'].mean():.2f}")
    print(f"Average rating: {group['Average_Rating'].mean():.2f}")


# Slice out the highest scorers. The group size is the row count times the
# fraction, truncated to an integer.
volunteer_count = len(analyzer.df)
top_10_percent = analyzer.df.nlargest(int(volunteer_count * 0.1), 'Total_Score')
top_25_percent = analyzer.df.nlargest(int(volunteer_count * 0.25), 'Total_Score')

section_rule = "=" * 50

print("🏆 TOP 10% PERFORMERS ANALYSIS")
print(section_rule)
_print_group_summary(top_10_percent)

print("\n⭐ TOP 25% PERFORMERS ANALYSIS")
print(section_rule)
_print_group_summary(top_25_percent)

# Ranked list of the ten highest individual scores.
print("\n🎯 TOP 10 INDIVIDUAL PERFORMERS")
print(section_rule)
top_10_individuals = analyzer.df.nlargest(10, 'Total_Score')
for rank, (_, row) in enumerate(top_10_individuals.iterrows(), start=1):
    print(f"{rank:2d}. {row['Name']:<25} | Score: {row['Total_Score']:3.0f} | "
          f"Tasks: {row['Tasks_Completed']:2.0f} | Mark: {row['Average_Mark']:.2f} | "
          f"Rating: {row['Average_Rating']:4.2f}")


## Part 4: Advanced Analysis Options

### Score-Filtered Analysis

Analyze only volunteers above a certain score threshold:


In [None]:
# Re-run the analysis on the subset of volunteers selected by a score filter.
# NOTE(review): the messages below say ">=", while the filter is named
# 'above' - confirm whether VolunteerAnalyzer treats the bound as inclusive.
SCORE_THRESHOLD = 60

print(f"🔍 ANALYZING VOLUNTEERS WITH SCORES >= {SCORE_THRESHOLD}")
print("=" * 60)

# Second analyzer over the same CSV, restricted by SCORE_THRESHOLD.
filtered_analyzer = VolunteerAnalyzer(
    csv_file=OUTPUT_FILENAME,
    score_filter='above',
    min_score=SCORE_THRESHOLD,
)

if len(filtered_analyzer.df) == 0:
    # Nothing passed the filter, so there is nothing to analyze.
    print(f"❌ No volunteers found with scores >= {SCORE_THRESHOLD}")
    print("💡 Try lowering the threshold or generating a different dataset")
else:
    print("\n📊 Filtered Dataset Statistics:")
    filtered_analyzer.basic_statistics()

    print("\n🎯 Promotion Criteria for High Performers (Top 15%):")
    filtered_analyzer.suggest_promotion_criteria(target_percentile=15.0)

    print("\n🔍 Testing Different Thresholds for High Performers:")
    filtered_analyzer.compare_promotion_percentiles(
        target_percentile=15.0,
        common_percentiles=[5, 10, 25],
    )


### Export Analysis Results


In [None]:
# Collect headline figures for the analyzed dataset into a metric -> value map.
total_scores = analyzer.df['Total_Score']
summary_stats = {
    'Dataset Size': len(analyzer.df),
    'Average Total Score': total_scores.mean(),
    'Median Total Score': total_scores.median(),
    'Score Standard Deviation': total_scores.std(),
    'Average Tasks per Volunteer': analyzer.df['Tasks_Completed'].mean(),
    'Average Mark': analyzer.df['Average_Mark'].mean(),
    'Average Rating': analyzer.df['Average_Rating'].mean(),
    '75th Percentile Score': total_scores.quantile(0.75),
    '90th Percentile Score': total_scores.quantile(0.90),
    '95th Percentile Score': total_scores.quantile(0.95),
}

# Two-column table (Metric, Value) with the values rounded to two decimals.
summary_df = pd.DataFrame(
    list(summary_stats.items()),
    columns=['Metric', 'Value'],
).assign(Value=lambda table: table['Value'].round(2))

print("📋 ANALYSIS SUMMARY REPORT")
print("=" * 40)
display(summary_df)

# Write the summary table to disk.
summary_filename = 'analysis_summary.csv'
summary_df.to_csv(summary_filename, index=False)
print(f"\n💾 Summary saved to: {summary_filename}")

# Write the top-25% slice to disk. top_25_percent comes from the
# "Top Performers Analysis" cell, which must have been run first.
top_performers_filename = 'top_performers.csv'
top_25_percent.to_csv(top_performers_filename, index=False)
print(f"💾 Top 25% performers saved to: {top_performers_filename}")

print("\n✅ Analysis complete! All results have been saved.")