In [None]:
# ================================================================
# GENERATE COMPREHENSIVE SCATTER PLOT DATA FOR REACT FRONTEND
# ================================================================

import json
import os
import random

# Announce this step with a framed banner (three lines, one print call).
print("="*60, "GENERATING COMPREHENSIVE SCATTER PLOT DATA", "="*60, sep="\n")

# Seed NumPy's global generator and the stdlib generator so that any
# random sampling performed afterwards is repeatable between runs.
np.random.seed(42)
random.seed(42)

# Function to safely convert numpy types to native Python types
def convert_to_native_types(obj):
    """Recursively replace NumPy values with native Python equivalents.

    The result is intended to be passed to ``json.dump``, which cannot
    encode NumPy scalars or arrays.

    Args:
        obj: Any value. NumPy booleans, integers, floats and arrays are
            converted; dicts, lists and tuples are walked recursively
            (dict keys are converted as well as values).

    Returns:
        ``bool``/``int``/``float`` for NumPy scalars, a (nested) list for
        an ``ndarray``, a new container of the same kind for dict, list
        and tuple inputs, and ``obj`` itself for everything else.
    """
    # np.bool_ is neither an np.integer nor a Python bool, so it needs
    # its own branch; json cannot encode it otherwise.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, dict):
        # Keys are converted too: json rejects NumPy integer keys.
        return {
            convert_to_native_types(key): convert_to_native_types(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_native_types(item) for item in obj]
    if isinstance(obj, tuple):
        # Keep the tuple type so callers that relied on it are unaffected.
        return tuple(convert_to_native_types(item) for item in obj)
    return obj

# Helpers and entry point for building the scatter plot payload
def _age_category(age):
    """Return the age-bracket label attached to age-vs-potential points."""
    if age <= 21:
        return "Muda"
    if age <= 28:
        return "Prima"
    if age <= 33:
        return "Berpengalaman"
    return "Veteran"


def _player_record(player, x_column):
    """Build one chart point from a player row.

    Args:
        player: A row (``pandas.Series``) as yielded by ``DataFrame.iterrows``.
        x_column: Column whose integer value is plotted on the x axis; the
            y axis is always ``potential``.

    Returns:
        dict with the keys ``value``, ``name``, ``age``, ``overall``,
        ``potential``, ``position`` and ``nationality``.
    """
    full_name = player['full_name']
    nationality = player['nationality']
    potential = int(player['potential'])
    return {
        "value": [int(player[x_column]), potential],
        # Fall back to the row label when the name is missing.
        "name": str(full_name) if pd.notna(full_name) else f"Player_{player.name}",
        "age": int(player['age']),
        "overall": int(player['overall_rating']),
        "potential": potential,
        "position": str(player['positions']),
        "nationality": str(nationality) if pd.notna(nationality) else "Unknown",
    }


def _sample_by_primary_position(frame, positions, n_samples, min_overall=None, max_overall=None):
    """Sample up to ``n_samples`` rows whose first listed position is in ``positions``.

    The ``positions`` column may list several comma-separated positions
    (e.g. ``"ST,CF"``). Matching the whole string against single position
    codes would drop every multi-position player, so only the first entry
    is compared. Rows can optionally be restricted to an ``overall_rating``
    range. Sampling uses ``random_state=42``.
    """
    primary = frame['positions'].astype(str).str.split(',').str[0].str.strip()
    selected = frame[primary.isin(positions)]

    if min_overall is not None:
        selected = selected[selected['overall_rating'] >= min_overall]
    if max_overall is not None:
        selected = selected[selected['overall_rating'] <= max_overall]

    # If fewer rows are available than requested, take all of them.
    n_samples = min(n_samples, len(selected))
    if n_samples == 0:
        return selected.iloc[0:0]
    return selected.sample(n=n_samples, random_state=42)


def create_comprehensive_scatter_data():
    """Create scatter plot data by sampling players from the global ``df``.

    Reads the module-level DataFrame ``df``, which must provide the columns
    ``overall_rating``, ``potential``, ``age``, ``positions``,
    ``nationality`` and ``full_name``. Rows missing any of the first five
    are discarded. Players are sampled per position group (by their first
    listed position) and a separate "prospects" group is sampled from
    players aged 23 or younger with potential >= 80 and at least 5 points
    between potential and overall rating. Progress is printed to stdout.

    Returns:
        dict with two entries:
            ``overallVsPotential``: title/axis labels plus ``data``, a dict
                mapping each non-empty group name to a list of points.
            ``ageVsPotential``: title/axis labels plus ``data``, a flat
                list of points that additionally carry a ``category``.
    """
    # Clean data - remove rows with missing critical values
    required_columns = ['overall_rating', 'potential', 'age', 'positions', 'nationality']
    df_clean = df.dropna(subset=required_columns)

    print(f"Original dataset: {len(df)} players")
    print(f"Clean dataset: {len(df_clean)} players")

    # Define position groupings
    position_groups = {
        'forwards': ['ST', 'CF', 'LW', 'RW', 'LF', 'RF'],
        'midfielders': ['CM', 'CDM', 'CAM', 'LM', 'RM'],
        'defenders': ['CB', 'LB', 'RB', 'LWB', 'RWB'],
        'goalkeepers': ['GK'],
    }

    # Upper bound on the number of points per group
    sample_sizes = {
        'forwards': 800,
        'midfielders': 1000,
        'defenders': 600,
        'goalkeepers': 200,
        'prospects': 400,  # Young players with high potential
    }

    # Create samples for each position group
    sampled_data = {}
    for group_name, positions in position_groups.items():
        print(f"Sampling {group_name} (positions: {positions})...")
        sampled_df = _sample_by_primary_position(df_clean, positions, sample_sizes[group_name])
        sampled_data[group_name] = sampled_df
        print(f"  Sampled: {len(sampled_df)} players")

    # Special case: prospects (young players with high potential)
    print("Sampling prospects (young high-potential players)...")
    is_prospect = (
        (df_clean['age'] <= 23)
        & (df_clean['potential'] >= 80)
        & (df_clean['potential'] - df_clean['overall_rating'] >= 5)
    )
    prospects_df = df_clean[is_prospect]
    if len(prospects_df) > sample_sizes['prospects']:
        prospects_df = prospects_df.sample(n=sample_sizes['prospects'], random_state=42)
    sampled_data['prospects'] = prospects_df
    print(f"  Sampled: {len(prospects_df)} prospects")

    # Overall vs Potential: one series per non-empty group
    overall_vs_potential_data = {
        "title": "Rating Keseluruhan vs Rating Potensial",
        "xAxis": "Rating Keseluruhan",
        "yAxis": "Rating Potensial",
        "data": {},
    }
    for group_name, group_df in sampled_data.items():
        if len(group_df) > 0:
            overall_vs_potential_data["data"][group_name] = [
                _player_record(player, 'overall_rating')
                for _, player in group_df.iterrows()
            ]

    # Age vs Potential: all sampled players in a single array
    age_vs_potential_data = {
        "title": "Hubungan Usia dengan Rating Potensial",
        "xAxis": "Usia (tahun)",
        "yAxis": "Rating Potensial",
        "data": [],
    }

    # Original row labels are kept (no ignore_index) so the "Player_<label>"
    # fallback name is the same in both charts.
    all_sampled = pd.concat(list(sampled_data.values()))
    # A prospect can also have been drawn in a position group; plot each
    # player once. Row labels only identify players when they are unique,
    # so skip de-duplication otherwise.
    if df_clean.index.is_unique:
        all_sampled = all_sampled[~all_sampled.index.duplicated(keep='first')]

    for _, player in all_sampled.iterrows():
        record = _player_record(player, 'age')
        record["category"] = _age_category(record["age"])
        age_vs_potential_data["data"].append(record)

    # Combine both scatter plots
    return {
        "overallVsPotential": overall_vs_potential_data,
        "ageVsPotential": age_vs_potential_data,
    }

# Generate comprehensive data
scatter_data = create_comprehensive_scatter_data()

# Convert numpy types to native Python types so json can encode the payload
scatter_data = convert_to_native_types(scatter_data)

# Print summary
print("\n" + "="*50)
print("SCATTER PLOT DATA SUMMARY")
print("="*50)

print("Overall vs Potential groups:")
for group_name, group_data in scatter_data["overallVsPotential"]["data"].items():
    print(f"  {group_name}: {len(group_data)} players")

total_overall_vs_potential = sum(len(group_data) for group_data in scatter_data["overallVsPotential"]["data"].values())
print(f"  TOTAL Overall vs Potential: {total_overall_vs_potential}")

print(f"\nAge vs Potential: {len(scatter_data['ageVsPotential']['data'])} players")

# Save to JSON file for React frontend
output_file = '../front-end/public/scatter_plots.json'
try:
    # Only the write itself is guarded, so an unrelated failure is never
    # reported as a save error.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(scatter_data, f, indent=2, ensure_ascii=False)
except (OSError, TypeError, ValueError) as e:
    # OSError: path/permission problems; TypeError/ValueError: payload that
    # json cannot encode. The cell keeps running either way (best effort).
    print(f"❌ Error saving file: {e}")
else:
    print(f"\n✅ Comprehensive scatter plot data saved to: {output_file}")
    print(f"📁 File size: {os.path.getsize(output_file) / 1024:.1f} KB")

print("\n🎉 Scatter plot data generation completed!")
print("The React frontend will now display much more data points.")

In [None]:
# Summarise each categorical column: number of distinct values, the most
# frequent value with its count, and the ten most frequent values.
for col in available_categorical:
    value_counts = df[col].value_counts()
    descriptive_stats["categorical_summary"][col] = {
        "unique_values": int(df[col].nunique()),
        # An empty count table (e.g. an all-missing column) gets placeholders.
        "most_common": "N/A" if value_counts.empty else str(value_counts.index[0]),
        "most_common_count": 0 if value_counts.empty else int(value_counts.iloc[0]),
        "distribution": {
            str(label): int(count)
            for label, count in value_counts.iloc[:10].items()
        },
    }

# Position distribution: split the comma-separated 'positions' column so
# that every listed position is counted on its own.
if 'positions' in df.columns:
    pos_counts = (
        df['positions']
        .str.split(',')
        .explode()
        .str.strip()
        .value_counts()
    )
    descriptive_stats["categorical_summary"]["positions"] = {
        "unique_positions": int(pos_counts.size),
        "most_common_position": "N/A" if pos_counts.empty else str(pos_counts.index[0]),
        "distribution": {
            str(position): int(count)
            for position, count in pos_counts.iloc[:15].items()
        },
    }

# Nationality distribution: distinct count, most frequent nationality and
# the ten most frequent nationalities with their player counts.
if 'nationality' in df.columns:
    nat_counts = df['nationality'].value_counts()
    descriptive_stats["categorical_summary"]["nationalities"] = {
        "unique_countries": int(df['nationality'].nunique()),
        "most_common_nationality": "N/A" if nat_counts.empty else str(nat_counts.index[0]),
        "top_10_countries": {
            str(country): int(count)
            for country, count in nat_counts.iloc[:10].items()
        },
    }

# Serialize once, before touching the file system: a payload that json
# cannot encode then fails immediately instead of leaving a truncated file
# behind and being reported as a path problem.
stats_json = json.dumps(descriptive_stats, indent=2, ensure_ascii=False)

# Save to JSON file in the frontend public directory
frontend_path = "../front-end/public/descriptive_stats.json"
try:
    with open(frontend_path, 'w', encoding='utf-8') as f:
        f.write(stats_json)
    print(f"✅ Descriptive statistics saved to: {frontend_path}")
except OSError as e:
    # Only file-system errors (missing directory, permissions, ...) can be
    # helped by writing somewhere else.
    print(f"❌ Error saving to frontend path: {e}")
    # Fallback: save in current directory
    with open('descriptive_stats.json', 'w', encoding='utf-8') as f:
        f.write(stats_json)
    print("✅ Descriptive statistics saved to: descriptive_stats.json")

# Print a console summary of what was written: dataset overview first,
# then per-attribute statistics, then the three leading categories.
print("\n📊 DESCRIPTIVE STATISTICS SUMMARY")
print(f"Total Players: {descriptive_stats['overview']['total_players']:,}")
print(f"Total Attributes: {descriptive_stats['overview']['total_attributes']}")
print(f"Numeric Attributes: {descriptive_stats['overview']['numeric_attributes']}")
print(f"Missing Data: {descriptive_stats['overview']['missing_data_percentage']:.2f}%")

print("\n📈 KEY ATTRIBUTE STATISTICS:")
for attr, stats in descriptive_stats["key_statistics"].items():
    # Attribute keys use snake_case; show them as title-cased words.
    print(f"  {attr.replace('_', ' ').title()}:")
    print(f"    Mean: {stats['mean']:.2f} | Median: {stats['median']:.2f}")
    print(f"    Range: {stats['min']:.1f} - {stats['max']:.1f}")
    print(f"    Std Dev: {stats['std']:.2f}")

print("\n🌍 TOP CATEGORIES:")
if 'nationalities' in descriptive_stats["categorical_summary"]:
    top_nations = list(descriptive_stats["categorical_summary"]["nationalities"]["top_10_countries"].items())[:3]
    print("  Top Nationalities: " + ', '.join(f'{k} ({v})' for k, v in top_nations))

if 'positions' in descriptive_stats["categorical_summary"]:
    top_positions = list(descriptive_stats["categorical_summary"]["positions"]["distribution"].items())[:3]
    print("  Top Positions: " + ', '.join(f'{k} ({v})' for k, v in top_positions))

print("\n" + "="*60)