# Student Performance Analysis

A detailed analysis of student performance metrics across educational institutions in Bangladesh.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from pathlib import Path

# Set plotting style.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and removed in
# later releases, where the bundled seaborn styles carry a 'seaborn-v0_8' prefix.
# Use whichever name this matplotlib installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

# Load the cleaned student dataset (path is relative to the notebook's directory).
data_path = Path('../processed_data/cleaned/cleaned_student_data.csv')
df = pd.read_csv(data_path)

## 1. Overall Performance Metrics

In [None]:
def calculate_performance_metrics(data, passing_threshold=2.0, excellence_threshold=4.0):
    """Calculate key performance metrics from the ``gpa`` column.

    Args:
        data: DataFrame containing a numeric ``gpa`` column.
        passing_threshold: Minimum GPA counted as passing (default 2.0).
        excellence_threshold: Minimum GPA counted as excellent (default 4.0).

    Returns:
        pd.Series indexed by ``average_gpa``, ``median_gpa``, ``std_gpa``,
        ``passing_rate`` and ``excellence_rate``. The two rates are
        percentages of the students that have a recorded GPA.
    """
    # Drop missing GPAs once so the rates share a denominator with
    # mean/median/std, which already skip NaN. Without this, NaN rows compare
    # False against the thresholds and are silently counted as failing.
    gpa = data['gpa'].dropna()
    metrics = {
        'average_gpa': gpa.mean(),
        'median_gpa': gpa.median(),
        'std_gpa': gpa.std(),
        'passing_rate': (gpa >= passing_threshold).mean() * 100,
        'excellence_rate': (gpa >= excellence_threshold).mean() * 100,
    }
    return pd.Series(metrics)

# Summarise the loaded dataset and display the heading followed by the metrics.
overall_metrics = calculate_performance_metrics(df)
print("Overall Performance Metrics:", overall_metrics, sep="\n")

## 2. Performance Trends by Subject

In [None]:
# Subject-wise score columns this analysis expects to find in the dataset.
subjects = ['Mathematics', 'Science', 'English', 'Bangla', 'Social_Science']

# Restrict the analysis to the subjects actually present: selecting a missing
# column from the DataFrame raises KeyError and would abort the whole cell.
available_subjects = [subject for subject in subjects if subject in df.columns]
missing_subjects = [subject for subject in subjects if subject not in df.columns]
if missing_subjects:
    print(f"Skipping subjects not found in the data: {missing_subjects}")

# Per-subject mean and standard deviation (empty Series when nothing is available).
subject_means = df[available_subjects].mean()
subject_stds = df[available_subjects].std()

if available_subjects:
    # Plot mean scores with one-standard-deviation error bars.
    plt.figure(figsize=(12, 6))
    plt.errorbar(available_subjects, subject_means, yerr=subject_stds, fmt='o')
    plt.title('Average Performance by Subject')
    plt.xticks(rotation=45)
    plt.ylabel('Average Score')
    plt.show()

## 3. Performance Distribution Analysis

In [None]:
def analyze_performance_distribution(data):
    """Bucket students into GPA bands, plot the band counts and return them.

    Adds (or overwrites) a ``performance_category`` column on ``data`` in
    place, so the caller's DataFrame is modified.

    Args:
        data: DataFrame with a numeric ``gpa`` column.

    Returns:
        pd.Series of student counts per performance category, as produced by
        ``value_counts`` (ordered by frequency, not by band). Rows whose GPA
        matches no band, such as a missing GPA, are counted under 'Unknown'.
    """
    gpa = data['gpa']

    # Bands are mutually exclusive; np.select takes the first matching condition.
    conditions = [
        (gpa >= 4.0),
        (gpa >= 3.5) & (gpa < 4.0),
        (gpa >= 3.0) & (gpa < 3.5),
        (gpa >= 2.0) & (gpa < 3.0),
        (gpa < 2.0)
    ]
    categories = ['Excellent', 'Very Good', 'Good', 'Average', 'Needs Improvement']

    # np.select fills unmatched rows with the integer 0 by default. Mixed with
    # string labels that either fails dtype resolution or yields a stray '0'
    # category, depending on the NumPy version, so give an explicit string.
    data['performance_category'] = np.select(conditions, categories, default='Unknown')

    # Plot distribution; 'Unknown' is deliberately left out of the displayed order.
    plt.figure(figsize=(10, 6))
    sns.countplot(data=data, x='performance_category', order=categories)
    plt.title('Distribution of Performance Categories')
    plt.xticks(rotation=45)
    plt.ylabel('Number of Students')
    plt.show()

    return data['performance_category'].value_counts()

# Categorise every student, then display the heading followed by the counts.
performance_distribution = analyze_performance_distribution(df)
print("\nPerformance Distribution:", performance_distribution, sep="\n")

## 4. Factors Affecting Performance

In [None]:
def analyze_performance_factors(data):
    """Plot and test how attendance and division relate to GPA.

    Shows a regression plot of ``attendance_rate`` against ``gpa``, prints
    their correlation, shows a box plot of ``gpa`` per ``division`` and prints
    a one-way ANOVA across divisions.

    Args:
        data: DataFrame with ``attendance_rate``, ``gpa`` and ``division``
            columns.

    Returns:
        None. Results are printed and plotted.
    """
    # Attendance vs Performance
    plt.figure(figsize=(10, 6))
    sns.regplot(data=data, x='attendance_rate', y='gpa')
    plt.title('Attendance Rate vs GPA')
    plt.show()

    # Calculate correlation
    correlation = data['attendance_rate'].corr(data['gpa'])
    print(f"Correlation between attendance and GPA: {correlation:.3f}")

    # Regional variations
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=data, x='division', y='gpa')
    plt.title('GPA Distribution by Division')
    plt.xticks(rotation=45)
    plt.show()

    # Perform ANOVA test for regional differences.
    # Rows missing either value are dropped first: a NaN GPA makes the
    # F-statistic NaN, and a NaN division would produce an empty group.
    valid = data.dropna(subset=['division', 'gpa'])
    division_groups = [
        scores for _, scores in valid.groupby('division')['gpa'] if len(scores) > 0
    ]

    print("\nANOVA test for regional differences:")
    if len(division_groups) < 2:
        # A one-way ANOVA needs at least two groups to compare.
        print("Skipped: fewer than two divisions have GPA data.")
        return

    f_stat, p_value = stats.f_oneway(*division_groups)
    print(f"F-statistic: {f_stat:.3f}")
    print(f"p-value: {p_value:.3f}")

# Run the attendance and division analysis on the loaded dataset (plots and prints results).
analyze_performance_factors(df)

## 5. Performance Gap Analysis

In [None]:
def analyze_performance_gaps(data):
    """Print GPA gaps between genders and, when available, location types.

    Prints the per-gender mean and standard deviation of ``gpa``, then an
    independent two-sample t-test between rows labelled 'Male' and 'Female'.
    If a ``location_type`` column exists, its per-group mean and standard
    deviation are printed as well.

    Args:
        data: DataFrame with ``gender`` and ``gpa`` columns, and optionally a
            ``location_type`` column.

    Returns:
        None. Results are printed.
    """
    # Gender gap
    gender_stats = data.groupby('gender')['gpa'].agg(['mean', 'std'])
    print("Performance by Gender:")
    print(gender_stats)

    # Perform t-test for gender gap.
    # Missing GPAs are dropped because a single NaN turns the statistic and
    # the p-value into NaN.
    male_scores = data.loc[data['gender'] == 'Male', 'gpa'].dropna()
    female_scores = data.loc[data['gender'] == 'Female', 'gpa'].dropna()
    print("\nT-test for gender gap:")
    if len(male_scores) < 2 or len(female_scores) < 2:
        # With fewer than two observations a group has no sample variance,
        # so the test result would be NaN.
        print("Not enough GPA observations in both groups to run the test.")
    else:
        t_stat, p_value = stats.ttest_ind(male_scores, female_scores)
        print(f"t-statistic: {t_stat:.3f}")
        print(f"p-value: {p_value:.3f}")

    # Urban-Rural gap
    if 'location_type' in data.columns:
        location_stats = data.groupby('location_type')['gpa'].agg(['mean', 'std'])
        print("\nPerformance by Location:")
        print(location_stats)

# Run the gender and (if present) location gap analysis on the loaded dataset.
analyze_performance_gaps(df)

## 6. Recommendations Based on Analysis

Based on the analysis above, we can make the following recommendations:

1. Performance Improvement:
   - Focus areas for different performance categories
   - Subject-specific interventions
   - Attendance improvement strategies

2. Regional Support:
   - Additional resources for underperforming regions
   - Best practice sharing between regions
   - Targeted intervention programs

3. Gap Reduction:
   - Strategies to address gender gaps
   - Urban-rural equity measures
   - Resource allocation recommendations

4. Monitoring and Evaluation:
   - Key metrics to track
   - Regular assessment schedule
   - Performance indicator framework