In [None]:
import pandas as pd
import numpy as np

In [2]:
def analyze_student_data(csv_file):
    """Load a student-records CSV, print a summary report, and return the data.

    The printed report contains a preview of the data, descriptive statistics
    for the score columns, the three highest-scoring students, the number of
    students below the overall average, the mean score per gender, and the
    letter-grade distribution.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The loaded DataFrame with two added columns, or ``None`` if the file
        could not be read:

        * ``average_score`` - row-wise mean of the four score columns,
          rounded to 2 decimals. Missing individual scores are skipped by
          ``DataFrame.mean``; the value is NaN only when all four are missing.
        * ``letter_grade`` - 'A'-'F' derived from ``average_score``, or
          ``None`` when ``average_score`` is missing.

    Raises:
        KeyError: If a column required by the report is absent from the file.
    """
    # Load the data. Any read failure is reported and signalled with None.
    try:
        df = pd.read_csv(csv_file)
        print(f"Successfully loaded data with {df.shape[0]} students and {df.shape[1]} columns.")
    except Exception as e:
        print(f"Error loading file: {e}")
        return None

    # Basic data exploration
    print("\n=== BASIC DATA EXPLORATION ===")
    print("First few records:")
    print(df.head())

    print("\nColumn names:")
    print(df.columns.tolist())

    score_columns = ['math_score', 'science_score', 'english_score', 'history_score']
    identity_columns = ['student_id', 'first_name', 'last_name']

    # Validate up front so a missing column fails with one clear message
    # instead of an opaque KeyError halfway through the printed report.
    required_columns = score_columns + identity_columns + ['gender']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required column(s): {missing_columns}")

    # Calculate average scores (NaN entries are skipped by DataFrame.mean)
    df['average_score'] = df[score_columns].mean(axis=1).round(2)

    # Add letter grades
    def assign_grade(score):
        # NaN compares False against every threshold, so without this guard
        # a student with no recorded scores would silently be graded 'F'.
        if pd.isna(score):
            return None
        if score >= 90:
            return 'A'
        elif score >= 80:
            return 'B'
        elif score >= 70:
            return 'C'
        elif score >= 60:
            return 'D'
        else:
            return 'F'

    df['letter_grade'] = df['average_score'].apply(assign_grade)

    # Statistical summary
    print("\n=== STATISTICAL SUMMARY ===")
    print(df[score_columns + ['average_score']].describe())

    # Top students
    print("\n=== TOP STUDENTS ===")
    top_students = df.sort_values('average_score', ascending=False).head(3)
    print(top_students[identity_columns + ['average_score', 'letter_grade']])

    # Students below average
    average = df['average_score'].mean()
    below_avg = df[df['average_score'] < average]
    print(f"\nNumber of students below average ({average:.2f}): {below_avg.shape[0]}")

    # Performance by gender
    print("\n=== PERFORMANCE BY GENDER ===")
    gender_performance = df.groupby('gender')['average_score'].mean().round(2)
    print(gender_performance)

    # Grade distribution. value_counts() drops ungraded students, while the
    # percentage is taken over all students, so the percentages add up to
    # less than 100 when some students have no grade.
    print("\n=== GRADE DISTRIBUTION ===")
    grade_counts = df['letter_grade'].value_counts()
    grade_percentage = (grade_counts / len(df) * 100).round(1)
    grade_df = pd.DataFrame({'Count': grade_counts, 'Percentage': grade_percentage})
    print(grade_df)

    return df

# Run the analysis on the sample records file. The call prints the report and
# returns the DataFrame it built (or None if the file could not be loaded).
student_data = analyze_student_data('../files/students_record.csv')

Successfully loaded data with 20 students and 11 columns.

=== BASIC DATA EXPLORATION ===
First few records:
  student_id first_name last_name gender  age  grade_level  math_score  \
0     STU001       Emma   Johnson      F   16           10        92.0   
1     STU002       Liam     Smith      M   15           10        78.0   
2     STU003     Olivia  Williams      F   16           10        95.0   
3     STU004       Noah     Brown      M   17           11        45.0   
4     STU005        Ava     Jones      F   16           10        82.0   

   science_score  english_score  history_score  attendance_rate  
0           88.0           95.0             78             0.97  
1           82.0           65.0             91             0.89  
2           96.0            NaN             94             0.98  
3           58.0           72.0             63             0.75  
4           79.0           85.0             80             0.92  

Column names:
['student_id', 'first_name', 'last_