In [None]:
# Final text report: best/worst subject, class-wide figures, and how the
# students' averages spread over four performance tiers.
banner = "=" * 80
subject_averages = statistics['Average']
all_scores = df[subjects].values
student_averages = df['Average_Marks']
n_students = len(df)

print(banner)
print("SUMMARY AND KEY INSIGHTS")
print(banner)
print()

# Best and lowest performing subject share one layout, so print them in a loop.
for headline, row_label in (
    ("✓ Best Performing Subject", subject_averages.idxmax()),
    ("✗ Lowest Performing Subject", subject_averages.idxmin()),
):
    subject_row = statistics.loc[row_label]
    print(f"{headline}: {subject_row['Subject']}")
    print(f"  Average: {subject_row['Average']:.2f}, Max: {subject_row['Maximum']}, Min: {subject_row['Minimum']}")
    print()

# Class average: mean over every (student, subject) score.
class_average = all_scores.mean()
print(f"• Class Average (All Subjects): {class_average:.2f}")
print()

# Overall statistics
print(f"• Highest Individual Score: {all_scores.max()} marks")
print(f"• Lowest Individual Score: {all_scores.min()} marks")
print(f"• Average Student Performance: {student_averages.mean():.2f}")
print()

# Performance distribution: each tier is a boolean mask over the per-student
# averages; the lower bound is inclusive and the upper bound exclusive.
tiers = (
    ("Excellent (≥85)", student_averages >= 85),
    ("Good (75-84)", (student_averages >= 75) & (student_averages < 85)),
    ("Average (65-74)", (student_averages >= 65) & (student_averages < 75)),
    ("Below Average (<65)", student_averages < 65),
)

print("Performance Distribution:")
for tier_label, tier_mask in tiers:
    tier_count = tier_mask.sum()
    print(f"  {tier_label}: {tier_count} students ({tier_count / n_students * 100:.1f}%)")
print()
print(banner)

## 9. Summary and Key Insights

Final analysis and important conclusions from the data.

In [None]:
# Two stacked panels: mean ± standard deviation per subject (top) and
# interquartile range per subject (bottom).
fig, (mean_ax, iqr_ax) = plt.subplots(2, 1, figsize=(12, 10))

# Compute every per-subject statistic once, up front.
subject_columns = [df[name] for name in subjects]
mean_marks = [column.mean() for column in subject_columns]
std_marks = [column.std() for column in subject_columns]
iqr_marks = [column.quantile(0.75) - column.quantile(0.25) for column in subject_columns]

# --- Top panel: average marks with std-dev error bars ---
bar_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
mean_ax.bar(
    subjects, mean_marks, yerr=std_marks, capsize=5, color=bar_palette,
    edgecolor='black', linewidth=1.5, alpha=0.8, error_kw={'linewidth': 2},
)
mean_ax.set_ylabel('Average Marks', fontsize=11, fontweight='bold')
mean_ax.set_title('Average Marks Per Subject (with Standard Deviation)', fontsize=12, fontweight='bold')
mean_ax.set_ylim(0, 100)

# Put each value label just above the top of its error bar.
for position, (mean_value, spread) in enumerate(zip(mean_marks, std_marks)):
    mean_ax.text(position, mean_value + spread + 2, f'{mean_value:.1f}',
                 ha='center', va='bottom', fontweight='bold')

# --- Bottom panel: spread of marks per subject, measured as Q3 - Q1 ---
iqr_ax.bar(subjects, iqr_marks, color='#FFE66D', edgecolor='#FF6B35', linewidth=2, alpha=0.8)
iqr_ax.set_ylabel('Interquartile Range (IQR)', fontsize=11, fontweight='bold')
iqr_ax.set_title('Variability in Performance Per Subject (IQR)', fontsize=12, fontweight='bold')

for position, iqr_value in enumerate(iqr_marks):
    iqr_ax.text(position, iqr_value + 0.3, f'{iqr_value:.1f}',
                ha='center', va='bottom', fontweight='bold')

# Styling shared by both panels: light horizontal grid and slanted subject names.
for panel in (mean_ax, iqr_ax):
    panel.grid(axis='y', alpha=0.3)
    plt.setp(panel.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 8. Subject Performance Analysis

Compare subject difficulty based on average performance and variability (standard deviation and interquartile range).

In [None]:
# One bar per student showing total marks, ordered from highest to lowest.
fig, ax = plt.subplots(figsize=(14, 6))

ranked = df.sort_values('Total_Marks', ascending=False)
totals = ranked['Total_Marks']
positions = range(len(ranked))

# Sample the viridis colormap evenly so the bar colour follows the rank.
palette = plt.cm.viridis(np.linspace(0.3, 0.9, len(ranked)))
ax.bar(positions, totals, color=palette, edgecolor='black', linewidth=0.5)

# Write each total just above its bar.
for position, total in zip(positions, totals):
    ax.text(position, total + 2, f"{int(total)}",
            ha='center', va='bottom', fontsize=8, fontweight='bold')

ax.set_title('Total Marks Comparison Across All Students', fontsize=13, fontweight='bold')
ax.set_xlabel('Students', fontsize=11, fontweight='bold')
ax.set_ylabel('Total Marks (out of 500)', fontsize=11, fontweight='bold')

# Label every bar with the student's name, slanted to avoid overlap.
ax.set_xticks(positions)
ax.set_xticklabels(ranked['Name'], rotation=45, ha='right')
ax.set_ylim(0, 500)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Student-wise Total Marks Comparison

Visualize total marks achieved by each student.

In [None]:
# Compare top 5 students across subjects with a grouped bar chart:
# one group per subject, one bar per student inside each group.
fig, ax = plt.subplots(figsize=(14, 6))

# student_performance is sorted by average marks in descending order,
# so its first five rows are the top five students.
top_5_students = student_performance.head(5)
n_bars = len(top_5_students)
x_pos = np.arange(len(subjects))
width = 0.15

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']

subject_marks = df[subjects]
for idx, (row_label, student_row) in enumerate(top_5_students.iterrows()):
    # Fetch the marks by row label rather than by matching on 'Name'.
    # student_performance was built from df and keeps df's row labels, so this
    # always selects the right student even when two students share a name
    # (a name match would return the first one's marks for both).
    marks = subject_marks.loc[row_label].to_numpy()
    ax.bar(x_pos + (idx * width), marks, width,
           label=student_row['Name'], color=colors[idx], alpha=0.8)

ax.set_xlabel('Subjects', fontsize=11, fontweight='bold')
ax.set_ylabel('Marks', fontsize=11, fontweight='bold')
ax.set_title('Comparison of Top 5 Students Across Subjects', fontsize=13, fontweight='bold')
# Centre each subject tick under its group of bars. Using the actual number of
# bars keeps it centred when the dataset has fewer than five students
# (for five bars this equals the previous offset of width * 2).
ax.set_xticks(x_pos + width * (n_bars - 1) / 2)
ax.set_xticklabels(subjects)
ax.legend(loc='upper left', framealpha=0.9)
ax.set_ylim(0, 100)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Compare Students Using Bar Charts

Create a grouped bar chart comparing the top 5 students' marks across subjects.

In [None]:
# Create a comprehensive subject performance visualization:
# three side-by-side bar charts (average, maximum, minimum marks per subject).
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

averages = [df[subject].mean() for subject in subjects]
maximums = [df[subject].max() for subject in subjects]
minimums = [df[subject].min() for subject in subjects]

# One entry per panel: (statistic name used in the title and y-label, values,
# fill colour, edge colour, format spec for the value labels).
# Averages are shown with one decimal; max/min are printed as they are.
panels = [
    ('Average', averages, 'skyblue', 'navy', '.1f'),
    ('Maximum', maximums, 'lightgreen', 'darkgreen', ''),
    ('Minimum', minimums, 'lightsalmon', 'darkred', ''),
]

# The three panels differ only in the data above, so draw them in one loop
# instead of repeating the same plotting code three times.
for panel_ax, (stat_name, values, fill_color, edge_color, label_format) in zip(axes, panels):
    panel_ax.bar(subjects, values, color=fill_color, edgecolor=edge_color, linewidth=1.5)
    panel_ax.set_title(f'{stat_name} Marks Per Subject', fontsize=12, fontweight='bold')
    panel_ax.set_ylabel(f'{stat_name} Marks', fontsize=10)
    panel_ax.set_ylim(0, 100)
    panel_ax.grid(axis='y', alpha=0.3)
    # Value label just above each bar.
    for i, v in enumerate(values):
        panel_ax.text(i, v + 1, format(v, label_format),
                      ha='center', va='bottom', fontweight='bold')
    plt.setp(panel_ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 5. Visualize Subject-wise Performance

Create bar charts showing average, maximum, and minimum marks per subject.

In [None]:
# Add per-student aggregates to df: the row-wise sum and mean of the subject columns.
subject_marks = df[subjects]
df['Total_Marks'] = subject_marks.sum(axis=1)
df['Average_Marks'] = subject_marks.mean(axis=1)

# Ranking table: a copy of the identifying columns plus the aggregates, with the
# average rounded to two decimals and the best average first.
student_performance = (
    df[['Student_ID', 'Name', 'Total_Marks', 'Average_Marks']]
    .copy()
    .assign(Average_Marks=lambda table: table['Average_Marks'].round(2))
    .sort_values('Average_Marks', ascending=False)
)

divider = "=" * 80
print("STUDENT-WISE PERFORMANCE (sorted by average marks):")
print(divider)
print(student_performance.to_string(index=False))
print(f"\n{divider}\n")

# Top and bottom performers are taken from the unrounded averages in df.
student_averages = df['Average_Marks']
for title, find_label, find_value in (
    ("Top Performer", student_averages.idxmax, student_averages.max),
    ("Bottom Performer", student_averages.idxmin, student_averages.min),
):
    print(f"{title}: {df.loc[find_label(), 'Name']} (Average: {find_value():.2f})")

## 4. Calculate Total Marks and Average Per Student

Perform column operations to compute aggregate student performance.

In [None]:
# Columns of df that hold marks; the other cells reuse this list.
subjects = ['Mathematics', 'Physics', 'Chemistry', 'English', 'History']

divider = "=" * 80
print("SUBJECT-WISE STATISTICS")
print(divider)

# Build the summary table one record per subject: mean, max and min of that column.
subject_records = []
for subject_name in subjects:
    subject_scores = df[subject_name]
    subject_records.append({
        'Subject': subject_name,
        'Average': subject_scores.mean(),
        'Maximum': subject_scores.max(),
        'Minimum': subject_scores.min(),
    })
statistics = pd.DataFrame(subject_records)

print(statistics.to_string(index=False))
print(f"\n{divider}\n")

# Same table with numeric values rounded to two decimals for readability.
statistics_rounded = statistics.round(2)
print("Statistics (Rounded to 2 decimals):")
print(statistics_rounded.to_string(index=False))

## 3. Calculate Statistics Per Subject

Calculate mean(), max(), and min() for each subject using column operations.

In [None]:
# Quick look at the dataset: column dtypes, missing-value counts per column,
# and describe() summary statistics, printed in that order.
section_break = "\n" + "=" * 80 + "\n"

reports = [
    ("Data Types:", df.dtypes),
    ("Missing values:", df.isna().sum()),
    ("Summary Statistics:", df.describe()),
]

for position, (heading, report) in enumerate(reports):
    # The separator goes between sections only, so skip it before the first.
    if position > 0:
        print(section_break)
    print(heading)
    print(report)

## 2. Explore the Dataset

Check data types, missing values, and basic statistics.

In [2]:
# Read the student marks; the path is relative to the notebook's working directory.
df = pd.read_csv('data/student_marks.csv')
row_count, column_count = df.shape

# Preview of the first rows
print("First 5 rows of the dataset:")
print(df.head())
print(f"\n{'=' * 80}\n")

# Size of the dataset (one row per student) and the column names
print(f"Dataset shape: {df.shape}")
print(f"Total students: {row_count}")
print(f"Total columns: {column_count}")
print("\nColumn names:")
print(list(df.columns))

NameError: name 'pd' is not defined

## 1. Load Student Marks Dataset

Use `pd.read_csv()` to load the student marks from the CSV file.

In [3]:
# Install required libraries
import importlib
import subprocess
import sys

# Each name is used both as the module to import and as the pip package to
# install when that import fails.
packages = ['pandas', 'matplotlib', 'numpy']
installed_any = False
for package in packages:
    try:
        importlib.import_module(package)
    except ImportError:
        # Run pip through the interpreter that is executing this kernel so the
        # package lands in the same environment the imports below use.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
        installed_any = True

if installed_any:
    # Modules installed while the interpreter is running may not be noticed by
    # the import system's cached finders; refresh them so the imports below
    # can see the freshly installed packages.
    importlib.invalidate_caches()

# Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Display settings for better visualization
pd.set_option('display.max_columns', None)  # do not truncate wide frames
pd.set_option('display.width', None)
plt.style.use('default')

[0m

# Student Marks Analysis

A comprehensive analysis of student academic performance across multiple subjects using pandas and matplotlib.

## Project Objectives
- Read student marks from a CSV file
- Calculate statistics (average, max, min) per subject
- Compare student performance using bar charts
- Identify top and bottom performers