# Exploratory Data Analysis of Bangladesh Student Data

This notebook explores patterns and trends in educational data across Bangladesh.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set plotting style.
# The matplotlib style name 'seaborn' was deprecated in matplotlib 3.6 and
# later removed, so plt.style.use('seaborn') fails on current releases.
# Configure the equivalent look through seaborn itself instead.
sns.set_theme(style='darkgrid')
sns.set_palette('husl')

## 1. Load Processed Data

In [None]:
# Read the cleaned student dataset from disk into a DataFrame
data_path = Path('../processed_data/cleaned/cleaned_student_data.csv')
df = pd.read_csv(data_path)

# Report the table's dimensions followed by the list of column names
n_records, n_features = df.shape
overview_lines = [
    "Dataset Overview:",
    f"Number of records: {n_records}",
    f"Number of features: {n_features}",
    "\nFeatures:",
]
print("\n".join(overview_lines))
print(df.columns.tolist())

## 2. Geographic Distribution

In [None]:
# Count the rows belonging to each division and show the counts as bars
division_counts = df['division'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=division_counts.index, y=division_counts.values, ax=ax)
ax.set_title('Student Distribution by Division')
ax.set_ylabel('Number of Students')
# Tilt the division names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## 3. Academic Performance Analysis

In [None]:
# Histogram of the 'gpa' column over the whole dataset (20 bins)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='gpa', bins=20, ax=ax)
ax.set(title='Distribution of GPAs', xlabel='GPA', ylabel='Count')
plt.show()

# Box plot of 'gpa' grouped by 'division' to compare regions side by side
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='division', y='gpa', ax=ax)
ax.set_title('GPA Distribution by Division')
# Tilt the division names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## 4. Attendance Patterns

In [None]:
# Calculate attendance rate as days_present / total_school_days.
# A zero denominator would make the division yield inf (or NaN for 0/0);
# an infinite rate is meaningless and distorts any later aggregate such as
# a mean. Mask zero totals to NaN first so those rows are treated as
# missing rather than infinite.
total_school_days = df['total_school_days']
df['attendance_rate'] = df['days_present'] / total_school_days.where(total_school_days != 0)

# Plot attendance distribution
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='attendance_rate', bins=20)
plt.title('Distribution of Attendance Rates')
plt.xlabel('Attendance Rate')
plt.ylabel('Count')
plt.show()

# Correlation between attendance and performance: one point per row,
# attendance rate on the x-axis against GPA on the y-axis
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='attendance_rate', y='gpa')
plt.title('Attendance Rate vs GPA')
plt.xlabel('Attendance Rate')
plt.ylabel('GPA')
plt.show()

## 5. Demographic Analysis

In [None]:
# Pie chart of how many rows fall under each value of 'gender',
# with each slice annotated by its percentage share
gender_counts = df['gender'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%')
ax.set_title('Gender Distribution')
plt.show()

# Box plot of 'gpa' split by 'gender'
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='gender', y='gpa', ax=ax)
ax.set_title('GPA Distribution by Gender')
plt.show()

## 6. Statistical Summary

In [None]:
# Calculate summary statistics for the numeric columns of interest.
# Requires the 'attendance_rate' column to already be present in df.
numeric_columns = ['gpa', 'attendance_rate']
summary_stats = df[numeric_columns].describe()
print("Summary Statistics:")
print(summary_stats)

# Calculate pairwise correlations between the numeric columns
correlation_matrix = df[numeric_columns].corr()
plt.figure(figsize=(8, 6))
# Pin the colour scale to the full correlation range [-1, 1]. With automatic
# limits the diverging 'coolwarm' map stretches between the smallest and
# largest value present, so on a small matrix the off-diagonal cell always
# gets the extreme colour regardless of how strong the correlation is.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

## 7. Key Findings

1. Geographic Distribution:
   - Distribution of students across divisions
   - Regional variations in enrollment

2. Academic Performance:
   - Overall GPA distribution
   - Regional performance differences

3. Attendance Patterns:
   - Attendance rate distribution
   - Correlation with academic performance

4. Demographic Insights:
   - Gender distribution
   - Performance across demographics

## Next Steps

1. Conduct detailed statistical tests
2. Analyze trends over time
3. Investigate specific regional patterns
4. Create detailed reports for stakeholders