In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from matplotlib import MatplotlibDeprecationWarning

# Keep the notebook output readable: ignore FutureWarnings attributed to
# seaborn modules and every matplotlib deprecation warning.
warnings.filterwarnings(action='ignore', category=FutureWarning, module='seaborn')
warnings.filterwarnings(action='ignore', category=MatplotlibDeprecationWarning)

# Load the ReBoot student stress factors CSV from the Kaggle input directory.
dataset_path = '/kaggle/input/reboot-student-stress-factors/ReBoot_Student_Stress_Factors.csv'
df = pd.read_csv(dataset_path)

# Per-column maximum of the loaded data; compared against the documented
# ranges further down.
max_values = df.max()

# Maximum value documented for each column in data_guide.txt.
# A value of None means the guide gives no specific maximum for that column.
# Insertion order matters: the mapping is iterated in this order when reporting.
expected_ranges = dict(
    anxiety_level=20,
    self_esteem=30,
    # Note: data_guide says 1-20 but our data has max 27
    depression=20,
    mental_health_history=1,
    headache=5,
    blood_pressure=3,
    sleep_quality=5,
    breathing_problem=5,
    noise_level=5,
    living_conditions=5,
    safety=5,
    basic_needs=5,
    academic_performance=5,
    # Integer (no specific max in guide)
    study_load=None,
    teacher_student_relationship=5,
    future_career_concerns=5,
    social_support=5,
    peer_pressure=5,
    extracurricular_activities=5,
    bullying=5,
    # Guide mentions 1-3 range
    stress_level=3,
)

# Print a table comparing each column's observed maximum with the maximum
# documented in the guide. Columns whose expected maximum is None are listed
# as "Not specified" and are not checked.
print("Comparison of Actual Max Values vs Expected Max Values:")
print(f"{'Column':<30} {'Actual Max':<15} {'Expected Max':<15} {'Match?':<10}")
print("-" * 70)

for col, expected_max in expected_ranges.items():
    actual_max = max_values[col]

    if expected_max is None:
        # No documented upper bound, so there is nothing to compare against.
        print(f"{col:<30} {actual_max:<15} {'Not specified':<15} {'N/A':<10}")
        continue

    # A single comparison covers every column. stress_level needs no special
    # case: the data appears to be 0-indexed (max 2) while the guide documents
    # 1-3, and a maximum of 2 already satisfies `<= 3`.
    match = "✓" if actual_max <= expected_max else "✗"

    print(f"{col:<30} {actual_max:<15} {expected_max:<15} {match:<10}")

# Visualize the comparison as a grouped bar chart: for every column with a
# documented maximum, one bar for the actual maximum and one for the expected.
columns = [col for col, expected_max in expected_ranges.items() if expected_max is not None]
actual_maxes = [max_values[col] for col in columns]
expected_maxes = [expected_ranges[col] for col in columns]

# Sort by column name. The names are unique dictionary keys, so sorting the
# names and looking the values up again keeps the three lists aligned.
sorted_columns = sorted(columns)
sorted_actual = [max_values[col] for col in sorted_columns]
sorted_expected = [expected_ranges[col] for col in sorted_columns]

x = np.arange(len(sorted_columns))
width = 0.35  # width of one bar; each actual/expected pair is centred on its tick

# plt.subplots creates the figure and its axes in one call. No separate
# plt.figure() is made beforehand, since that would leave an unused, empty
# figure open alongside the chart.
fig, ax = plt.subplots(figsize=(15, 10))
rects1 = ax.bar(x - width/2, sorted_actual, width, label='Actual Max')
rects2 = ax.bar(x + width/2, sorted_expected, width, label='Expected Max')

ax.set_ylabel('Maximum Value')
ax.set_title('Comparison of Actual Maximum Values vs Expected Maximum Values')
ax.set_xticks(x)
ax.set_xticklabels(sorted_columns, rotation=45, ha='right')
ax.legend()

# Highlight discrepancies: put a red "!" just above the "Actual Max" bar of
# every column whose actual maximum exceeds the expected maximum.
for i, (actual, expected) in enumerate(zip(sorted_actual, sorted_expected)):
    if actual > expected:
        ax.text(i - width/2, actual + 0.5, '!', color='red', ha='center', fontweight='bold', fontsize=14)

fig.tight_layout()
plt.show()

# Follow-up analysis: collect every column whose observed maximum is above its
# documented maximum (columns without a documented maximum are skipped).
discrepancies = [
    name
    for name, upper in expected_ranges.items()
    if upper is not None and max_values[name] > upper
]

if discrepancies:
    print("\nColumns with values exceeding expected maximums:")

# For each offending column, plot its distribution with the documented maximum
# marked, then report how many records lie above that maximum.
for col in discrepancies:
    limit = expected_ranges[col]

    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], kde=True, bins=30)
    plt.axvline(
        limit,
        color='red',
        linestyle='dashed',
        linewidth=2,
        label=f'Expected Max ({limit})',
    )
    plt.title(f'Distribution of {col} (Expected Max Exceeded)')
    plt.xlabel(f'{col} Value')
    plt.ylabel('Count')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Count rows above the limit and express them as a share of all rows.
    exceed_count = int((df[col] > limit).sum())
    exceed_percent = (exceed_count / len(df)) * 100
    print(f"  - {col}: Expected max is {limit}, but {exceed_count} records ({exceed_percent:.1f}%) exceed this value")