<a href="https://colab.research.google.com/github/fakhrzakbar/Project--/blob/main/analysis_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Student Performance and Aptitude Analysis
# The Key English Course Company - Indonesia

### A Comprehensive Data-Driven Study

**Analysis Date:** January 13, 2026
**Total Students:** 150 (50 per course level)
**Variables Analyzed:** Performance Scores, Aptitude Scores
**Statistical Methods:** ANOVA, Correlation, Effect Sizes, Post-Hoc Tests


# Chapter 1: Environment Setup

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np


# Chapter 2: Data Loading and Overview

In [None]:
# 1. Load your data
# The file 'student_combined_data.csv' is read via a relative path, so it must sit
# in the notebook's working directory
df = pd.read_csv('student_combined_data.csv')

# Chapter 3: Descriptive Statistics by Course Level

# Chapter 4: Statistical Assumption Testing

# Chapter 5: One-Way ANOVA Analysis

In [None]:
# --- 2. Basic statistics for both score columns ---
# A single .agg call returns one row per statistic and one column per score
score_columns = ['performance_score', 'aptitude_score']
overall_stats = df[score_columns].agg(['mean', 'median', 'min', 'max'])

# --- 3. Spread (range) ---
# Range was not among the aggregations requested above, so derive it as max - min
overall_stats.loc['range'] = overall_stats.loc['max'].sub(overall_stats.loc['min'])

# --- 4. Reader-friendly row labels ---
row_labels = {
    'mean': 'Average (Mean)',
    'median': 'Middle Value (Median)',
    'min': 'Lowest Score',
    'max': 'Highest Score',
    'range': 'Spread (Range)',
}
overall_stats = overall_stats.rename(row_labels, axis='index')

# --- 5. Formatting ---
# Global pandas display option: every float is shown with 2 decimals and a
# thousands separator from this point on
pd.set_option('display.float_format', '{:,.2f}'.format)

# --- 6. Display ---
print(overall_stats)

In [None]:
import pandas as pd
from scipy import stats

# --- 1. Load Data ---
df = pd.read_csv('student_combined_data.csv')

# --- 2. Split the performance scores by course level ---
performance_by_level = df.groupby('course_level')['performance_score']
groups = [level_scores.values for _level, level_scores in performance_by_level]

# --- 3. One-way ANOVA across the levels ---
f_stat, p_val = stats.f_oneway(*groups)

# --- 4. Effect size: eta squared = SS_between / SS_total ---
# Grand mean is the average over every student, regardless of level
grand_mean = df['performance_score'].mean()

# Total sum of squares: squared distance of every score from the grand mean
ss_total = ((df['performance_score'] - grand_mean) ** 2).sum()

# Between-group sum of squares: each level's squared mean offset, weighted by its size
ss_between = sum(
    len(level_scores) * (level_scores.mean() - grand_mean) ** 2
    for _level, level_scores in performance_by_level
)

eta_squared = ss_between / ss_total

# --- 5. Report ---
# Very small p-values are reported as a threshold instead of a run of zeros
if p_val < 0.001:
    p_line = "* Probability (p-value): Less than 0.001 (less than 0.1%)"
else:
    p_line = f"* Probability (p-value): {p_val:.4f}"

report_lines = [
    "6.2  Testing for Performance Differences",
    "6.2.1  The Test: One-Way ANOVA",
    "-" * 40,
    "What we tested: Are the average performance scores different across the three course levels?",
    "\nResults:",
    f"* Test Statistic (F-value): {f_stat:.2f}",
    p_line,
    f"* Effect Size: {eta_squared:.3f} (meaning {eta_squared*100:.1f}% of the difference is explained by course level)",
]
print("\n".join(report_lines))

# Chapter 6: Post-Hoc Tests (Tukey HSD)

In [None]:
# --- 2. Calculate Basic Statistics ---
# Per-level count, mean, min, max and standard deviation of the performance score.
# The frame is named `performance_summary` rather than `stats` so that it does not
# rebind the `stats` name that the notebook's import cell gives to `scipy.stats`
# (rebinding it would break later `stats.<function>` calls until scipy is re-imported).
performance_summary = df.groupby('course_level')['performance_score'].agg(
    ['count', 'mean', 'min', 'max', 'std']
)

# --- 3. Calculate "Typical Range" (Mean +/- SD) ---
# Lower and upper bounds are one standard deviation either side of the mean
lower_bound = performance_summary['mean'] - performance_summary['std']
upper_bound = performance_summary['mean'] + performance_summary['std']

# Combine them into a single string column like "2.85 - 3.62"
performance_summary['Typical Range'] = (
    lower_bound.map('{:.2f}'.format) + ' - ' +
    upper_bound.map('{:.2f}'.format)
)

# --- 4. Formatting & Cleaning ---
# Rename columns to match the Table 2 headers
performance_summary = performance_summary.rename(columns={
    'count': 'Students',
    'mean': 'Average',
    'min': 'Lowest',
    'max': 'Highest'
})

# Select only the columns we want to display
final_table = performance_summary[['Students', 'Average', 'Lowest', 'Highest', 'Typical Range']]

# Reorder the rows (Advanced -> Intermediate -> Foundation)
final_table = final_table.reindex(['Advanced', 'Intermediate', 'Foundation'])

# Show floats with 2 decimal places (global pandas display option).
# 'Students' is an integer column, so the float format does not touch it.
pd.options.display.float_format = '{:.2f}'.format

# --- 5. Display ---
print("Table 2: Performance Scores Across Course Levels")
print(final_table)

In [None]:
# --- 2. Calculate Basic Statistics ---
# Per-level count, mean, min, max and standard deviation of the aptitude score.
# The frame is named `aptitude_summary` rather than `stats` so that it does not
# rebind the `stats` name that the notebook's import cell gives to `scipy.stats`
# (rebinding it would break later `stats.<function>` calls until scipy is re-imported).
aptitude_summary = df.groupby('course_level')['aptitude_score'].agg(
    ['count', 'mean', 'min', 'max', 'std']
)

# --- 3. Calculate "Typical Range" (Mean +/- Standard Deviation) ---
# The bounds are shown as whole numbers (e.g. "48 - 86"), so round then cast to int
lower_bound = (aptitude_summary['mean'] - aptitude_summary['std']).round(0).astype(int)
upper_bound = (aptitude_summary['mean'] + aptitude_summary['std']).round(0).astype(int)

# Create the "Int - Int" string format
aptitude_summary['Typical Range'] = (
    lower_bound.astype(str) + ' - ' +
    upper_bound.astype(str)
)

# --- 4. Formatting & Cleaning ---
# Rename columns to match Table 3
aptitude_summary = aptitude_summary.rename(columns={
    'count': 'Students',
    'mean': 'Average',
    'min': 'Lowest',
    'max': 'Highest'
})

# Reorder rows (Advanced -> Intermediate -> Foundation)
aptitude_summary = aptitude_summary.reindex(['Advanced', 'Intermediate', 'Foundation'])

# Select only the columns to display
final_table = aptitude_summary[['Students', 'Average', 'Lowest', 'Highest', 'Typical Range']]

# --- 5. Final Display Settings ---
# Show floats (the 'Average' column) with 2 decimal places
pd.options.display.float_format = '{:.2f}'.format

print("Table 3: Aptitude Scores Across Course Levels")
print(final_table)

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# --- 1. Load Data ---
# Reads the same CSV path used by the notebook's first load cell and rebinds `df`
df = pd.read_csv('student_combined_data.csv')

# --- 2. Define Helper Function for Cohen's d ---
# Cohen's d measures the "standardized mean difference"
def calculate_cohens_d(group1, group2):
    """Return Cohen's d: the difference in means divided by the pooled standard deviation.

    Args:
        group1: First sample of numeric scores (anything accepted by ``len``,
            ``np.mean`` and ``np.var``).
        group2: Second sample, same kind as ``group1``.

    Returns:
        ``(mean(group1) - mean(group2)) / pooled_sd``. The sign is positive when
        ``group1`` has the larger mean.
    """
    size_a = len(group1)
    size_b = len(group2)

    # Sample variances (ddof=1), each weighted by its own degrees of freedom
    weighted_var_a = (size_a - 1) * np.var(group1, ddof=1)
    weighted_var_b = (size_b - 1) * np.var(group2, ddof=1)

    # Pooled standard deviation across both samples
    pooled_sd = np.sqrt((weighted_var_a + weighted_var_b) / (size_a + size_b - 2))

    mean_gap = np.mean(group1) - np.mean(group2)
    return mean_gap / pooled_sd

# --- 3. Define Interpretation Logic ---
def get_interpretation(d_value):
    """Map a Cohen's d value to a verbal size label, ignoring its sign.

    Args:
        d_value: Effect size; only its absolute value is used.

    Returns:
        "Very Large" (>= 1.2), "Large" (>= 0.8), "Medium" (>= 0.5),
        "Small" (>= 0.2), otherwise "Negligible".
    """
    magnitude = abs(d_value)
    # Bands are checked from the largest threshold down; the first match wins
    bands = (
        (1.2, "Very Large"),
        (0.8, "Large"),
        (0.5, "Medium"),
        (0.2, "Small"),
    )
    for threshold, label in bands:
        if magnitude >= threshold:
            return label
    return "Negligible"

# --- 4. Pairwise comparisons between course levels ---
# NOTE(review): every pair gets its own independent-samples t-test below; the
# p-values are not adjusted for running three comparisons.
comparisons = [
    ('Advanced', 'Intermediate'),
    ('Intermediate', 'Foundation'),
    ('Advanced', 'Foundation')
]

rows = []

for label1, label2 in comparisons:
    # Performance scores of the two levels being compared
    g1 = df.loc[df['course_level'] == label1, 'performance_score']
    g2 = df.loc[df['course_level'] == label2, 'performance_score']

    # Raw mean difference, t-test p-value and standardized effect size
    diff = g1.mean() - g2.mean()
    t_stat, p_val = stats.ttest_ind(g1, g2)
    d_val = calculate_cohens_d(g1, g2)

    # Very small p-values are shown as a threshold rather than as 0.000
    if p_val < 0.001:
        p_text = "<0.001"
    else:
        p_text = f"{p_val:.3f}"

    rows.append({
        'Comparison': f"{label1} vs {label2}",
        'Difference': f"{diff:.2f} points",
        'p-value': p_text,
        'Effect Size': f"{d_val:.2f}",
        'Interpretation': get_interpretation(d_val)
    })

# --- 5. Build and display the table ---
results_df = pd.DataFrame(rows)

# Widen pandas' display limits so no column is truncated
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

print("6.2.2  Comparing Specific Pairs of Levels\n")
print("Table 4: Pairwise Performance Comparisons")
print(results_df.to_string(index=False))

In [None]:
import pandas as pd
from scipy import stats

# --- 1. Load Data ---
df = pd.read_csv('student_combined_data.csv')

# --- 2. Split the aptitude scores by course level ---
aptitude_by_level = df.groupby('course_level')['aptitude_score']
groups = [level_scores.values for _level, level_scores in aptitude_by_level]

# --- 3. One-way ANOVA across the levels ---
f_stat, p_val = stats.f_oneway(*groups)

# --- 4. Effect size: eta squared = SS_between / SS_total ---
# Grand mean is the average over all students
grand_mean = df['aptitude_score'].mean()

# Total sum of squares: squared distance of every score from the grand mean
ss_total = ((df['aptitude_score'] - grand_mean) ** 2).sum()

# Between-group sum of squares: each level's squared mean offset, weighted by its size
ss_between = sum(
    len(level_scores) * (level_scores.mean() - grand_mean) ** 2
    for _level, level_scores in aptitude_by_level
)

eta_squared = ss_between / ss_total

# --- 5. Summary table ---
# One (metric, value, interpretation) tuple per output row
if p_val < 0.001:
    p_text = "< 0.001"
else:
    p_text = f"{p_val:.4f}"

if p_val < 0.05:
    significance_text = 'Significant difference exists'
else:
    significance_text = 'No significant difference'

summary_rows = [
    # The F-value row carries no interpretation text
    ('Test Statistic (F-value)', f"{f_stat:.2f}", ''),
    ('Probability (p-value)', p_text, significance_text),
    ('Effect Size (Eta Squared)', f"{eta_squared:.3f}",
     f"{eta_squared*100:.1f}% of difference explained by course level"),
]
anova_results = pd.DataFrame(summary_rows, columns=['Metric', 'Value', 'Interpretation'])

# --- 6. Display ---
print("6.3.1  The Test: One-Way ANOVA (Aptitude Scores)\n")
print(anova_results.to_string(index=False))

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# --- 1. Load Data ---
# Reads the same CSV path used by the notebook's first load cell and rebinds `df`
df = pd.read_csv('student_combined_data.csv')

# --- 2. Helper Functions ---

# Function to calculate Cohen's d (Effect Size)
def calculate_cohens_d(group1, group2):
    """Compute Cohen's d (standardized mean difference) for two samples.

    Args:
        group1: First sample of numeric values.
        group2: Second sample of numeric values.

    Returns:
        The difference ``mean(group1) - mean(group2)`` expressed in units of the
        pooled standard deviation of the two samples.
    """
    n1, n2 = len(group1), len(group2)
    dof_total = n1 + n2 - 2

    # Sum of the sample variances (ddof=1), each scaled by its degrees of freedom
    scaled_variance_sum = ((n1 - 1) * np.var(group1, ddof=1)) + ((n2 - 1) * np.var(group2, ddof=1))
    pooled_sd = np.sqrt(scaled_variance_sum / dof_total)

    difference = np.mean(group1) - np.mean(group2)
    return difference / pooled_sd

# Function to interpret Effect Size
def interpret_effect(d):
    """Map a Cohen's d value to a verbal size label, ignoring its sign.

    Uses the same bands as the notebook's other effect-size labeller
    (``get_interpretation``), including a "Negligible" band so that effects
    below 0.2 are no longer reported as "Small".

    Args:
        d: Effect size; only its absolute value is used.

    Returns:
        "Very Large" (>= 1.2), "Large" (>= 0.8), "Medium" (>= 0.5),
        "Small" (>= 0.2), otherwise "Negligible".
    """
    magnitude = abs(d)
    if magnitude >= 1.2: return "Very Large"
    if magnitude >= 0.8: return "Large"
    if magnitude >= 0.5: return "Medium"
    if magnitude >= 0.2: return "Small"
    return "Negligible"

# --- 3. Pairwise comparisons between course levels ---
# NOTE(review): every pair gets its own independent-samples t-test below; the
# p-values are not adjusted for running three comparisons.
comparisons = [
    ('Advanced', 'Intermediate'),
    ('Intermediate', 'Foundation'),
    ('Advanced', 'Foundation')
]

rows = []

for label1, label2 in comparisons:
    # Aptitude scores of the two levels being compared
    g1 = df.loc[df['course_level'] == label1, 'aptitude_score']
    g2 = df.loc[df['course_level'] == label2, 'aptitude_score']

    # Raw mean difference, t-test p-value and standardized effect size
    diff = g1.mean() - g2.mean()
    t_stat, p_val = stats.ttest_ind(g1, g2)
    d_val = calculate_cohens_d(g1, g2)

    # Very small p-values are shown as a threshold rather than as 0.000
    if p_val < 0.001:
        p_text = "<0.001"
    else:
        p_text = f"{p_val:.3f}"

    rows.append({
        'Comparison': f"{label1} vs {label2}",
        'Difference': f"{diff:.1f} points",
        'p-value': p_text,
        'Effect Size': f"{d_val:.2f}",
        'Interpretation': interpret_effect(d_val)
    })

# --- 4. Build and display the table ---
results = pd.DataFrame(rows)

# Widen pandas' display limits so the whole table is visible
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

print("6.3.2  Comparing Specific Pairs of Levels\n")
print("Table 5: Pairwise Aptitude Comparisons")
print(results.to_string(index=False))

# Chapter 7: Correlation Analysis

# Chapter 8: Effect Sizes (Cohen's d)

# Chapter 9: Comprehensive Visualizations

In [None]:
# Draws one 3x3 figure from the student data: box plots, violin plots, mean bar
# charts with SD error bars, a scatter plot with a regression line, and per-level
# histograms for the performance and aptitude scores.

# --- 1. Load Data ---
df = pd.read_csv('student_combined_data.csv')

# --- 2. Setup Figure ---
sns.set_style("darkgrid")
# Create a 3x3 Grid (3 Rows, 3 Columns)
fig, axes = plt.subplots(3, 3, figsize=(20, 15))

# Increase vertical spacing so titles don't overlap
# NOTE(review): plt.tight_layout() is called at the end of this cell and likely
# recomputes the subplot spacing, which would override this hspace - confirm.
plt.subplots_adjust(hspace=0.4)

# ==========================================
# ROW 1: Boxplots & Performance Violin
# ==========================================
# NOTE(review): the box plots and bar charts in this cell use the category order
# Advanced / Foundation / Intermediate, while the violin plots use
# Advanced / Intermediate / Foundation - confirm the difference is intended.

# 1.1 Performance Boxplot (Left)
sns.boxplot(
    data=df, x='course_level', y='performance_score',
    order=['Advanced', 'Foundation', 'Intermediate'],
    boxprops=dict(facecolor=(0,0,0,0)), # Transparent
    width=0.5, linewidth=1.2, fliersize=6,
    ax=axes[0, 0]
)
axes[0, 0].set_title('Performance Score Distribution by Course Level', fontsize=14)
axes[0, 0].grid(axis='x', color='white', linestyle='-', linewidth=1.5)

# 1.2 Aptitude Boxplot (Middle)
sns.boxplot(
    data=df, x='course_level', y='aptitude_score',
    order=['Advanced', 'Foundation', 'Intermediate'],
    boxprops=dict(facecolor=(0,0,0,0)), # Transparent
    width=0.5, linewidth=1.2, fliersize=6,
    ax=axes[0, 1]
)
axes[0, 1].set_title('Aptitude Score Distribution by Course Level', fontsize=14)
axes[0, 1].tick_params(axis='both', which='major', labelsize=11)
axes[0, 1].grid(axis='x', color='white', linestyle='-', linewidth=1.5)

# 1.3 Performance Violin Plot (Right)
sns.violinplot(
    data=df, x='course_level', y='performance_score',
    order=['Advanced', 'Intermediate', 'Foundation'],
    color='#F6848F', inner='box', linewidth=1.2,
    ax=axes[0, 2]
)
axes[0, 2].set_title('Performance Score Distribution (Violin Plot)', fontsize=14)
axes[0, 2].grid(axis='x', color='white', linestyle='-', linewidth=1.5)


# ==========================================
# ROW 2: Aptitude Violin & Bar Charts
# ==========================================

# 2.1 Aptitude Violin Plot (Left)
sns.violinplot(
    data=df, x='course_level', y='aptitude_score',
    order=['Advanced', 'Intermediate', 'Foundation'],
    color='#F6848F', inner='box', linewidth=1.2,
    ax=axes[1, 0]
)
axes[1, 0].set_title('Aptitude Distribution (Violin Plot)', fontsize=14)
axes[1, 0].grid(axis='x', color='white', linestyle='-', linewidth=1.5)

# 2.2 Mean Performance Bar Plot (Middle)
# Error bars show one standard deviation (errorbar='sd'); hue repeats the x
# variable with the legend switched off
sns.barplot(
    data=df, x='course_level', y='performance_score',
    order=['Advanced', 'Foundation', 'Intermediate'],
    errorbar='sd', capsize=0.1, width=0.5,
    hue='course_level', legend=False,
    err_kws={'color': 'black', 'linewidth': 1.5},
    ax=axes[1, 1]
)
axes[1, 1].set_title('Mean Performance Score by Level (with SD)', fontsize=14)
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(axis='x', color='white', linestyle='-', linewidth=1.5)

# 2.3 Mean Aptitude Bar Plot (Right)
sns.barplot(
    data=df, x='course_level', y='aptitude_score',
    order=['Advanced', 'Foundation', 'Intermediate'],
    errorbar='sd', capsize=0.1, width=0.5,
    hue='course_level', legend=False,
    err_kws={'color': 'black', 'linewidth': 1.5},
    ax=axes[1, 2]
)
axes[1, 2].set_title('Mean Aptitude Score by Level (with SD)', fontsize=14)
axes[1, 2].tick_params(axis='x', rotation=45)
axes[1, 2].grid(axis='x', color='white', linestyle='-', linewidth=1.5)


# ==========================================
# ROW 3: Correlation & Histograms
# ==========================================

# 3.1 Scatter Plot with Regression (Left)
# Calculate Correlation
import scipy.stats as sps # Local import under an alias, because earlier cells rebind the name 'stats' to a DataFrame
r, p = sps.pearsonr(df['performance_score'], df['aptitude_score'])

# Points are coloured by course level; the dashed red line is a single regression
# fit over all students (no confidence band, ci=None)
sns.scatterplot(
    data=df, x='performance_score', y='aptitude_score',
    hue='course_level', s=80, alpha=0.7, palette='tab10',
    ax=axes[2, 0]
)
sns.regplot(
    data=df, x='performance_score', y='aptitude_score',
    scatter=False, ci=None, line_kws={"color": "red", "ls": "--", "linewidth": 2.5},
    ax=axes[2, 0]
)
axes[2, 0].set_title(f'Performance vs Aptitude Score (r = {r:.3f})', fontsize=14)
axes[2, 0].legend(loc='upper left', frameon=False)

# Custom Colors for Histograms
colors = {'Advanced': '#F49AC2', 'Intermediate': '#C4B083', 'Foundation': '#93C47D'}

# 3.2 Performance Histogram (Middle)
# One overlaid histogram per level, drawn in the insertion order of `colors`
for level, color in colors.items():
    sns.histplot(
        data=df[df['course_level'] == level], x='performance_score',
        bins=15, alpha=0.6, color=color, label=level, edgecolor=None, linewidth=0,
        ax=axes[2, 1]
    )
axes[2, 1].set_title('Performance Score Histogram by Level', fontsize=14)
# NOTE(review): the legend labels are passed explicitly and presumably pair with
# the histograms by draw order; keep this list in the same order as `colors`.
axes[2, 1].legend(labels=['Advanced', 'Intermediate', 'Foundation'])

# 3.3 Aptitude Histogram (Right)
for level, color in colors.items():
    sns.histplot(
        data=df[df['course_level'] == level], x='aptitude_score',
        bins=15, alpha=0.6, color=color, label=level, edgecolor=None, linewidth=0,
        ax=axes[2, 2]
    )
axes[2, 2].set_title('Aptitude Score Histogram by Level', fontsize=14)
axes[2, 2].legend(labels=['Advanced', 'Intermediate', 'Foundation'])

# --- Final Layout ---
plt.tight_layout()
plt.show()

# Chapter 10: Summary and Conclusions

In [None]:
import pandas as pd
from scipy import stats

# --- 1. Load Data ---
df = pd.read_csv('student_combined_data.csv')

# --- 2. Calculate Correlation Statistics ---
# Pearson correlation returns the coefficient (r) and the p-value
r, p_value = stats.pearsonr(df['aptitude_score'], df['performance_score'])

# Shared variance (coefficient of determination, r squared): the share of the
# variation in one variable that is explained by the other
shared_variance = r ** 2


# --- 3. Interpretation helpers ---
# The labels are derived from the computed values instead of being hard-coded,
# so the table cannot claim "Very Strong" / "Extremely Confident" for data that
# does not support it. For r > 0.8 and p < 0.001 the wording is unchanged.
def _describe_correlation(r_value):
    """Return a '<strength> <direction>' label for a correlation coefficient."""
    if pd.isna(r_value):
        return 'Undefined'
    magnitude = abs(r_value)
    if magnitude > 0.8:
        strength = 'Very Strong'
    elif magnitude > 0.6:
        strength = 'Strong'
    elif magnitude > 0.4:
        strength = 'Moderate'
    elif magnitude > 0.2:
        strength = 'Weak'
    else:
        strength = 'Very Weak'
    direction = 'Positive' if r_value >= 0 else 'Negative'
    return f"{strength} {direction}"


def _describe_significance(p):
    """Return a confidence label for a p-value."""
    if pd.isna(p):
        return 'Undefined'
    if p < 0.001:
        return 'Extremely Confident'
    if p < 0.01:
        return 'Very Confident'
    if p < 0.05:
        return 'Confident'
    return 'Not Statistically Significant'


def _describe_shared_variance(r_squared):
    """Return a predictability label for r squared.

    The cut-offs 0.64 and 0.16 are the squares of the |r| bands 0.8 and 0.4
    used by ``_describe_correlation``.
    """
    if pd.isna(r_squared):
        return 'Undefined'
    if r_squared > 0.64:
        return 'High Predictability'
    if r_squared > 0.16:
        return 'Moderate Predictability'
    return 'Low Predictability'


# --- 4. Build the Table Data ---
data = [
    {
        'Relationship': 'Aptitude <-> Performance',
        'Correlation (r)': f"{r:.3f}",
        'Interpretation': _describe_correlation(r)
    },
    {
        'Relationship': 'Statistical Significance',
        'Correlation (r)': 'p < 0.001' if p_value < 0.001 else f"{p_value:.4f}",
        'Interpretation': _describe_significance(p_value)
    },
    {
        'Relationship': 'Shared Variance',
        'Correlation (r)': f"{shared_variance*100:.1f}%",
        'Interpretation': _describe_shared_variance(shared_variance)
    }
]

# --- 5. Create and Display Table ---
results_df = pd.DataFrame(data)

# Widen pandas' display limits so no column is truncated
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)
print("7.2  Overall Correlation Results\n")
print("Table 6: Correlation Between Aptitude and Performance")
print(results_df.to_string(index=False))

In [None]:
import pandas as pd
from scipy import stats

# --- 1. Load Data ---
df = pd.read_csv('student_combined_data.csv')

# --- 2. Define Interpretations ---
# The "What This Means" column is fixed qualitative text keyed by level.
# NOTE(review): these sentences are not derived from the computed correlations;
# re-check the wording whenever the data changes.
descriptions = {
    'Advanced': 'Even among advanced students, aptitude predicts performance',
    'Intermediate': 'Clear aptitude-performance link in the middle range',
    'Foundation': 'Weaker but still meaningful relationship'
}

# --- 3. Calculate Correlations by Group ---
rows = []
# Fixed display order: Advanced -> Intermediate -> Foundation
order = ['Advanced', 'Intermediate', 'Foundation']

for level in order:
    # Rows belonging to this level only
    subset = df[df['course_level'] == level]

    # Pearson r for this level. The result is indexed rather than unpacked into
    # `_`, so IPython's "last output" variable is not overwritten.
    r = stats.pearsonr(subset['aptitude_score'], subset['performance_score'])[0]

    # Strength depends on the magnitude of r, not its sign: a strong negative
    # correlation must not be labelled "Weak-Moderate".
    magnitude = abs(r)
    if magnitude > 0.7:
        strength = "Strong"
    elif magnitude > 0.4:
        strength = "Moderate"
    else:
        strength = "Weak-Moderate"

    rows.append({
        'Level': level,
        'Correlation': f"{r:.3f}",
        'Strength': strength,
        'What This Means': descriptions[level]
    })

# --- 4. Create and Format Table ---
results_df = pd.DataFrame(rows)

# Widen pandas' display limits so the long sentences are not cut off
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)
print("7.3  Correlation Within Each Course Level\n")
print("Table 7: Correlations Within Each Course Level")
print(results_df.to_string(index=False))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1. Load Data ---
df = pd.read_csv('student_combined_data.csv')

# --- 2. Correlation matrix of the two score columns ---
score_columns = ['performance_score', 'aptitude_score']
corr_matrix = df[score_columns].corr()

# --- 3. Styling ---
# White background so the white separator lines between cells stay visible
sns.set_style("white")
plt.figure(figsize=(7, 6))

# --- 4. Heatmap ---
# NOTE(review): vmin=0.88 is a hard-coded lower colour limit; correlations below
# it would be clipped. With center=0 and limits this close to 1, seaborn likely
# re-centres the colormap so that only the upper (green) end of 'RdYlGn' is
# used - confirm against the rendered figure.
heatmap_options = {
    'annot': True,                 # print the coefficient inside each square
    'cmap': 'RdYlGn',              # Red-Yellow-Green colormap
    'center': 0,                   # value at which the colormap is centred
    'vmin': 0.88,                  # lower colour limit
    'vmax': 1,                     # upper colour limit
    'linewidths': 2,               # width of the lines between squares
    'linecolor': 'white',          # colour of those lines
    'square': True,                # force equal-aspect cells
    'cbar_kws': {"shrink": 0.8},   # slightly shrink the colour bar
}
sns.heatmap(corr_matrix, **heatmap_options)

# --- 5. Title and layout ---
plt.title('Correlation Heatmap: Performance vs Aptitude Score', fontsize=14)

plt.tight_layout()
plt.show()