## Test Data

In [1]:
import pandas as pd


In [2]:

# Load the exam data
df_raw = pd.read_parquet("../data/parquet/dfAllFase1.parquet.gzip")

# Columns carried forward into the working frame.
outcols = ['ano', 'Exame', 'Sexo', 'CIF', 'Class_Exam',
           'CFD', 'Covid', 'PubPriv', 'DescrNuts3', 'DescrNuts2',
           'DescrExameAbrev', 'DescrSubtipoCurso']

# Filter: valid exam grades only.
# Rows and columns are selected in a single .loc call rather than by
# chaining a boolean filter with a second .loc.
df = df_raw.loc[df_raw['Class_Exam'].notna(), outcols]

# "Total rows" reports the unfiltered frame; the second line reports what is
# left after dropping rows without an exam grade, so the two can be compared.
print(f"Total rows: {len(df_raw):,}")
print(f"With valid exam grades: {len(df):,}")
print(df.shape)
df.head()

Total rows: 4,726,985
With valid exam grades: 4,726,985
(4726985, 12)


Unnamed: 0,ano,Exame,Sexo,CIF,Class_Exam,CFD,Covid,PubPriv,DescrNuts3,DescrNuts2,DescrExameAbrev,DescrSubtipoCurso
0,2024,639,F,,14.0,,After,PUB,Área Metropolitana de Lisboa,AM Lisboa,Português,Cursos Profissionais
1,2024,639,M,12.0,4.1,,After,PUB,Área Metropolitana de Lisboa,AM Lisboa,Português,Cursos Científico-Humanísticos
2,2024,639,F,15.0,10.0,,After,PUB,Área Metropolitana de Lisboa,AM Lisboa,Português,Cursos Científico-Humanísticos
3,2024,639,M,,11.2,,After,PUB,Área Metropolitana de Lisboa,AM Lisboa,Português,Equivalências
4,2024,639,F,12.0,6.2,,After,PUB,Área Metropolitana de Lisboa,AM Lisboa,Português,Cursos Científico-Humanísticos


In [3]:
# Sanity check on the working frame: (row count, column count), then the column labels.
print((len(df.index), len(df.columns)))
df.columns

(4726985, 12)


Index(['ano', 'Exame', 'Sexo', 'CIF', 'Class_Exam', 'CFD', 'Covid', 'PubPriv',
       'DescrNuts3', 'DescrNuts2', 'DescrExameAbrev', 'DescrSubtipoCurso'],
      dtype='object')

In [4]:
# Build a separate frame with the grade bucket added, leaving `df` untouched.
# Class_Exam is rounded to 1 decimal for granularity.
viz1 = df.assign(grade_value=df['Class_Exam'].round(1))

# Create viz1 aggregation: per grade bucket and dimension combination, count the
# exams and average the unrounded grade. dropna=False keeps groups whose key
# contains a missing value. The rename returns a new frame (no inplace mutation).
viz1df = (
    viz1.groupby([
        'grade_value',
        'ano',
        'Sexo',
        'PubPriv',
        'DescrNuts2',
        'DescrNuts3',
        'DescrSubtipoCurso'
    ], dropna=False)
    .agg(
        num_exams=('Class_Exam', 'count'),
        avg_grade=('Class_Exam', 'mean')
    )
    .reset_index()
    .rename(columns={
        'ano': 'year',
        'Sexo': 'gender',
        'PubPriv': 'school_type',
        'DescrNuts2': 'nuts2',
        'DescrNuts3': 'nuts3',
        'DescrSubtipoCurso': 'course_subtype'
    })
)

# Add covid_period based on year: years up to and including 2019 are 'Before',
# everything else is 'After'. Vectorized comparison instead of a per-row lambda.
viz1df['covid_period'] = viz1df['year'].le(2019).map({True: 'Before', False: 'After'})

print(f"Rows: {len(viz1df):,}")
print(f"Memory: {viz1df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Save the aggregation as CSV (written to the current working directory).
viz1df.to_csv('viz1_grade_distribution.csv', index=False)

Rows: 486,295
Memory: 211.9 MB


In [5]:
# Preview the first five aggregated rows, transposed so each field is a row.
viz1df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
grade_value,0.0,0.0,0.0,0.0,0.0
year,2008,2008,2008,2008,2008
gender,F,F,F,F,F
school_type,PRI,PRI,PRI,PRI,PUB
nuts2,Centro,Centro,Norte,Norte,AM Lisboa
nuts3,Beiras e Serra da Estrela,Região de Coimbra,Área Metropolitana do Porto,Área Metropolitana do Porto,Área Metropolitana de Lisboa
course_subtype,Cursos Científico-Humanísticos,Cursos Científico-Humanísticos,Cursos Científico-Humanísticos,Cursos Científico-humanísticos do Ensino Recor...,Cursos Científico-Humanísticos
num_exams,1,1,1,1,14
avg_grade,0.0,0.0,0.0,0.0,0.0
covid_period,Before,Before,Before,Before,Before


In [6]:
# Last year labelled 'Before' in covid_period; later years are 'After'.
_LAST_PRE_COVID_YEAR = 2019

# Dimensions shared by the Viz2 and Viz3 aggregations.
_BASE_DIMENSIONS = [
    'ano',
    'Sexo',
    'PubPriv',
    'DescrNuts2',
    'DescrNuts3',
    'DescrSubtipoCurso'
]

# Source column name -> output column name. Keys that are not present in a
# given aggregation are ignored by DataFrame.rename.
_OUTPUT_LABELS = {
    'DescrExameAbrev': 'exam_name',
    'ano': 'year',
    'Sexo': 'gender',
    'PubPriv': 'school_type',
    'DescrNuts2': 'nuts2',
    'DescrNuts3': 'nuts3',
    'DescrSubtipoCurso': 'course_subtype'
}


def _aggregate_exam_stats(frame: pd.DataFrame, group_cols: list) -> pd.DataFrame:
    """Count exams and average `Class_Exam` per combination of `group_cols`.

    Parameters
    ----------
    frame : DataFrame containing `Class_Exam` and every column in `group_cols`.
    group_cols : grouping column names; must include 'ano', which is used
        (after being renamed to 'year') to derive `covid_period`.

    Returns
    -------
    DataFrame with the grouping columns renamed per `_OUTPUT_LABELS`, followed
    by `num_exams`, `avg_grade` and `covid_period`. Groups whose key contains
    a missing value are kept (dropna=False).
    """
    stats = (
        frame.groupby(group_cols, dropna=False)
        .agg(
            num_exams=('Class_Exam', 'count'),
            avg_grade=('Class_Exam', 'mean')
        )
        .reset_index()
        .rename(columns=_OUTPUT_LABELS)
    )
    # Vectorized threshold instead of a per-row lambda.
    stats['covid_period'] = (
        stats['year'].le(_LAST_PRE_COVID_YEAR).map({True: 'Before', False: 'After'})
    )
    return stats


vizdf = df.copy()

# Viz2: Year trends
viz2 = _aggregate_exam_stats(vizdf, _BASE_DIMENSIONS)

print(f"Viz2 rows: {len(viz2):,}")
viz2.to_csv('viz2_year_trends.csv', index=False)

# Viz3: Exam comparison (top 10 exams only, ranked by number of rows)
top_exams = vizdf['DescrExameAbrev'].value_counts().head(10).index

viz3 = _aggregate_exam_stats(
    vizdf[vizdf['DescrExameAbrev'].isin(top_exams)],
    ['DescrExameAbrev'] + _BASE_DIMENSIONS
)

print(f"Viz3 rows: {len(viz3):,}")
viz3.to_csv('viz3_exam_comparison.csv', index=False)

Viz2 rows: 13,937
Viz3 rows: 56,916
