# Data Checker

El propósito de este notebook es realizar unas verificaciones de los datos para tener la seguridad que que los datos son consistentes

In [1]:
import pandas as pd
import os
import sys
import duckdb
import matplotlib as plt

project_root = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..', '..'))
os.chdir(project_root)

# Añadir el directorio raíz al path de Python
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.hash_utility import HashUtility
from utils.moodle_path_resolver import MoodlePathResolver

## Verificación de estudiantes inscritos en archivos raw

In [2]:
expected_students_fusa_2024 = {
    '1': 15,
    '2': 15,
    '3': 15,
    '4': 22,
    '5': 17,
    '6': 20,
    '7': 22,
    '8': 17,
    '9': 16,
    '10': 17,
    '11': 13,
}

expected_students_girardot_2024 = {
    '1': 17,
    '2': 7,
    '3': 17,
    '4': 18,
    '5': 10,
    '6': 16,
    '7': 16,
    '8': 12,
}

expected_students_fusa_2025 = {
    '1': 10,
    '2': 9,
    '3': 15,
    '4': 16,
    '5': 24,
    '6': 18,
    '7': 21,
    '8': 21,
    '9': 18,
    '10': 14,
    '11': 13,
}

expected_students_girardot_2025 = {
    '1': 8,
    '2': 16,
    '3': 4,
    '4': 20,
    '5': 15,
    '6': 7,
    '7': 15,
    '8': 12,
    '9': 4
}

students_2024 = sum(expected_students_fusa_2024.values()) + sum(expected_students_girardot_2024.values())
students_2025 = sum(expected_students_fusa_2025.values()) + sum(expected_students_girardot_2025.values())
# Sumar los valores de cada grado (llave) entre Fusa y Girardot para 2024
total_students_by_grade_2024 = {}
all_grades = set(expected_students_fusa_2024.keys()).union(expected_students_girardot_2024.keys())
for grade in all_grades:
    fusa_count = expected_students_fusa_2024.get(grade, 0)
    girardot_count = expected_students_girardot_2024.get(grade, 0)
    total_students_by_grade_2024[grade] = fusa_count + girardot_count

# Sumar los valores de cada grado (llave) entre Fusa y Girardot para 2025
total_students_by_grade_2025 = {}
all_grades = set(expected_students_fusa_2025.keys()).union(expected_students_girardot_2025.keys())
for grade in all_grades:
    fusa_count = expected_students_fusa_2025.get(grade, 0)
    girardot_count = expected_students_girardot_2025.get(grade, 0)
    total_students_by_grade_2025[grade] = fusa_count + girardot_count

In [3]:
total_students_by_grade_2024

{'8': 29,
 '5': 27,
 '6': 36,
 '10': 17,
 '9': 16,
 '7': 38,
 '1': 32,
 '2': 22,
 '11': 13,
 '4': 40,
 '3': 32}

In [4]:
total_students_by_grade_2025

{'8': 33,
 '5': 39,
 '6': 25,
 '10': 14,
 '9': 22,
 '7': 36,
 '1': 18,
 '2': 25,
 '11': 13,
 '4': 36,
 '3': 19}

In [5]:
con = duckdb.connect()

In [6]:
# Read pandas files
students_2024_df = pd.read_csv('data/raw/estudiantes/estudiantes_2024.csv')
students_2025_df = pd.read_csv('data/raw/estudiantes/estudiantes_2025.csv')


students_2024_df = students_2024_df[
    (~students_2024_df['grado'].isin(['Prejardín', 'Jardín', 'Transición']))
]

students_2025_df = students_2025_df[
    (~students_2025_df['grado'].isin(['Prejardín', 'Jardín', 'Transición']))
]


In [7]:
girardot_count_2024 = students_2024_df[students_2024_df['sede'] == 'Girardot'].shape[0]
fusa_count_2024 = students_2024_df[students_2024_df['sede'] == 'Fusagasugá'].shape[0]

# Asserting the counts
assert girardot_count_2024 == sum(expected_students_girardot_2024.values()), "Count mismatch for Girardot 2024"
assert fusa_count_2024 == sum(expected_students_fusa_2024.values()), "Count mismatch for Fusa 2024"

In [8]:
girardot_count_2025 = students_2025_df[students_2025_df['sede'] == 'Girardot'].shape[0]
fusa_count_2025 = students_2025_df[students_2025_df['sede'] == 'Fusagasugá'].shape[0]

# Asserting the counts
assert girardot_count_2025 == sum(expected_students_girardot_2025.values()), "Count mismatch for Girardot 2025"
assert fusa_count_2025 == sum(expected_students_fusa_2025.values()), "Count mismatch for Fusa 2025"

In [9]:
girardot_count_2024_by_grade = (
    students_2024_df[students_2024_df['sede'] == 'Girardot']
    .groupby('grado')
    .size()
    .to_dict()
)
assert girardot_count_2024_by_grade == expected_students_girardot_2024, "Grade count mismatch for Girardot 2024"

fusa_count_2024_by_grade = (
    students_2024_df[students_2024_df['sede'] == 'Fusagasugá']
    .groupby('grado')
    .size()
    .to_dict()
)
assert fusa_count_2024_by_grade == expected_students_fusa_2024, "Grade count mismatch for Fusagasugá 2024"

In [10]:
girardot_count_2025_by_grade = (
    students_2025_df[students_2025_df['sede'] == 'Girardot']
    .groupby('grado')
    .size()
    .to_dict()
)
# Convert keys to string
girardot_count_2025_by_grade = {str(k): v for k, v in girardot_count_2025_by_grade.items()}
assert girardot_count_2025_by_grade == expected_students_girardot_2025


fusa_count_2025_by_grade = (
    students_2025_df[students_2025_df['sede'] == 'Fusagasugá']
    .groupby('grado')
    .size()
    .to_dict()
)
fusa_count_2025_by_grade = {str(k): v for k, v in fusa_count_2025_by_grade.items()}
assert fusa_count_2025_by_grade == expected_students_fusa_2025


In [11]:
total_count_2025_by_grade = (
    students_2025_df
    .groupby('grado')
    .size()
    .to_dict()
)
total_count_2025_by_grade = {str(k): v for k, v in total_count_2025_by_grade.items()}
assert total_students_by_grade_2025 == total_count_2025_by_grade


In [12]:
total_count_2024_by_grade = (
    students_2024_df
    .groupby('grado')
    .size()
    .to_dict()
)
total_count_2024_by_grade = {str(k): v for k, v in total_count_2024_by_grade.items()}
assert total_students_by_grade_2024 == total_count_2024_by_grade

## Verificar que para cada estudiante de 2024 y 2025 estén los datos

In [13]:
students_2025_2025 = pd.read_csv('data/raw/estudiantes/estudiantes_2024_2025.csv')
docs_2024_2025 = set(students_2025_2025['documento_identificación'])

In [14]:
not_in_all_df = students_2024_df[~students_2024_df['documento_identificación'].isin(docs_2024_2025)]
assert not_in_all_df.empty, "Hay estudiantes de 2024 que no están en el archivo de todos los estudiantes de 2024 y 2025"

## Excepción

Hay un estudiante que tiene documento de identificación diferente en año 2024 y 2025 (1023942729,1031833774)

In [None]:
## Apply hash to 1031833774
# Aplicar hash al documento 1031833774
hash_1031833774 = HashUtility.hash_stable("1031833774")
print(f"Hash para documento 1031833774: {hash_1031833774}")

hash_1023942729 = HashUtility.hash_stable("1023942729")
print(f"Hash para documento 1023942729: {hash_1023942729}")



Hash para documento 1031833774: c5fcc9a2b1087f1c855cc8c85fa4446f2ad80e0e4734c0afbe854332acc1c22f
Hash para documento 1023942729: b590e0984547ece1eda24fa512647a538cac6f79b4b3d59c0972638354830fe5


In [15]:
not_in_all_df = students_2025_df[~students_2025_df['documento_identificación'].isin(docs_2024_2025)]

# Remove from not_in_all_df the student with document 1031833774
not_in_all_df = not_in_all_df[not_in_all_df['documento_identificación'] != 1031833774]

assert not_in_all_df.empty, "Hay estudiantes de 2025 que no están en el archivo de todos los estudiantes de 2024 y 2025"

## Verificación con moodle

In [16]:
parquet_users_path="data/raw/moodle/2024/Users/mdlvf_user.parquet"
parquet_user_info_path="data/raw/moodle/2024/Users/mdlvf_user_info_data.parquet"

sql = f"""
SELECT 
    u.id AS UserID,
    u.idnumber AS documento_identificación,
    CONCAT(u.firstname, ' ', u.lastname) AS "Nombre Completo",
    u.city AS Sede,
    to_timestamp(u.firstaccess) AS "Fecha Primer Acceso",
    to_timestamp(u.lastaccess) AS "Feha Último Acceso",
    to_timestamp(u.lastlogin) AS "Fecha Último Inicio de Sesión",
    to_timestamp(u.timecreated) AS "Fecha Creación"
FROM 
    '{parquet_users_path}' u
JOIN 
    '{parquet_user_info_path}' uid 
    ON u.id = uid.userid
WHERE 
    uid.data = 'Estudiante'
    AND u.idnumber <> ''
    AND u.deleted = 0;
"""
df_2024 = con.execute(sql).df()
df_2024["documento_identificación"] = df_2024["documento_identificación"].astype(str).str.replace(r"\s+", "", regex=True)
df_2024["documento_identificación_hash"] = df_2024["documento_identificación"].apply(HashUtility.hash_stable)

In [17]:
missing_in_moodle_2024 = students_2024_df[~students_2024_df['documento_identificación'].astype(str).isin(df_2024['documento_identificación'])]
assert missing_in_moodle_2024.empty, "Hay estudiantes de 2024 que no están en el moodle 2024"

In [18]:
parquet_users_path="data/raw/moodle/2025/Users/mdlvf_user.parquet"
parquet_user_info_path="data/raw/moodle/2025/Users/mdlvf_user_info_data.parquet"

sql = f"""
SELECT 
    u.id AS UserID,
    u.idnumber AS documento_identificación,
    CONCAT(u.firstname, ' ', u.lastname) AS "Nombre Completo",
    u.city AS Sede,
    to_timestamp(u.firstaccess) AS "Fecha Primer Acceso",
    to_timestamp(u.lastaccess) AS "Feha Último Acceso",
    to_timestamp(u.lastlogin) AS "Fecha Último Inicio de Sesión",
    to_timestamp(u.timecreated) AS "Fecha Creación"
FROM 
    '{parquet_users_path}' u
JOIN 
    '{parquet_user_info_path}' uid 
    ON u.id = uid.userid
WHERE 
    uid.data = 'Estudiante'
    AND u.idnumber <> ''
    AND u.deleted = 0;
"""
df_2025 = con.execute(sql).df()
df_2025["documento_identificación"] = df_2025["documento_identificación"].astype(str).str.replace(r"\s+", "", regex=True)
df_2025["documento_identificación_hash"] = df_2025["documento_identificación"].apply(HashUtility.hash_stable)

In [19]:
missing_in_moodle_2025 = students_2025_df[~students_2025_df['documento_identificación'].astype(str).isin(df_2025['documento_identificación'])]
assert missing_in_moodle_2025.empty, "Hay estudiantes de 2025 que no están en el moodle 2025"

## Verificación con Edukrea

In [20]:
df_user_edukrea = pd.read_parquet('data/raw/moodle/Edukrea/Users/mdl_user.parquet')

In [21]:
missing_in_edukrea_2025 = students_2025_df[
    ~students_2025_df['documento_identificación'].astype(str).isin(df_user_edukrea['idnumber'].astype(str))
]
assert missing_in_edukrea_2025.empty, "Hay estudiantes de 2025 que no están en el Edukrea"

## Verificación de estudiantes inscritos en archivos interim

In [22]:
students_2024_hashed = pd.read_csv('data/interim/estudiantes/estudiantes_2024_hashed.csv')
students_2025_hashed = pd.read_csv('data/interim/estudiantes/estudiantes_2025_hashed.csv')
students_clean = pd.read_csv('data/interim/estudiantes/estudiantes_clean.csv')
students_imputed_encoded = pd.read_csv('data/interim/estudiantes/estudiantes_imputed_encoded.csv')
students_imputed = pd.read_csv('data/interim/estudiantes/estudiantes_imputed.csv')

In [23]:
students_2024_hashed = students_2024_hashed[
    (~students_2024_hashed['grado'].isin(['Prejardín', 'Jardín', 'Transición']))
]

students_2025_hashed = students_2025_hashed[
    (~students_2025_hashed['grado'].isin(['Prejardín', 'Jardín', 'Transición']))
]

In [24]:
# Apply hash to raw files to match the hashed files
students_2024_df['documento_identificación_hash'] = students_2024_df['documento_identificación'].apply(HashUtility.hash_stable)
students_2025_df['documento_identificación_hash'] = students_2025_df['documento_identificación'].apply(HashUtility.hash_stable)
students_2025_2025['documento_identificación_hash'] = students_2025_2025['documento_identificación'].apply(HashUtility.hash_stable)

In [25]:
missing_in_students_2024_hashed = students_2024_hashed[~students_2024_hashed['documento_identificación'].astype(str).isin(students_2024_df['documento_identificación_hash'])]
assert missing_in_students_2024_hashed.empty, "Hay estudiantes de 2024 hash que no están en el archivo de estudiantes 2024 hasheados"

In [26]:
missing_in_students_2025_hashed = students_2025_hashed[~students_2025_hashed['documento_identificación'].astype(str).isin(students_2025_df['documento_identificación_hash'])]
assert missing_in_students_2025_hashed.empty, "Hay estudiantes de 2025 hash que no están en el archivo de estudiantes 2025 hasheados"

In [27]:
missing_in_students_clean = students_clean[~students_clean['documento_identificación'].astype(str).isin(students_2025_2025['documento_identificación_hash'])]
assert missing_in_students_clean.empty, "Hay estudiantes de hash que no están en el archivo de estudiantes hasheados"

In [28]:
missing_in_students_imputed = students_imputed[~students_imputed['documento_identificación'].astype(str).isin(students_2025_2025['documento_identificación_hash'])]
assert missing_in_students_imputed.empty, "Hay estudiantes de hash que no están en el archivo de estudiantes hasheados"

In [29]:
missing_in_students_imputed_encoded = students_imputed_encoded[~students_imputed_encoded['documento_identificación'].astype(str).isin(students_2025_2025['documento_identificación_hash'])]
assert missing_in_students_imputed_encoded.empty, "Hay estudiantes de hash que no están en el archivo de estudiantes hasheados"

## Verificación con enrollments

In [30]:
enrollments = pd.read_csv('data/interim/estudiantes/enrollments.csv')

enrollments_2024 = enrollments[enrollments['year'] == 2024]
enrollments_2025 = enrollments[enrollments['year'] == 2025]

In [31]:
assert len(students_2025_df) == students_2025, f"El número de filas en students_2025_df ({len(students_2025_df)}) no coincide con students_2025 ({students_2025})"
assert len(students_2024_df) == students_2024, f"El número de filas en students_2024_df ({len(students_2024_df)}) no coincide con students_2024 ({students_2024})"

In [32]:
# Verificar que el conteo de filas por grado coincida entre students_2025_df y enrollments_2025
# Agrupar y contar por grado en students_2025_df
students_2025_by_grade = students_2025_df.groupby('grado').size().sort_index()
# Agrupar y contar por id_grado en enrollments_2025
enrollments_2025_by_grade = enrollments_2025.groupby('id_grado').size().sort_index()
# Convertir los índices a string para asegurar la comparación
students_2025_by_grade.index = students_2025_by_grade.index.astype(str)
enrollments_2025_by_grade.index = enrollments_2025_by_grade.index.astype(str)
# Comparar ambos conteos
assert students_2025_by_grade.equals(enrollments_2025_by_grade), "Los conteos por grado no coinciden entre students_2025_df y enrollments_2025"

In [33]:
# Verificar que el conteo de filas por grado coincida entre students_2024_df y enrollments_2024
# Agrupar y contar por grado en students_2024_df
students_2024_by_grade = students_2024_df.groupby('grado').size().sort_index()
# Agrupar y contar por id_grado en enrollments_2024
enrollments_2024_by_grade = enrollments_2024.groupby('id_grado').size().sort_index()
# Convertir los índices a string para asegurar la comparación
students_2024_by_grade.index = students_2024_by_grade.index.astype(str)
enrollments_2024_by_grade.index = enrollments_2024_by_grade.index.astype(str)

# Ordenar los conteos por grado
students_2024_by_grade = students_2024_by_grade.sort_index()
enrollments_2024_by_grade = enrollments_2024_by_grade.sort_index()

# Comparar ambos conteos
assert students_2024_by_grade.equals(enrollments_2024_by_grade), "Los conteos por grado no coinciden entre students_2024_df y enrollments_2024"

## Verificación de cursos

In [34]:
students_courses = pd.read_csv('data/interim/moodle/student_courses.csv')
students_courses

Unnamed: 0,platform,moodle_user_id,year,id_grado,course_id,course_name,documento_identificación,sede,id_asignatura
0,moodle,1561,2024,1,23,Educación Física y Deportes 1,b8bd5170750ee52b2456363db9b3987fe2af7b89842474...,Fusagasugá,8
1,moodle,1563,2024,1,174,Creatividad e innovación 1,13d065def180ab40a3e0acc899c5524b4a437af268c0ec...,Fusagasugá,6
2,moodle,1563,2024,1,239,Artes 1,13d065def180ab40a3e0acc899c5524b4a437af268c0ec...,Fusagasugá,10
3,moodle,1564,2024,1,20,Lengua Castellana 1,65778a71ac202c4eada1eb1cd76b30206b0546c7a34b25...,Fusagasugá,4
4,moodle,1564,2024,1,239,Artes 1,65778a71ac202c4eada1eb1cd76b30206b0546c7a34b25...,Fusagasugá,10
...,...,...,...,...,...,...,...,...,...
8179,edukrea,95,2025,4,45,Matemáticas 4,66cd69c0b9b0e4e6c2c95c0120c915b85181789a09e407...,Fusagasugá,3
8180,edukrea,114,2025,5,46,Matemáticas 5,f9968176048580150899e0757c02c993d73d738e87cce0...,Fusagasugá,3
8181,edukrea,111,2025,5,24,Ciencias Sociales 5,5d6d51d0e60ac5aead8d9c61f3cf11aaaa9709590ac0a6...,Fusagasugá,2
8182,edukrea,111,2025,5,57,Lenguaje 5,5d6d51d0e60ac5aead8d9c61f3cf11aaaa9709590ac0a6...,Fusagasugá,4


In [43]:
students_moodle = students_courses[students_courses['platform'] == 'moodle']
students_edukrea = students_courses[students_courses['platform'] == 'edukrea']

In [44]:
students_moodle_2025 = students_moodle[students_moodle['year'] == 2025]
students_moodle_2024 = students_moodle[students_moodle['year'] == 2024]

In [45]:
assert not students_courses.isnull().any().any(), "students_courses_new contiene valores null en alguna columna"
assert students_moodle.shape[0] == 7068, f"students_moodle should have 7068 rows actually has {students_moodle.shape[0]}"
assert students_moodle_2024.shape[0] == 3654, f"students_moodle_2024 should have 3654 rows actually has {students_moodle_2024.shape[0]}"
assert students_moodle_2025.shape[0] == 3414, f"students_moodle_2025 should have 3414 rows actually has {students_moodle_2025.shape[0]}"

In [46]:
assert students_edukrea.shape[0] == 1116, f"students_edukrea should have 1116 rows actually has {students_edukrea.shape[0]}"

In [47]:
# Diccionario para guardar los resultados por grado
inconsistencias_por_grado = {}

for grado in range(1, 12):  # Del 1 al 11 inclusive
    filtro = (students_courses['id_grado'] == grado) & (~students_courses['course_name'].str.contains(str(grado)))
    inconsistencias = students_courses[filtro]
    
    if not inconsistencias.empty:
        inconsistencias_por_grado[grado] = inconsistencias

In [48]:
for grado, df in inconsistencias_por_grado.items():
    print(f"\nInconsistencias para grado {grado}:")
    print(df[['documento_identificación', 'course_name']].head())

In [49]:
# Diccionario para guardar los resultados por grado
inconsistencias_por_grado = {}

for grado in range(1, 12):  # Del 1 al 11 inclusive
    filtro = (students_edukrea['id_grado'] == grado) & (~students_edukrea['course_name'].str.contains(str(grado)))
    inconsistencias = students_edukrea[filtro]
    
    if not inconsistencias.empty:
        inconsistencias_por_grado[grado] = inconsistencias

In [50]:
for grado, df in inconsistencias_por_grado.items():
    print(f"\nInconsistencias para grado {grado}:")
    print(df[['documento_identificación', 'course_name']])

## Chequeo de módulos que contengan fecha de inicio

In [51]:
edukrea_modules_df = pd.read_csv('data/interim/moodle/modules_active_edukrea.csv')
moodle_modules_df = pd.read_csv('data/interim/moodle/modules_active_moodle.csv')

In [52]:
assert edukrea_modules_df[edukrea_modules_df['planned_start_date'].isnull()].empty, "Hay módulos en Edukrea sin fecha de inicio de semana"
assert edukrea_modules_df[edukrea_modules_df['planned_end_date'].isnull()].empty, "Hay módulos en Edukrea sin fecha de fin de semana"

In [53]:
assert moodle_modules_df[moodle_modules_df['planned_start_date'].isnull()].empty, "Hay módulos en Moodle sin fecha de inicio de semana"
assert moodle_modules_df[moodle_modules_df['planned_end_date'].isnull()].empty, "Hay módulos en Moodle sin fecha de fin de semana"

In [54]:
moodle_modules_df.columns

Index(['year', 'course_id', 'course_module_id', 'sede', 'id_grado',
       'id_asignatura', 'asignatura_name', 'course_name', 'section_id',
       'section_name', 'module_type_id', 'instance', 'module_creation_date',
       'module_type', 'module_name', 'week', 'period', 'is_interactive',
       'is_in_english', 'planned_start_date', 'planned_end_date'],
      dtype='object')

## Verificar calendario académico

In [55]:
# Leer CSVs
calendario_df = pd.read_csv("data/raw/tablas_maestras/calendario_escolar.csv", dayfirst=True)
vacaciones_df = pd.read_csv("data/raw/tablas_maestras/vacaciones_festivos.csv", dayfirst=True)

# Limpiar columnas
calendario_df.columns = calendario_df.columns.str.strip().str.lower()
vacaciones_df.columns = vacaciones_df.columns.str.strip().str.lower()

# Convertir fechas a datetime
calendario_df["inicio"] = pd.to_datetime(calendario_df["inicio"], dayfirst=True)
vacaciones_df["inicio"] = pd.to_datetime(vacaciones_df["inicio"], dayfirst=True)
vacaciones_df["fin"] = pd.to_datetime(vacaciones_df["fin"], dayfirst=True)

# Construir periods
periods = {}

for year in calendario_df["año"].unique():
    year_data = calendario_df[(calendario_df["año"] == year) & (calendario_df["semana"] == 1)]
    year_periods = {}

    # Fechas de inicio de bimestres
    for _, row in year_data.iterrows():
        bimestre = int(row["bimestre"])
        start_date = pd.Timestamp(row["inicio"], tz="America/Bogota")
        year_periods[f"p{bimestre}_start"] = start_date

    # Vacaciones como Timestamps con hora incluida
    year_vacaciones = vacaciones_df[vacaciones_df["año"] == year]
    vacations_list = [
        (
            pd.Timestamp(row["inicio"].strftime("%Y-%m-%dT00:00:00"), tz="America/Bogota"),
            pd.Timestamp(row["fin"].strftime("%Y-%m-%dT23:59:00"), tz="America/Bogota")
        )
        for _, row in year_vacaciones.iterrows()
    ]
    year_periods["vacations"] = vacations_list

    periods[year] = year_periods

# Resultado final
periods

{np.int64(2024): {'p1_start': Timestamp('2024-02-01 00:00:00-0500', tz='America/Bogota'),
  'p2_start': Timestamp('2024-04-15 00:00:00-0500', tz='America/Bogota'),
  'p3_start': Timestamp('2024-07-08 00:00:00-0500', tz='America/Bogota'),
  'p4_start': Timestamp('2024-09-09 00:00:00-0500', tz='America/Bogota'),
  'vacations': [(Timestamp('2024-03-23 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-04-01 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-05-01 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-05-01 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-05-17 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-05-17 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-06-15 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-07-08 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-08-07 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-08-07 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-08-18 00:0

## Consulta de docentes en cursos

In [58]:
year = 2025
course_file = f"data/raw/moodle/{year}/Course/mdlvf_course.parquet"
context_file = f"data/raw/moodle/{year}/System/mdlvf_context.parquet"
role_assignments_file = f"data/raw/moodle/{year}/Users/mdlvf_role_assignments.parquet"
role_file = f"data/raw/moodle/{year}/Users/mdlvf_role.parquet"
user_file = f"data/raw/moodle/{year}/Users/mdlvf_user.parquet"
unique_courses_file = "data/interim/moodle/unique_courses_moodle.csv"

In [59]:
sql = f"""
SELECT 
    c.id AS courseid,
    c.fullname AS coursename,
    string_agg(DISTINCT u.firstname || ' ' || u.lastname, ', ') AS teacher
FROM '{course_file}' c
JOIN '{unique_courses_file}' uc ON c.id = uc.course_id
JOIN '{context_file}' ctx ON ctx.instanceid = c.id AND ctx.contextlevel = 50
JOIN '{role_assignments_file}' ra ON ra.contextid = ctx.id
JOIN '{role_file}' r ON r.id = ra.roleid AND r.shortname = 'editingteacher'
JOIN '{user_file}' u ON u.id = ra.userid
WHERE c.visible = 1
  AND NOT (u.firstname = 'Provisional' AND u.lastname = 'Girardot')
GROUP BY c.id, c.fullname
ORDER BY c.id;
"""

df = con.execute(sql).df()
df

Unnamed: 0,courseid,coursename,teacher
0,17,Matemáticas 1,Wilder Mauricio Ussa Santana
1,18,Ciencias Naturales y Educación Ambiental 1,Ximena Alejandra León Dicelis
2,19,Ciencias Sociales 1,Angie Jimena Gómez Arévalo
3,20,Lengua Castellana 1,Yessika Alejandra Morales García
4,21,English 1st,Yessika Alejandra Morales García
...,...,...,...
239,562,Centro de Interés Artístico 9,"Daniel Felipe Sánchez Saldarriaga, Álvaro Alex..."
240,563,Tecnologías Informáticas 9,Vanessa Liliana Sarabia Vargas
241,564,Integralidad 9,Jhojan Stiven Rubiano Capador
242,566,Física 10,Dayron Mateo González Penagos


## Check Logs Students

In [68]:
student_logs = pd.read_csv("data/interim/moodle/student_logs.csv")
student_logs

Unnamed: 0,year,id,eventname,component,action,target,objectid,contextinstanceid,userid,documento_identificación,courseid,timecreated,origin,ip,platform,moodle_user_id,edukrea_user_id
0,2024,18911591,\core\event\course_viewed,core,viewed,course,,204,1659,fdddfbf896fa113a54f649dcd5ee1a03fc2cc1d48472f7...,204,1706880368,web,161.10.197.179,moodle,1659,0
1,2024,18911592,\mod_resource\event\course_module_viewed,mod_resource,viewed,course_module,2727.0,101144,1659,fdddfbf896fa113a54f649dcd5ee1a03fc2cc1d48472f7...,204,1706880373,web,161.10.197.179,moodle,1659,0
2,2024,18911594,\core\event\course_viewed,core,viewed,course,,245,1659,fdddfbf896fa113a54f649dcd5ee1a03fc2cc1d48472f7...,245,1706880417,web,161.10.197.179,moodle,1659,0
3,2024,18911595,\mod_forum\event\course_module_viewed,mod_forum,viewed,course_module,2573.0,34931,1659,fdddfbf896fa113a54f649dcd5ee1a03fc2cc1d48472f7...,245,1706880421,web,161.10.197.179,moodle,1659,0
4,2024,18911598,\core\event\course_viewed,core,viewed,course,,245,1659,fdddfbf896fa113a54f649dcd5ee1a03fc2cc1d48472f7...,245,1706880429,web,161.10.197.179,moodle,1659,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
680790,2025,439703,\core\event\course_module_completion_updated,core,updated,course_module_completion,5698.0,5899,76,66a3ad7beed7ad01c5cccf25abc9b0dc88659830772679...,30,1744211785,web,190.71.155.42,edukrea,0,76
680791,2025,439704,\core\event\user_graded,core,graded,user,76109.0,30,76,66a3ad7beed7ad01c5cccf25abc9b0dc88659830772679...,30,1744211785,web,190.71.155.42,edukrea,0,76
680792,2025,439706,\mod_hvp\event\attempt_submitted,mod_hvp,submitted,attempt,,5899,76,66a3ad7beed7ad01c5cccf25abc9b0dc88659830772679...,30,1744211785,web,190.71.155.42,edukrea,0,76
680793,2025,439707,\core\event\user_graded,core,graded,user,76100.0,30,74,b6005ef089582419d97090147411273c47991bd937856d...,30,1744211794,web,190.71.155.42,edukrea,0,74


In [69]:
assert student_logs['documento_identificación'].notna().all()

In [70]:
student_logs_moodle = student_logs[student_logs['platform'] == 'moodle']
student_logs_edukrea = student_logs[student_logs['platform'] == 'edukrea']

In [71]:
assert student_logs_moodle.shape[0] == 537014
assert student_logs_moodle['moodle_user_id'].notna().all()
assert student_logs_moodle['moodle_user_id'].dtype == int
# edukrea_user_id must be filled with 0
assert student_logs_moodle['edukrea_user_id'].dtype == int
assert (student_logs_moodle['edukrea_user_id'] == 0).all()

In [75]:
assert student_logs_edukrea.shape[0] == 143781
assert student_logs_edukrea['edukrea_user_id'].notna().all()
assert student_logs_edukrea['edukrea_user_id'].dtype == int
# edukrea_user_id must be filled with 0
assert student_logs_edukrea['moodle_user_id'].dtype == int
assert (student_logs_edukrea['moodle_user_id'] == 0).all()

## Check Logs teachers

In [76]:
teacher_logs = pd.read_csv("data/interim/moodle/teacher_logs.csv")
teacher_logs

Unnamed: 0,year,id,eventname,component,action,target,objectid,contextinstanceid,userid,courseid,timecreated,origin,ip,platform
0,2024,18824284,\gradereport_grader\event\grade_report_viewed,gradereport_grader,viewed,grade_report,,42,1513,42,1704827338,web,190.171.76.67,moodle
1,2024,18824290,\core\event\course_viewed,core,viewed,course,,235,1505,235,1704904631,web,181.55.241.44,moodle
2,2024,18824293,\core\event\course_viewed,core,viewed,course,,271,45,271,1704923291,web,191.156.236.106,moodle
3,2024,18824294,\core\event\course_viewed,core,viewed,course,,206,45,206,1704923373,web,191.156.236.106,moodle
4,2024,18824295,\mod_assign\event\course_module_viewed,mod_assign,viewed,course_module,10467.0,44130,45,206,1704923408,web,191.156.236.106,moodle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1923506,2025,439646,\core\event\course_module_updated,core,updated,course_module,5926.0,5926,21,30,1744211635,web,191.156.55.137,edukrea
1923507,2025,439646,\core\event\course_module_updated,core,updated,course_module,5926.0,5926,21,30,1744211635,web,191.156.55.137,edukrea
1923508,2025,439647,\mod_hvp\event\course_module_viewed,mod_hvp,viewed,course_module,4661.0,5926,21,30,1744211643,web,191.156.55.137,edukrea
1923509,2025,439647,\mod_hvp\event\course_module_viewed,mod_hvp,viewed,course_module,4661.0,5926,21,30,1744211643,web,191.156.55.137,edukrea


In [44]:
assert teacher_logs['userid'].notna().all()
assert teacher_logs['userid'].dtype == int

In [45]:
teacher_logs_moodle = teacher_logs[teacher_logs["platform"] == "moodle"]
teacher_logs_edukrea = teacher_logs[teacher_logs["platform"] == "edukrea"]

In [47]:
df_moodle.shape

(1723800, 13)

In [49]:
df_edukrea.shape

(199711, 13)

In [50]:
assert teacher_logs_moodle.shape[0] == 1723800

In [51]:
assert teacher_logs_edukrea.shape[0] == 199711

## Check HVP Data

In [2]:
edukrea_hvp = pd.read_csv('data/interim/moodle/hvp_edukrea.csv')
moodle_hvp = pd.read_csv('data/interim/moodle/hvp_moodle.csv')

In [3]:
edukrea_modules_df = pd.read_csv('data/interim/moodle/modules_active_edukrea.csv')
moodle_modules_df = pd.read_csv('data/interim/moodle/modules_active_moodle.csv')

In [4]:
edukrea_modules_hvp_df = edukrea_modules_df[edukrea_modules_df['module_type'] == 'hvp']
moodle_modules_hvp_df = moodle_modules_df[moodle_modules_df['module_type'] == 'hvp']

In [5]:
assert len(edukrea_hvp) == len(edukrea_modules_hvp_df), "El número de filas en edukrea_hvp no coincide con edukrea_modules_hvp_df"

In [6]:
assert len(moodle_hvp) == len(moodle_modules_hvp_df), "El número de filas en edukrea_hvp no coincide con edukrea_modules_hvp_df"

In [7]:
edukrea_modules_featured = pd.read_csv('data/interim/moodle/modules_edukrea_featured.csv')
moodle_modules_featured = pd.read_csv('data/interim/moodle/modules_moodle_featured.csv')

In [8]:
assert len(edukrea_modules_featured) == len(edukrea_modules_df), "El número de filas en edukrea_modules_featured no coincide con edukrea_modules_df"

In [9]:
assert len(moodle_modules_featured) == len(moodle_modules_df), "El número de filas en moodle_modules_featured no coincide con moodle_modules_df"

In [11]:
edukrea_modules_featured['total_estudiantes'].unique()

array([26, 18, 14, 22, 13, 39, 25, 33, 36])

In [12]:
moodle_modules_featured['total_estudiantes'].unique()

array([15, 17, 20, 22, 10, 16, 18,  7, 12, 13, 24, 21,  9, 14,  8,  4])

## Verificar cursos y carga horaria

In [53]:
df_carga = pd.read_csv('data/raw/tablas_maestras/carga_horaria.csv')
df_carga

Unnamed: 0,sede,year,id_grado,asignatura,id_asignatura,intensidad,docente,id_docente
0,Fusagasugá,2021,1,Ciencias Naturales,1,3,Camila Andrea Santamaría Olaya,
1,Fusagasugá,2021,2,Ciencias Naturales,1,3,Camila Andrea Santamaría Olaya,
2,Fusagasugá,2021,3,Ciencias Naturales,1,3,Yuli Katherine Cubillos Velez,
3,Fusagasugá,2021,4,Ciencias Naturales,1,3,José David Amaya Herrera,
4,Fusagasugá,2021,5,Ciencias Naturales,1,3,Jose David Amaya Herrera,
...,...,...,...,...,...,...,...,...
676,Girardot,2025,5,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
677,Girardot,2025,6,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
678,Girardot,2025,7,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
679,Girardot,2025,8,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,


In [54]:
moodle_courses = pd.read_csv('data/interim/moodle/unique_courses_moodle.csv')
moodle_courses

Unnamed: 0,year,id_grado,course_id,course_name,sede,id_asignatura
0,2024,1,161,Aprendizaje Basado en Proyectos 1,Fusagasugá,7
1,2024,1,17,Matemáticas 1,Fusagasugá,3
2,2024,1,18,Ciencias Naturales y Educación Ambiental 1,Fusagasugá,1
3,2024,1,239,Artes 1,Fusagasugá,10
4,2024,1,19,Ciencias Sociales 1,Fusagasugá,2
...,...,...,...,...,...,...
469,2025,8,496,Aprendizaje Basado en Investigación 8,Girardot,14
470,2025,3,328,English 3rd,Girardot,5
471,2025,9,564,Integralidad 9,Girardot,12
472,2025,9,554,Ciencias Sociales 9,Girardot,2


In [55]:
carga_keys = set(zip(
    df_carga["year"],
    df_carga["id_grado"],
    df_carga["id_asignatura"],
    df_carga["sede"]
))

# Verificar si cada fila de moodle_courses existe en carga_keys
moodle_courses["existe_en_carga"] = moodle_courses.apply(
    lambda row: (row["year"], row["id_grado"], row["id_asignatura"], row["sede"]) in carga_keys,
    axis=1
)

In [56]:
moodle_courses

Unnamed: 0,year,id_grado,course_id,course_name,sede,id_asignatura,existe_en_carga
0,2024,1,161,Aprendizaje Basado en Proyectos 1,Fusagasugá,7,True
1,2024,1,17,Matemáticas 1,Fusagasugá,3,True
2,2024,1,18,Ciencias Naturales y Educación Ambiental 1,Fusagasugá,1,True
3,2024,1,239,Artes 1,Fusagasugá,10,True
4,2024,1,19,Ciencias Sociales 1,Fusagasugá,2,True
...,...,...,...,...,...,...,...
469,2025,8,496,Aprendizaje Basado en Investigación 8,Girardot,14,True
470,2025,3,328,English 3rd,Girardot,5,True
471,2025,9,564,Integralidad 9,Girardot,12,True
472,2025,9,554,Ciencias Sociales 9,Girardot,2,True


In [57]:
no_encontrados = moodle_courses[~moodle_courses["existe_en_carga"]]
no_encontrados

Unnamed: 0,year,id_grado,course_id,course_name,sede,id_asignatura,existe_en_carga
236,2025,11,503,Ciencias Naturales Integradas 11,Fusagasugá,20,False
244,2025,10,368,Ciencias Naturales Integradas 10,Fusagasugá,20,False


## Check Modules featured

In [2]:
df_featured = pd.read_csv("data/interim/moodle/modules_featured.csv")
df_featured

Unnamed: 0,year,course_id,course_module_id,sede,id_grado,id_asignatura,asignatura_name,course_name,section_id,section_name,...,min_views_per_student,max_views_per_student,median_views_per_student,percent_students_interacted,percent_students_viewed,interaction_to_view_ratio,teacher_accessed_before_start,teacher_updated_before_start,teacher_updated_during_week_planned,teacher_active
0,2024,17,36166,Fusagasugá,1,3,Matemáticas,Matemáticas 1,93,Semana 5,...,0,0,0,0.000000,0.000000,0.000000,0,0,0,0
1,2024,17,31855,Fusagasugá,1,3,Matemáticas,Matemáticas 1,1359,Semana 3.,...,0,0,0,0.000000,0.000000,0.000000,0,1,0,1
2,2024,17,26132,Fusagasugá,1,3,Matemáticas,Matemáticas 1,1957,Semana 28,...,9,9,9,6.666667,6.666667,0.333333,0,0,1,1
3,2024,174,37373,Fusagasugá,1,6,Creatividad e innovación,Creatividad e innovación 1,3948,WEEK 2,...,0,0,0,0.000000,0.000000,0.000000,0,1,0,1
4,2024,17,36165,Fusagasugá,1,3,Matemáticas,Matemáticas 1,93,Semana 5,...,0,0,0,0.000000,0.000000,0.000000,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37550,2025,25,2405,Fusagasugá,6,2,Ciencias Sociales,Ciencias Sociales 6,231,1. Los Misterios del Cosmos: Explorando el Ori...,...,2,32,4,11.111111,83.333333,0.038095,1,1,0,1
37551,2025,28,2392,Girardot,9,2,Ciencias Sociales,Ciencias Sociales 9,592,5. Sectores terciario y cuaternario: Del servi...,...,1,1,1,0.000000,25.000000,0.000000,0,1,0,1
37552,2025,28,2392,Fusagasugá,9,2,Ciencias Sociales,Ciencias Sociales 9,592,5. Sectores terciario y cuaternario: Del servi...,...,1,1,1,0.000000,5.555556,0.000000,0,1,0,1
37553,2025,28,2402,Girardot,9,2,Ciencias Sociales,Ciencias Sociales 9,589,2. El proceso productivo: De la materia prima ...,...,1,1,1,0.000000,50.000000,0.000000,0,1,0,1


In [3]:
df_featured_moodle = df_featured[df_featured["platform"] == "moodle"]
df_featured_edukrea = df_featured[df_featured["platform"] == "edukrea"]

In [4]:
df_modules_active_edukrea = pd.read_csv("data/interim/moodle/modules_active_edukrea.csv")
df_modules_active_moodle = pd.read_csv("data/interim/moodle/modules_active_moodle.csv")

In [6]:
assert df_featured_moodle.shape[0] == df_modules_active_moodle.shape[0]
assert df_featured_edukrea.shape[0] == df_modules_active_edukrea.shape[0]

## Check Student Modules

In [76]:
student_modules_df = pd.read_csv("data/interim/moodle/student_modules.csv")

In [77]:
student_modules_df_moodle = student_modules_df[student_modules_df["platform"] == "moodle"]
student_modules_df_edukrea = student_modules_df[student_modules_df["platform"] == "edukrea"]

In [93]:
assert student_modules_df_moodle.shape[0] == 502501
assert student_modules_df_edukrea.shape[0] == 75528
assert not student_modules_df.duplicated(subset=["course_module_id", "documento_identificación", "year"]).any()

## Check Courses

In [82]:
courses_moodle = pd.read_csv('data/interim/moodle/courses_moodle.csv')
courses_moodle

Unnamed: 0,sede,id_grado,id_asignatura,asignatura_name,course_id,course_name,period,year,total_students,count_evaluation,...,avg_interactions_per_student,median_interactions_per_student,id_least_viewed_module,students_viewed_least_module,id_most_late_opened_module,days_before_start,percent_modules_out_of_date,percent_students_viewed,percent_students_interacted,percent_modules_viewed
0,Fusagasugá,1,1,Ciencias Naturales y Educación Ambiental,18,Ciencias Naturales y Educación Ambiental 1,1,2024,15,0,...,0.209677,0.0,35001.0,1.0,31677,-19754.0,0.12,0.53,0.27,1.82
1,Fusagasugá,1,1,Ciencias Naturales y Educación Ambiental,18,Ciencias Naturales y Educación Ambiental 1,1,2025,10,0,...,0.000000,0.0,149034.0,1.0,144414,-20122.0,0.03,0.30,0.00,0.31
2,Fusagasugá,1,1,Ciencias Naturales y Educación Ambiental,18,Ciencias Naturales y Educación Ambiental 1,2,2024,15,5,...,0.032258,0.0,17425.0,1.0,132921,-19828.0,0.11,0.47,0.07,1.68
3,Fusagasugá,1,1,Ciencias Naturales y Educación Ambiental,18,Ciencias Naturales y Educación Ambiental 1,3,2024,15,3,...,1.097087,0.0,139639.0,2.0,43688,-19912.0,0.28,0.87,0.73,4.20
4,Fusagasugá,1,1,Ciencias Naturales y Educación Ambiental,18,Ciencias Naturales y Educación Ambiental 1,4,2024,15,0,...,0.856164,0.0,143151.0,1.0,23528,-19975.0,0.19,0.80,0.47,2.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,Girardot,9,10,Centro de Interés Artístico,562,Centro de Interés Artístico 9,1,2025,4,0,...,,,,,146351,,0.00,0.00,0.00,0.00
1064,Girardot,9,11,Tecnologías Informáticas,563,Tecnologías Informáticas 9,1,2025,4,7,...,1.384615,0.0,151095.0,1.0,148319,-20136.0,0.08,1.00,0.75,0.33
1065,Girardot,9,12,Integralidad,564,Integralidad 9,1,2025,4,0,...,,,,,146595,,0.00,0.00,0.00,0.00
1066,Girardot,9,13,Innovación y Emprendimiento,560,Innovación y Emprendimiento 9,1,2025,4,7,...,0.000000,0.0,149989.0,1.0,149751,-20150.0,0.01,0.50,0.00,0.06


In [83]:
df_carga = pd.read_csv('data/raw/tablas_maestras/carga_horaria.csv')

In [84]:
df_carga_2025 = df_carga[df_carga['year'] == 2025]

In [85]:
courses_moodle_2025 = courses_moodle[courses_moodle['year'] == 2025]
courses_moodle_2025

Unnamed: 0,sede,id_grado,id_asignatura,asignatura_name,course_id,course_name,period,year,total_students,count_evaluation,...,avg_interactions_per_student,median_interactions_per_student,id_least_viewed_module,students_viewed_least_module,id_most_late_opened_module,days_before_start,percent_modules_out_of_date,percent_students_viewed,percent_students_interacted,percent_modules_viewed
1,Fusagasugá,1,1,Ciencias Naturales y Educación Ambiental,18,Ciencias Naturales y Educación Ambiental 1,1,2025,10,0,...,0.000000,0.0,149034.0,1.0,144414,-20122.0,0.03,0.3,0.00,0.31
6,Fusagasugá,1,2,Ciencias Sociales,19,Ciencias Sociales 1,1,2025,10,0,...,0.000000,0.0,149331.0,1.0,148167,-20122.0,0.02,0.2,0.00,0.19
11,Fusagasugá,1,3,Matemáticas,17,Matemáticas 1,1,2025,10,0,...,0.000000,0.0,150842.0,1.0,150842,-20164.0,0.01,0.1,0.00,0.06
16,Fusagasugá,1,4,Lengua Castellana,20,Lengua Castellana 1,1,2025,10,0,...,,,,,131533,,0.00,0.0,0.00,0.00
21,Fusagasugá,1,5,English,21,English 1st,1,2025,10,0,...,0.000000,0.0,147436.0,1.0,147436,-20122.0,0.00,0.1,0.00,0.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,Girardot,9,10,Centro de Interés Artístico,562,Centro de Interés Artístico 9,1,2025,4,0,...,,,,,146351,,0.00,0.0,0.00,0.00
1064,Girardot,9,11,Tecnologías Informáticas,563,Tecnologías Informáticas 9,1,2025,4,7,...,1.384615,0.0,151095.0,1.0,148319,-20136.0,0.08,1.0,0.75,0.33
1065,Girardot,9,12,Integralidad,564,Integralidad 9,1,2025,4,0,...,,,,,146595,,0.00,0.0,0.00,0.00
1066,Girardot,9,13,Innovación y Emprendimiento,560,Innovación y Emprendimiento 9,1,2025,4,7,...,0.000000,0.0,149989.0,1.0,149751,-20150.0,0.01,0.5,0.00,0.06


In [86]:
courses_moodle_2025_unique = courses_moodle_2025.drop_duplicates(subset=['id_grado', 'id_asignatura', 'sede'])
courses_moodle_2025_unique

Unnamed: 0,sede,id_grado,id_asignatura,asignatura_name,course_id,course_name,period,year,total_students,count_evaluation,...,avg_interactions_per_student,median_interactions_per_student,id_least_viewed_module,students_viewed_least_module,id_most_late_opened_module,days_before_start,percent_modules_out_of_date,percent_students_viewed,percent_students_interacted,percent_modules_viewed
1,Fusagasugá,1,1,Ciencias Naturales y Educación Ambiental,18,Ciencias Naturales y Educación Ambiental 1,1,2025,10,0,...,0.000000,0.0,149034.0,1.0,144414,-20122.0,0.03,0.3,0.00,0.31
6,Fusagasugá,1,2,Ciencias Sociales,19,Ciencias Sociales 1,1,2025,10,0,...,0.000000,0.0,149331.0,1.0,148167,-20122.0,0.02,0.2,0.00,0.19
11,Fusagasugá,1,3,Matemáticas,17,Matemáticas 1,1,2025,10,0,...,0.000000,0.0,150842.0,1.0,150842,-20164.0,0.01,0.1,0.00,0.06
16,Fusagasugá,1,4,Lengua Castellana,20,Lengua Castellana 1,1,2025,10,0,...,,,,,131533,,0.00,0.0,0.00,0.00
21,Fusagasugá,1,5,English,21,English 1st,1,2025,10,0,...,0.000000,0.0,147436.0,1.0,147436,-20122.0,0.00,0.1,0.00,0.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1063,Girardot,9,10,Centro de Interés Artístico,562,Centro de Interés Artístico 9,1,2025,4,0,...,,,,,146351,,0.00,0.0,0.00,0.00
1064,Girardot,9,11,Tecnologías Informáticas,563,Tecnologías Informáticas 9,1,2025,4,7,...,1.384615,0.0,151095.0,1.0,148319,-20136.0,0.08,1.0,0.75,0.33
1065,Girardot,9,12,Integralidad,564,Integralidad 9,1,2025,4,0,...,,,,,146595,,0.00,0.0,0.00,0.00
1066,Girardot,9,13,Innovación y Emprendimiento,560,Innovación y Emprendimiento 9,1,2025,4,7,...,0.000000,0.0,149989.0,1.0,149751,-20150.0,0.01,0.5,0.00,0.06


In [87]:
df_carga_2025

Unnamed: 0,sede,year,id_grado,asignatura,id_asignatura,intensidad,docente,id_docente
437,Fusagasugá,2025,1,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
438,Fusagasugá,2025,2,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
439,Fusagasugá,2025,3,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
440,Fusagasugá,2025,4,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
441,Fusagasugá,2025,5,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
...,...,...,...,...,...,...,...,...
676,Girardot,2025,5,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
677,Girardot,2025,6,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
678,Girardot,2025,7,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
679,Girardot,2025,8,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,


In [91]:
# Definir la llave compuesta
merge_keys = ['id_grado', 'id_asignatura', 'sede']

# Hacer el anti-join: cursos en df_carga_2025 que no están en courses_moodle_2025_unique
cursos_no_en_moodle = df_carga_2025.merge(
    courses_moodle_2025_unique[merge_keys].drop_duplicates(),
    on=merge_keys,
    how='left',
    indicator=True
).query('_merge == "left_only"').drop(columns=['_merge'])
cursos_no_en_moodle

Unnamed: 0,sede,year,id_grado,asignatura,id_asignatura,intensidad,docente,id_docente
56,Fusagasugá,2025,1,Aprendizaje Basado en Proyectos,7,4,Ximena Alejandra León Dicelis,5.0
57,Fusagasugá,2025,2,Aprendizaje Basado en Proyectos,7,4,Angie Jimena Gómez Arévalo,1.0
58,Fusagasugá,2025,3,Aprendizaje Basado en Proyectos,7,4,Ximena Alejandra León Dicelis,5.0
120,Fusagasugá,2025,6,Aprendizaje Basado en Investigación,14,7,Johann Alexander Sanchez Riveros,10.0
186,Girardot,2025,1,Aprendizaje Basado en Proyectos,7,4,Paola Andrea Herman Cruz,23.0
187,Girardot,2025,2,Aprendizaje Basado en Proyectos,7,4,Maria Camila Rubio Rincón,16.0
188,Girardot,2025,3,Aprendizaje Basado en Proyectos,7,4,Carlos Alberto Reyes Villanueva,18.0
189,Girardot,2025,4,Aprendizaje Basado en Proyectos,7,4,Valeria Valentina Sarabia Vargas,19.0
190,Girardot,2025,5,Aprendizaje Basado en Proyectos,7,4,Vanessa Liliana Sarabia Vargas,17.0
231,Girardot,2025,6,Aprendizaje Basado en Investigación,14,7,Carlos Alberto Reyes Villanueva,18.0


In [89]:
merge_keys = ['id_grado', 'id_asignatura', 'sede']

print("Cursos únicos en carga:", df_carga_2025[merge_keys].drop_duplicates().shape[0])
print("Cursos únicos en Moodle:", courses_moodle_2025_unique[merge_keys].drop_duplicates().shape[0])

Cursos únicos en carga: 244
Cursos únicos en Moodle: 224


In [90]:
merge_keys = ['id_grado', 'id_asignatura', 'sede']

# Filtrar cursos duplicados por la llave
duplicados_moodle = df_carga_2025[
    df_carga_2025.duplicated(subset=merge_keys, keep=False)
]

# Ver los duplicados ordenados
duplicados_moodle = duplicados_moodle.sort_values(by=merge_keys)
duplicados_moodle

Unnamed: 0,sede,year,id_grado,asignatura,id_asignatura,intensidad,docente,id_docente


In [77]:
unique_courses_moodle = pd.read_csv('data/interim/moodle/unique_courses_moodle.csv')

In [78]:
df_carga_2025

Unnamed: 0,sede,year,id_grado,asignatura,id_asignatura,intensidad,docente,id_docente
437,Fusagasugá,2025,1,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
438,Fusagasugá,2025,2,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
439,Fusagasugá,2025,3,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
440,Fusagasugá,2025,4,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
441,Fusagasugá,2025,5,Ciencias Naturales,1,3,Ximena Alejandra León Dicelis,5.0
...,...,...,...,...,...,...,...,...
676,Girardot,2025,5,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
677,Girardot,2025,6,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
678,Girardot,2025,7,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,
679,Girardot,2025,8,Centro de Interés Artístico,10,40,DOCENTE FALTANTE ARTES,


In [79]:
unique_courses_moodle_2024 = unique_courses_moodle[unique_courses_moodle['year'] == 2025]
unique_courses_moodle_2024

Unnamed: 0,year,id_grado,course_id,course_name,sede,id_asignatura
230,2025,11,511,Educación Física y Deportes 11,Fusagasugá,8
231,2025,11,513,Centro de Interés Artístico 11,Fusagasugá,10
232,2025,11,529,Integralidad 11,Fusagasugá,12
233,2025,11,506,Cálculo 11,Fusagasugá,21
234,2025,11,507,Lengua Castellana 11,Fusagasugá,4
...,...,...,...,...,...,...
469,2025,8,496,Aprendizaje Basado en Investigación 8,Girardot,14
470,2025,3,328,English 3rd,Girardot,5
471,2025,9,564,Integralidad 9,Girardot,12
472,2025,9,554,Ciencias Sociales 9,Girardot,2


In [80]:
courses_moodle_2024

NameError: name 'courses_moodle_2024' is not defined

In [42]:
modules_moodle_df = pd.read_csv("data/interim/moodle/modules_active_moodle.csv")


In [43]:
modules_moodle_df[modules_moodle_df['course_id'] == 503]

Unnamed: 0,year,course_id,course_module_id,sede,id_grado,id_asignatura,asignatura_name,course_name,section_id,section_name,...,instance,module_creation_date,module_type,module_name,week,period,is_interactive,is_in_english,planned_start_date,planned_end_date
3142,2024,503,121303,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16782,Semana 29,...,39040,2020-05-25 00:34:35,assign,Semana 29. Ejercicio de interpretación.,29,4,1,0,2024-10-14 00:00:00,2024-10-20 23:59:59
3143,2024,503,121310,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16783,Semana 30,...,39042,2020-05-25 00:34:35,assign,Semana 30. Ejercicio de interpretación,30,4,1,0,2024-10-21 00:00:00,2024-10-27 23:59:59
3144,2024,503,121318,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16784,Semana 31,...,39045,2020-05-25 00:34:35,assign,Semana 31. Ejercicio de interpretación,31,4,1,0,2024-10-28 00:00:00,2024-11-03 23:59:59
3147,2024,503,121190,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16765,Semana 12,...,39003,2020-05-25 00:34:35,assign,Semana 12. Ejercicio de interpretación.,12,2,1,0,2024-05-06 00:00:00,2024-05-12 23:59:59
3148,2024,503,121196,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16766,Semana 13,...,39005,2020-05-25 00:34:35,assign,Semana 13. . Ejercicio de interpretación.,13,2,1,0,2024-05-13 00:00:00,2024-05-19 23:59:59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31303,2025,503,148843,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16757,Semana 4,...,47954,2020-05-26 20:00:41,hvp,Semana 4. Desarrollo de las sesiones de aprend...,4,1,1,0,2025-02-24 00:00:00,2025-03-02 23:59:59
31304,2025,503,149421,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16758,Semana 5,...,48152,2020-05-26 20:00:41,hvp,Semana 5. Desarrollo de las sesiones de aprend...,5,1,1,0,2025-03-03 00:00:00,2025-03-09 23:59:59
31305,2025,503,150088,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16759,Semana 6,...,48322,2020-05-26 20:00:41,hvp,Semana 6. Desarrollo de las sesiones de aprend...,6,1,1,0,2025-03-10 00:00:00,2025-03-16 23:59:59
31306,2025,503,150545,Fusagasugá,11,20,Ciencias Naturales Integradas,Ciencias Naturales Integradas 11,16760,Semana 7,...,48467,2020-05-26 20:00:41,hvp,Semana 7. Desarrollo de las sesiones de aprend...,7,1,1,0,2025-03-17 00:00:00,2025-03-23 23:59:59


In [6]:
modules_2024[modules_2024['course_id'] == 161]

Unnamed: 0,year,course_id,course_module_id,sede,id_grado,id_asignatura,asignatura_name,course_name,section_id,section_name,module_type_id,instance,module_creation_date,module_type,module_name
29727,2024,161,128042,Fusagasugá,1,7,Aprendizaje Basado en Proyectos,Aprendizaje Basado en Proyectos 1,3817,Lineamientos Generales,17,3152,1706915437,resource,Syllabus Aprendizaje Basado en Proyectos
29739,2024,161,130126,Fusagasugá,1,7,Aprendizaje Basado en Proyectos,Aprendizaje Basado en Proyectos 1,3817,Lineamientos Generales,17,3305,1708809864,resource,Cronograma ABP
30324,2024,161,143430,Fusagasugá,1,7,Aprendizaje Basado en Proyectos,Aprendizaje Basado en Proyectos 1,3817,Lineamientos Generales,15,33,1716943413,page,Bitácora de avance en proyectos ABP semestre II
