# Data Checker

El propósito de este notebook es realizar verificaciones sobre los datos para tener la seguridad de que son consistentes.

## Verificación de estudiantes inscritos en archivos raw

In [1]:
# Expected number of enrolled students per grade, by campus and school year.
# Keys are grade numbers stored as strings; values are head counts. These are
# the reference figures the raw files are checked against further down.
expected_students_fusa_2024 = {
    '1': 15,
    '2': 15,
    '3': 15,
    '4': 22,
    '5': 17,
    '6': 20,
    '7': 22,
    '8': 17,
    '9': 16,
    '10': 17,
    '11': 13,
}

expected_students_girardot_2024 = {
    '1': 17,
    '2': 7,
    '3': 17,
    '4': 18,
    '5': 10,
    '6': 16,
    '7': 16,
    '8': 12,
}

expected_students_fusa_2025 = {
    '1': 10,
    '2': 9,
    '3': 15,
    '4': 16,
    '5': 24,
    '6': 18,
    '7': 21,
    '8': 21,
    '9': 18,
    '10': 14,
    '11': 13,
}

expected_students_girardot_2025 = {
    '1': 8,
    '2': 16,
    '3': 4,
    '4': 20,
    '5': 15,
    '6': 7,
    '7': 15,
    '8': 12,
    '9': 4
}


def _sum_counts_by_grade(*counts_by_campus):
    """Add up per-grade counts across campuses.

    Args:
        *counts_by_campus: dicts mapping a grade (numeric string) to a count.

    Returns:
        A dict with one entry per grade present in any input. A campus that
        does not offer a grade contributes 0. Keys are inserted in ascending
        numeric grade order so the displayed result is stable between runs
        (iterating a set of strings directly is hash-randomised).
    """
    grades = set().union(*counts_by_campus)
    return {
        grade: sum(counts.get(grade, 0) for counts in counts_by_campus)
        for grade in sorted(grades, key=int)
    }


# Expected total enrolment per year (both campuses together).
students_2024 = sum(expected_students_fusa_2024.values()) + sum(expected_students_girardot_2024.values())
students_2025 = sum(expected_students_fusa_2025.values()) + sum(expected_students_girardot_2025.values())

# Expected enrolment per grade, Fusa + Girardot combined.
total_students_by_grade_2024 = _sum_counts_by_grade(
    expected_students_fusa_2024, expected_students_girardot_2024
)
total_students_by_grade_2025 = _sum_counts_by_grade(
    expected_students_fusa_2025, expected_students_girardot_2025
)

In [2]:
total_students_by_grade_2024

{'9': 16,
 '1': 32,
 '4': 40,
 '11': 13,
 '6': 36,
 '2': 22,
 '5': 27,
 '3': 32,
 '10': 17,
 '7': 38,
 '8': 29}

In [3]:
total_students_by_grade_2025

{'9': 22,
 '1': 18,
 '4': 36,
 '11': 13,
 '6': 25,
 '2': 25,
 '5': 39,
 '3': 19,
 '10': 14,
 '7': 36,
 '8': 33}

In [1]:
import pandas as pd
import os
import sys
import duckdb

def find_project_root(start_dir, markers=('utils', 'data')):
    """Walk upwards from ``start_dir`` until a directory holding all ``markers`` is found.

    Args:
        start_dir: directory where the search starts.
        markers: sub-directory names that must all exist in the project root.
            The defaults are the two directories this notebook relies on
            (``utils`` is imported from, ``data`` is read from).

    Returns:
        Absolute path of the first matching ancestor (``start_dir`` included),
        or ``None`` when the filesystem root is reached without a match.
    """
    candidate = os.path.abspath(start_dir)
    while True:
        if all(os.path.isdir(os.path.join(candidate, marker)) for marker in markers):
            return candidate
        parent = os.path.dirname(candidate)
        if parent == candidate:  # filesystem root reached
            return None
        candidate = parent


# A blind "two levels up" chdir is not idempotent: re-running the cell keeps
# climbing. Locate the root by its contents instead, and only fall back to the
# original relative jump when no marker directory is found.
project_root = find_project_root(os.getcwd()) or os.path.abspath(os.path.join('..', '..'))
os.chdir(project_root)

# Make the project packages importable from the notebook.
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.hash_utility import HashUtility

In [5]:
# DuckDB connection opened with default arguments; reused by the SQL cells
# below to query the Moodle parquet/CSV files directly by path.
con = duckdb.connect()

In [6]:
# Load the raw student rosters for each school year.
students_2024_df = pd.read_csv('data/raw/estudiantes/estudiantes_2024.csv')
students_2025_df = pd.read_csv('data/raw/estudiantes/estudiantes_2025.csv')

# Preschool grades are outside the scope of these checks.
preschool_grades = ['Prejardín', 'Jardín', 'Transición']

# `.copy()` detaches the filtered frames from the frames read above, so the
# hash column added to them later does not raise SettingWithCopyWarning.
students_2024_df = students_2024_df[~students_2024_df['grado'].isin(preschool_grades)].copy()
students_2025_df = students_2025_df[~students_2025_df['grado'].isin(preschool_grades)].copy()


In [7]:
# Number of 2024 roster rows per campus.
girardot_count_2024 = len(students_2024_df.loc[students_2024_df['sede'] == 'Girardot'])
fusa_count_2024 = len(students_2024_df.loc[students_2024_df['sede'] == 'Fusagasugá'])

# Compare against the expected campus totals.
expected_girardot_total_2024 = sum(expected_students_girardot_2024.values())
expected_fusa_total_2024 = sum(expected_students_fusa_2024.values())
assert girardot_count_2024 == expected_girardot_total_2024, "Count mismatch for Girardot 2024"
assert fusa_count_2024 == expected_fusa_total_2024, "Count mismatch for Fusa 2024"

In [8]:
# Number of 2025 roster rows per campus.
girardot_count_2025 = len(students_2025_df.loc[students_2025_df['sede'] == 'Girardot'])
fusa_count_2025 = len(students_2025_df.loc[students_2025_df['sede'] == 'Fusagasugá'])

# Compare against the expected campus totals.
expected_girardot_total_2025 = sum(expected_students_girardot_2025.values())
expected_fusa_total_2025 = sum(expected_students_fusa_2025.values())
assert girardot_count_2025 == expected_girardot_total_2025, "Count mismatch for Girardot 2025"
assert fusa_count_2025 == expected_fusa_total_2025, "Count mismatch for Fusa 2025"

In [9]:
# Per-grade row counts for Girardot, 2024.
girardot_count_2024_by_grade = (
    students_2024_df[students_2024_df['sede'] == 'Girardot']
    .groupby('grado')
    .size()
    .to_dict()
)
# The expected dicts use string keys; normalise the observed keys the same way
# the 2025 checks do, so the comparison does not depend on how pandas parsed
# the 'grado' column.
girardot_count_2024_by_grade = {str(k): v for k, v in girardot_count_2024_by_grade.items()}
assert girardot_count_2024_by_grade == expected_students_girardot_2024, "Grade count mismatch for Girardot 2024"

# Per-grade row counts for Fusagasugá, 2024.
fusa_count_2024_by_grade = (
    students_2024_df[students_2024_df['sede'] == 'Fusagasugá']
    .groupby('grado')
    .size()
    .to_dict()
)
fusa_count_2024_by_grade = {str(k): v for k, v in fusa_count_2024_by_grade.items()}
assert fusa_count_2024_by_grade == expected_students_fusa_2024, "Grade count mismatch for Fusagasugá 2024"

In [10]:
# Per-grade row counts for Girardot, 2025.
girardot_count_2025_by_grade = (
    students_2025_df[students_2025_df['sede'] == 'Girardot']
    .groupby('grado')
    .size()
    .to_dict()
)
# The expected dicts use string keys, so normalise the observed keys.
girardot_count_2025_by_grade = {str(k): v for k, v in girardot_count_2025_by_grade.items()}
assert girardot_count_2025_by_grade == expected_students_girardot_2025, "Grade count mismatch for Girardot 2025"


# Per-grade row counts for Fusagasugá, 2025.
fusa_count_2025_by_grade = (
    students_2025_df[students_2025_df['sede'] == 'Fusagasugá']
    .groupby('grado')
    .size()
    .to_dict()
)
fusa_count_2025_by_grade = {str(k): v for k, v in fusa_count_2025_by_grade.items()}
assert fusa_count_2025_by_grade == expected_students_fusa_2025, "Grade count mismatch for Fusagasugá 2025"


In [11]:
# Per-grade row counts for 2025 across both campuses.
total_count_2025_by_grade = (
    students_2025_df
    .groupby('grado')
    .size()
    .to_dict()
)
# The expected totals use string keys, so normalise the observed keys.
total_count_2025_by_grade = {str(k): v for k, v in total_count_2025_by_grade.items()}
assert total_students_by_grade_2025 == total_count_2025_by_grade, "Total grade count mismatch for 2025"


In [12]:
# Per-grade row counts for 2024 across both campuses.
total_count_2024_by_grade = (
    students_2024_df
    .groupby('grado')
    .size()
    .to_dict()
)
# The expected totals use string keys, so normalise the observed keys.
total_count_2024_by_grade = {str(k): v for k, v in total_count_2024_by_grade.items()}
assert total_students_by_grade_2024 == total_count_2024_by_grade, "Total grade count mismatch for 2024"

## Verificar que para cada estudiante de 2024 y 2025 estén los datos

In [13]:
# Combined 2024 + 2025 roster.
# NOTE(review): the variable name says "2025_2025" although the file is
# estudiantes_2024_2025.csv; it is referenced under this name in later cells.
students_2025_2025 = pd.read_csv('data/raw/estudiantes/estudiantes_2024_2025.csv')
# Distinct ID-document values present in the combined roster.
docs_2024_2025 = set(students_2025_2025['documento_identificación'])

In [14]:
# 2024 students whose ID document does not appear in the combined roster.
in_combined_2024 = students_2024_df['documento_identificación'].isin(docs_2024_2025)
not_in_all_df = students_2024_df.loc[~in_combined_2024]
assert not_in_all_df.empty, "Hay estudiantes de 2024 que no están en el archivo de todos los estudiantes de 2024 y 2025"

## Excepción

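Hay un estudiante que tiene un documento de identificación diferente en los años 2024 y 2025 (1023942729 y 1031833774). El documento 1031833774, con el que aparece en 2025, no está en el archivo combinado, por lo que se excluye de la siguiente verificación.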

In [15]:
# Document of the one student whose ID changed between years; it is a known
# exception and must not be reported as missing.
known_exception_doc = 1031833774

# 2025 students absent from the combined roster, ignoring the known exception.
docs_2025 = students_2025_df['documento_identificación']
unmatched_2025 = ~docs_2025.isin(docs_2024_2025) & (docs_2025 != known_exception_doc)
not_in_all_df = students_2025_df[unmatched_2025]

assert not_in_all_df.empty, "Hay estudiantes de 2025 que no están en el archivo de todos los estudiantes de 2024 y 2025"

## Verificación con moodle

In [16]:
# Moodle 2024 exports: user table and custom profile-field values.
parquet_users_path = "data/raw/moodle/2024/Users/mdlvf_user.parquet"
parquet_user_info_path = "data/raw/moodle/2024/Users/mdlvf_user_info_data.parquet"

# Non-deleted users with a non-empty idnumber that have a profile-field value
# equal to 'Estudiante'. The access/creation columns are converted with
# to_timestamp. The alias previously spelled "Feha Último Acceso" is corrected.
sql = f"""
SELECT
    u.id AS UserID,
    u.idnumber AS documento_identificación,
    CONCAT(u.firstname, ' ', u.lastname) AS "Nombre Completo",
    u.city AS Sede,
    to_timestamp(u.firstaccess) AS "Fecha Primer Acceso",
    to_timestamp(u.lastaccess) AS "Fecha Último Acceso",
    to_timestamp(u.lastlogin) AS "Fecha Último Inicio de Sesión",
    to_timestamp(u.timecreated) AS "Fecha Creación"
FROM
    '{parquet_users_path}' u
JOIN
    '{parquet_user_info_path}' uid
    ON u.id = uid.userid
WHERE
    uid.data = 'Estudiante'
    AND u.idnumber <> ''
    AND u.deleted = 0;
"""
df_2024 = con.execute(sql).df()
# Strip every whitespace character from the document number before comparing.
df_2024["documento_identificación"] = df_2024["documento_identificación"].astype(str).str.replace(r"\s+", "", regex=True)
# Hash of the cleaned document, computed with the project's HashUtility.hash_stable.
df_2024["documento_identificación_hash"] = df_2024["documento_identificación"].apply(HashUtility.hash_stable)

In [17]:
# Roster documents as text, to match the string idnumbers coming from Moodle.
roster_docs_2024 = students_2024_df['documento_identificación'].astype(str)
found_in_moodle_2024 = roster_docs_2024.isin(df_2024['documento_identificación'])
missing_in_moodle_2024 = students_2024_df[~found_in_moodle_2024]
assert missing_in_moodle_2024.empty, "Hay estudiantes de 2024 que no están en el moodle 2024"

In [18]:
# Moodle 2025 exports: user table and custom profile-field values.
parquet_users_path = "data/raw/moodle/2025/Users/mdlvf_user.parquet"
parquet_user_info_path = "data/raw/moodle/2025/Users/mdlvf_user_info_data.parquet"

# Non-deleted users with a non-empty idnumber that have a profile-field value
# equal to 'Estudiante'. The access/creation columns are converted with
# to_timestamp. The alias previously spelled "Feha Último Acceso" is corrected.
sql = f"""
SELECT
    u.id AS UserID,
    u.idnumber AS documento_identificación,
    CONCAT(u.firstname, ' ', u.lastname) AS "Nombre Completo",
    u.city AS Sede,
    to_timestamp(u.firstaccess) AS "Fecha Primer Acceso",
    to_timestamp(u.lastaccess) AS "Fecha Último Acceso",
    to_timestamp(u.lastlogin) AS "Fecha Último Inicio de Sesión",
    to_timestamp(u.timecreated) AS "Fecha Creación"
FROM
    '{parquet_users_path}' u
JOIN
    '{parquet_user_info_path}' uid
    ON u.id = uid.userid
WHERE
    uid.data = 'Estudiante'
    AND u.idnumber <> ''
    AND u.deleted = 0;
"""
df_2025 = con.execute(sql).df()
# Strip every whitespace character from the document number before comparing.
df_2025["documento_identificación"] = df_2025["documento_identificación"].astype(str).str.replace(r"\s+", "", regex=True)
# Hash of the cleaned document, computed with the project's HashUtility.hash_stable.
df_2025["documento_identificación_hash"] = df_2025["documento_identificación"].apply(HashUtility.hash_stable)

In [19]:
# Roster documents as text, to match the string idnumbers coming from Moodle.
roster_docs_2025 = students_2025_df['documento_identificación'].astype(str)
found_in_moodle_2025 = roster_docs_2025.isin(df_2025['documento_identificación'])
missing_in_moodle_2025 = students_2025_df[~found_in_moodle_2025]
assert missing_in_moodle_2025.empty, "Hay estudiantes de 2025 que no están en el moodle 2025"

## Verificación con Edukrea

In [20]:
# Edukrea user table export; its 'idnumber' column is compared against the
# roster documents in the next cell.
df_user_edukrea = pd.read_parquet('data/raw/moodle/Edukrea/Users/mdl_user.parquet')

In [21]:
# Compare both sides as text so numeric and string IDs match.
roster_docs_2025_text = students_2025_df['documento_identificación'].astype(str)
edukrea_idnumbers = df_user_edukrea['idnumber'].astype(str)
missing_in_edukrea_2025 = students_2025_df[~roster_docs_2025_text.isin(edukrea_idnumbers)]
assert missing_in_edukrea_2025.empty, "Hay estudiantes de 2025 que no están en el Edukrea"

## Verificación de estudiantes inscritos en archivos interim

In [22]:
# Interim student files produced by the pipeline. Per the checks below, their
# 'documento_identificación' column is compared against hashed raw documents.
students_2024_hashed = pd.read_csv('data/interim/estudiantes/estudiantes_2024_hashed.csv')
students_2025_hashed = pd.read_csv('data/interim/estudiantes/estudiantes_2025_hashed.csv')
students_clean = pd.read_csv('data/interim/estudiantes/estudiantes_clean.csv')
students_imputed_encoded = pd.read_csv('data/interim/estudiantes/estudiantes_imputed_encoded.csv')
students_imputed = pd.read_csv('data/interim/estudiantes/estudiantes_imputed.csv')

In [23]:
# Drop preschool grades from the hashed rosters, mirroring the raw-file filter.
is_preschool_2024 = students_2024_hashed['grado'].isin(['Prejardín', 'Jardín', 'Transición'])
students_2024_hashed = students_2024_hashed.loc[~is_preschool_2024]

is_preschool_2025 = students_2025_hashed['grado'].isin(['Prejardín', 'Jardín', 'Transición'])
students_2025_hashed = students_2025_hashed.loc[~is_preschool_2025]

In [24]:
# Add a hashed-document column to each raw roster so they can be matched
# against the interim (already hashed) files.
for _raw_frame in (students_2024_df, students_2025_df, students_2025_2025):
    _raw_frame['documento_identificación_hash'] = (
        _raw_frame['documento_identificación'].apply(HashUtility.hash_stable)
    )

In [25]:
# Rows of the 2024 hashed file whose document is not among the hashes computed
# from the 2024 raw roster.
hashed_docs_2024 = students_2024_hashed['documento_identificación'].astype(str)
raw_hashes_2024 = students_2024_df['documento_identificación_hash']
missing_in_students_2024_hashed = students_2024_hashed[~hashed_docs_2024.isin(raw_hashes_2024)]
assert missing_in_students_2024_hashed.empty, "Hay estudiantes de 2024 hash que no están en el archivo de estudiantes 2024 hasheados"

In [26]:
# Rows of the 2025 hashed file whose document is not among the hashes computed
# from the 2025 raw roster.
hashed_docs_2025 = students_2025_hashed['documento_identificación'].astype(str)
raw_hashes_2025 = students_2025_df['documento_identificación_hash']
missing_in_students_2025_hashed = students_2025_hashed[~hashed_docs_2025.isin(raw_hashes_2025)]
assert missing_in_students_2025_hashed.empty, "Hay estudiantes de 2025 hash que no están en el archivo de estudiantes 2025 hasheados"

In [27]:
# Rows of estudiantes_clean.csv whose document is not among the hashes computed
# from the combined 2024 + 2025 raw roster.
missing_in_students_clean = students_clean[~students_clean['documento_identificación'].astype(str).isin(students_2025_2025['documento_identificación_hash'])]
# The message names the file under test; the three interim checks previously
# shared one identical message.
assert missing_in_students_clean.empty, "Hay estudiantes en estudiantes_clean.csv cuyo hash no está en el archivo raw de estudiantes 2024 y 2025"

In [28]:
# Rows of estudiantes_imputed.csv whose document is not among the hashes
# computed from the combined 2024 + 2025 raw roster.
missing_in_students_imputed = students_imputed[~students_imputed['documento_identificación'].astype(str).isin(students_2025_2025['documento_identificación_hash'])]
# The message names the file under test; the three interim checks previously
# shared one identical message.
assert missing_in_students_imputed.empty, "Hay estudiantes en estudiantes_imputed.csv cuyo hash no está en el archivo raw de estudiantes 2024 y 2025"

In [29]:
# Rows of estudiantes_imputed_encoded.csv whose document is not among the
# hashes computed from the combined 2024 + 2025 raw roster.
missing_in_students_imputed_encoded = students_imputed_encoded[~students_imputed_encoded['documento_identificación'].astype(str).isin(students_2025_2025['documento_identificación_hash'])]
# The message names the file under test; the three interim checks previously
# shared one identical message.
assert missing_in_students_imputed_encoded.empty, "Hay estudiantes en estudiantes_imputed_encoded.csv cuyo hash no está en el archivo raw de estudiantes 2024 y 2025"

## Verificación con enrollments

In [30]:
# Interim enrollments table, split by school year.
enrollments = pd.read_csv('data/interim/estudiantes/enrollments.csv')

enrollments_2024 = enrollments.loc[enrollments['year'].eq(2024)]
enrollments_2025 = enrollments.loc[enrollments['year'].eq(2025)]

In [31]:
# Raw roster sizes must equal the expected yearly totals.
row_count_2025 = len(students_2025_df)
assert row_count_2025 == students_2025, f"El número de filas en students_2025_df ({row_count_2025}) no coincide con students_2025 ({students_2025})"

row_count_2024 = len(students_2024_df)
assert row_count_2024 == students_2024, f"El número de filas en students_2024_df ({row_count_2024}) no coincide con students_2024 ({students_2024})"

In [32]:
# Check that per-grade row counts agree between students_2025_df and enrollments_2025.
# Count rows per grade in the raw roster ...
students_2025_by_grade = students_2025_df.groupby('grado').size()
# ... and per id_grado in the enrollments table.
enrollments_2025_by_grade = enrollments_2025.groupby('id_grado').size()
# Cast both indexes to string so they are comparable.
students_2025_by_grade.index = students_2025_by_grade.index.astype(str)
enrollments_2025_by_grade.index = enrollments_2025_by_grade.index.astype(str)
# Sort AFTER the cast (as the 2024 check does): Series.equals is order
# sensitive, and a string-typed grade column sorts '1', '10', '11', '2', ...
# while an integer one sorts numerically, which would give a false mismatch.
students_2025_by_grade = students_2025_by_grade.sort_index()
enrollments_2025_by_grade = enrollments_2025_by_grade.sort_index()
# Compare both counts.
assert students_2025_by_grade.equals(enrollments_2025_by_grade), "Los conteos por grado no coinciden entre students_2025_df y enrollments_2025"

In [33]:
# Check that per-grade row counts agree between students_2024_df and enrollments_2024.
roster_sizes_2024 = students_2024_df.groupby('grado').size()
enrollment_sizes_2024 = enrollments_2024.groupby('id_grado').size()

# Re-label both indexes as strings, then order them the same way so the
# element-wise comparison lines up.
students_2024_by_grade = roster_sizes_2024.set_axis(roster_sizes_2024.index.astype(str)).sort_index()
enrollments_2024_by_grade = enrollment_sizes_2024.set_axis(enrollment_sizes_2024.index.astype(str)).sort_index()

# Compare both counts.
assert students_2024_by_grade.equals(enrollments_2024_by_grade), "Los conteos por grado no coinciden entre students_2024_df y enrollments_2024"

## Verificación de cursos

In [34]:
# Student-to-course assignments from Moodle, plus per-year slices.
students_courses = pd.read_csv('data/interim/moodle/student_moodle_courses.csv')
students_courses_2025 = students_courses.loc[students_courses['year'].eq(2025)]
students_courses_2024 = students_courses.loc[students_courses['year'].eq(2024)]

In [35]:
# Rows whose course name does not mention the student's grade, keyed by grade.
inconsistencias_por_grado = {}

for grado in range(1, 12):  # grades 1 to 11 inclusive
    # Match the grade only as a stand-alone number: a plain substring test
    # lets grade 1 match "Física 10" or "... 11" and hides real mismatches.
    # Digit look-arounds are used instead of \b so names such as
    # "English 1st" still count as grade 1.
    grade_pattern = rf'(?<!\d){grado}(?!\d)'
    # na=False: a missing course name is treated as "does not mention the
    # grade" (and so is reported) instead of breaking the boolean negation.
    name_mentions_grade = students_courses['course_name'].str.contains(grade_pattern, regex=True, na=False)
    filtro = (students_courses['id_grado'] == grado) & ~name_mentions_grade
    inconsistencias = students_courses[filtro]

    if not inconsistencias.empty:
        inconsistencias_por_grado[grado] = inconsistencias

In [36]:
# Show the first mismatching rows found for each grade.
for grade_key, mismatch_rows in inconsistencias_por_grado.items():
    print(f"\nInconsistencias para grado {grade_key}:")
    preview = mismatch_rows[['documento_identificación', 'course_name']].head()
    print(preview)

In [37]:
# NOTE(review): this rebinds `students_courses` (previously the Moodle
# assignments) to the Edukrea assignments; the cells below now check Edukrea.
students_courses = pd.read_csv('data/interim/moodle/student_edukrea_courses.csv')

In [38]:
# Rows whose course name does not mention the student's grade, keyed by grade.
inconsistencias_por_grado = {}

for grado in range(1, 12):  # grades 1 to 11 inclusive
    # Match the grade only as a stand-alone number: a plain substring test
    # lets grade 1 match course names ending in 10 or 11 and hides real
    # mismatches. Digit look-arounds are used instead of \b so a name such as
    # "English 1st" still counts as grade 1.
    grade_pattern = rf'(?<!\d){grado}(?!\d)'
    # na=False: a missing course name is treated as "does not mention the
    # grade" (and so is reported) instead of breaking the boolean negation.
    name_mentions_grade = students_courses['course_name'].str.contains(grade_pattern, regex=True, na=False)
    filtro = (students_courses['id_grado'] == grado) & ~name_mentions_grade
    inconsistencias = students_courses[filtro]

    if not inconsistencias.empty:
        inconsistencias_por_grado[grado] = inconsistencias

In [39]:
# Show every mismatching row found for each grade.
for grade_key, mismatch_rows in inconsistencias_por_grado.items():
    print(f"\nInconsistencias para grado {grade_key}:")
    columns_to_show = mismatch_rows[['documento_identificación', 'course_name']]
    print(columns_to_show)


Inconsistencias para grado 3:
                               documento_identificación           course_name
586   cf6b81c391cf8ded4e4f1fab86fbbac0766591ed5cf768...            Lenguaje 2
627   cf6b81c391cf8ded4e4f1fab86fbbac0766591ed5cf768...         Matemáticas 2
844   cf6b81c391cf8ded4e4f1fab86fbbac0766591ed5cf768...   Ciencias Sociales 2
1082  cf6b81c391cf8ded4e4f1fab86fbbac0766591ed5cf768...  Ciencias Naturales 2


Hay 4 cursos de grado 2.º en los que está inscrita una estudiante de 3.º; al verificarlo en la plataforma se confirmó que la inscripción es correcta.

## Chequeo de módulos que contengan fecha de inicio

In [64]:
# Active-module tables for each platform; the cells below check their
# planned_start_date / planned_end_date columns for missing values.
edukrea_modules_df = pd.read_csv('data/interim/moodle/modules_active_edukrea.csv')
moodle_modules_df = pd.read_csv('data/interim/moodle/modules_active_moodle.csv')

In [69]:
# Every Edukrea module must have both planned dates filled in.
edukrea_missing_start = edukrea_modules_df['planned_start_date'].isnull()
edukrea_missing_end = edukrea_modules_df['planned_end_date'].isnull()
assert not edukrea_missing_start.any(), "Hay módulos en Edukrea sin fecha de inicio de semana"
assert not edukrea_missing_end.any(), "Hay módulos en Edukrea sin fecha de fin de semana"

In [68]:
# Every Moodle module must have both planned dates filled in.
moodle_missing_start = moodle_modules_df['planned_start_date'].isnull()
moodle_missing_end = moodle_modules_df['planned_end_date'].isnull()
assert not moodle_missing_start.any(), "Hay módulos en Moodle sin fecha de inicio de semana"
assert not moodle_missing_end.any(), "Hay módulos en Moodle sin fecha de fin de semana"

In [70]:
moodle_modules_df.columns

Index(['year', 'course_id', 'course_module_id', 'sede', 'id_grado',
       'id_asignatura', 'asignatura_name', 'course_name', 'section_id',
       'section_name', 'module_type_id', 'instance', 'module_creation_date',
       'module_type', 'module_name', 'is_edukrea_access',
       'is_absence_assignment', 'week', 'period', 'is_interactive',
       'is_in_english', 'planned_start_date', 'planned_end_date'],
      dtype='object')

## Verificar calendario académico

In [None]:
# Load the master tables: school calendar and vacations/holidays.
calendario_df = pd.read_csv("data/raw/tablas_maestras/calendario_escolar.csv", dayfirst=True)
vacaciones_df = pd.read_csv("data/raw/tablas_maestras/vacaciones_festivos.csv", dayfirst=True)

# Normalise the headers: trim surrounding whitespace and lowercase.
calendario_df.columns = calendario_df.columns.str.strip().str.lower()
vacaciones_df.columns = vacaciones_df.columns.str.strip().str.lower()

# Parse the date columns as day-first dates.
calendario_df["inicio"] = pd.to_datetime(calendario_df["inicio"], dayfirst=True)
vacaciones_df["inicio"] = pd.to_datetime(vacaciones_df["inicio"], dayfirst=True)
vacaciones_df["fin"] = pd.to_datetime(vacaciones_df["fin"], dayfirst=True)

# Build, per year, the start of each term plus the list of vacation ranges.
periods = {}

for year in calendario_df["año"].unique():
    # The week-1 row of each term carries that term's start date.
    first_weeks = calendario_df[(calendario_df["año"] == year) & (calendario_df["semana"] == 1)]
    year_periods = {
        f"p{int(term)}_start": pd.Timestamp(term_start, tz="America/Bogota")
        for term, term_start in zip(first_weeks["bimestre"], first_weeks["inicio"])
    }

    # Vacation ranges as timezone-aware timestamps covering whole days
    # (00:00 on the first day to 23:59 on the last).
    year_breaks = vacaciones_df[vacaciones_df["año"] == year]
    year_periods["vacations"] = [
        (
            pd.Timestamp(break_start.strftime("%Y-%m-%dT00:00:00"), tz="America/Bogota"),
            pd.Timestamp(break_end.strftime("%Y-%m-%dT23:59:00"), tz="America/Bogota"),
        )
        for break_start, break_end in zip(year_breaks["inicio"], year_breaks["fin"])
    ]

    periods[year] = year_periods

# Final result
periods

{np.int64(2024): {'p1_start': Timestamp('2024-01-02 00:00:00-0500', tz='America/Bogota'),
  'p2_start': Timestamp('2024-04-15 00:00:00-0500', tz='America/Bogota'),
  'p3_start': Timestamp('2024-08-07 00:00:00-0500', tz='America/Bogota'),
  'p4_start': Timestamp('2024-09-09 00:00:00-0500', tz='America/Bogota'),
  'vacations': [(Timestamp('2024-03-23 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-04-01 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-05-01 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-05-01 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-05-17 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-05-17 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-06-15 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-07-08 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-08-07 00:00:00-0500', tz='America/Bogota'),
    Timestamp('2024-08-07 23:59:00-0500', tz='America/Bogota')),
   (Timestamp('2024-08-18 00:0

## Consulta de docentes en cursos

In [None]:
# Inputs for the teacher-per-course query below: the Moodle export of the
# chosen year plus the interim list of unique courses.
year = 2025
course_file = f"data/raw/moodle/{year}/Course/mdlvf_course.parquet"
context_file = f"data/raw/moodle/{year}/System/mdlvf_context.parquet"
role_assignments_file = f"data/raw/moodle/{year}/Users/mdlvf_role_assignments.parquet"
role_file = f"data/raw/moodle/{year}/Users/mdlvf_role.parquet"
user_file = f"data/raw/moodle/{year}/Users/mdlvf_user.parquet"
unique_courses_file = "data/interim/moodle/courses_unique_moodle.csv"

In [None]:
# For every visible course that also appears in the unique-courses file, list
# the distinct full names of users holding the 'editingteacher' role in that
# course's context, joined into one comma-separated string per course.
# The user named 'Provisional Girardot' is excluded.
# NOTE(review): contextlevel = 50 is presumably Moodle's course-level context
# - confirm against the Moodle schema.
sql = f"""
SELECT 
    c.id AS courseid,
    c.fullname AS coursename,
    string_agg(DISTINCT u.firstname || ' ' || u.lastname, ', ') AS teacher
FROM '{course_file}' c
JOIN '{unique_courses_file}' uc ON c.id = uc.course_id
JOIN '{context_file}' ctx ON ctx.instanceid = c.id AND ctx.contextlevel = 50
JOIN '{role_assignments_file}' ra ON ra.contextid = ctx.id
JOIN '{role_file}' r ON r.id = ra.roleid AND r.shortname = 'editingteacher'
JOIN '{user_file}' u ON u.id = ra.userid
WHERE c.visible = 1
  AND NOT (u.firstname = 'Provisional' AND u.lastname = 'Girardot')
GROUP BY c.id, c.fullname
ORDER BY c.id;
"""

# NOTE(review): the displayed result contains teachers' real names; consider
# clearing this output before sharing the notebook.
df = con.execute(sql).df()
df

Unnamed: 0,courseid,coursename,teacher
0,17,Matemáticas 1,Wilder Mauricio Ussa Santana
1,18,Ciencias Naturales y Educación Ambiental 1,Ximena Alejandra León Dicelis
2,19,Ciencias Sociales 1,Angie Jimena Gómez Arévalo
3,20,Lengua Castellana 1,Yessika Alejandra Morales García
4,21,English 1st,Yessika Alejandra Morales García
...,...,...,...
239,562,Centro de Interés Artístico 9,"Daniel Felipe Sánchez Saldarriaga, Álvaro Alex..."
240,563,Tecnologías Informáticas 9,Vanessa Liliana Sarabia Vargas
241,564,Integralidad 9,Jhojan Stiven Rubiano Capador
242,566,Física 10,Dayron Mateo González Penagos


## Check HVP Data

In [2]:
# HVP tables for each platform; their row counts are compared below against
# the modules whose module_type is 'hvp'.
edukrea_hvp = pd.read_csv('data/interim/moodle/hvp_edukrea.csv')
moodle_hvp = pd.read_csv('data/interim/moodle/hvp_moodle.csv')

In [3]:
# Re-reads the same active-module files loaded in the planned-dates section,
# so this section can run without that earlier cell.
edukrea_modules_df = pd.read_csv('data/interim/moodle/modules_active_edukrea.csv')
moodle_modules_df = pd.read_csv('data/interim/moodle/modules_active_moodle.csv')

In [4]:
# Keep only the modules whose type is 'hvp' on each platform.
edukrea_modules_hvp_df = edukrea_modules_df.loc[edukrea_modules_df['module_type'].eq('hvp')]
moodle_modules_hvp_df = moodle_modules_df.loc[moodle_modules_df['module_type'].eq('hvp')]

In [5]:
# The Edukrea HVP table must have one row per Edukrea 'hvp' module.
assert edukrea_hvp.shape[0] == edukrea_modules_hvp_df.shape[0], "El número de filas en edukrea_hvp no coincide con edukrea_modules_hvp_df"

In [6]:
# The Moodle HVP table must have one row per Moodle 'hvp' module.
# The message previously named the Edukrea frames, pointing at the wrong data.
assert len(moodle_hvp) == len(moodle_modules_hvp_df), "El número de filas en moodle_hvp no coincide con moodle_modules_hvp_df"

In [7]:
# Featured module tables; the following cells check that they have as many
# rows as the active-module tables.
edukrea_modules_featured = pd.read_csv('data/interim/moodle/modules_edukrea_featured.csv')
moodle_modules_featured = pd.read_csv('data/interim/moodle/modules_moodle_featured.csv')

In [8]:
# The featured table must keep one row per active Edukrea module.
assert edukrea_modules_featured.shape[0] == edukrea_modules_df.shape[0], "El número de filas en edukrea_modules_featured no coincide con edukrea_modules_df"

In [9]:
# The featured table must keep one row per active Moodle module.
assert moodle_modules_featured.shape[0] == moodle_modules_df.shape[0], "El número de filas en moodle_modules_featured no coincide con moodle_modules_df"

In [11]:
edukrea_modules_featured['total_estudiantes'].unique()

array([26, 18, 14, 22, 13, 39, 25, 33, 36])

In [12]:
moodle_modules_featured['total_estudiantes'].unique()

array([15, 17, 20, 22, 10, 16, 18,  7, 12, 13, 24, 21,  9, 14,  8,  4])