<a href="https://colab.research.google.com/github/emisoft-designs/SEEFAR-SUSTAIN/blob/academy-data-extraction-v2/SEEFAR_SUSTAIN_(Academy_DB_Extraction).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Colab Notebook – Data Extraction from SEEFAR Academy Database**

In [None]:
!pip install pymysql sqlalchemy gspread gspread-dataframe

Collecting pymysql
  Downloading pymysql-1.1.2-py3-none-any.whl.metadata (4.3 kB)
Downloading pymysql-1.1.2-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.1.2


In [None]:
import pandas as pd
import re

def extract_first_span_content(html_string):
    """
    Extracts the text content from the first <span> tag in an HTML string.
    Returns an empty string if no <span> tag is found.
    """
    if pd.isna(html_string):
        return None
    match = re.search(r'<span[^>]*>(.*?)</span>', html_string)
    if match:
        return match.group(1)
    return ''

# Example HTML strings
html1 = '<span lang="en" class="multilang">35 or above</span><span lang="ar" class="multilang">35 أو اكبر</span><span lang="bn" class="multilang">৩৫ এর বেশি</span><span lang="ur" class="multilang">٣٥ اور اس سے زیادہ</span><span lang="ps" class="multilang">۳۵ یا تر هغه لوړ </span><span lang="fa" class="multilang">۳۵ یا بالاتر</span><span lang="ckb" class="multilang">یان زیاترلە٣٥</span>'
html2 = '<span lang="en" class="multilang">Other</span><span lang="ar" class="multilang">آخرئ</span><span lang="bn" class="multilang">অন্যান্য</span><span lang="ur" class="multilang">دیگر</span><span lang="ps" class="multilang">نور</span><span lang="fa" class="multilang">سایر موارد</span><span lang="ckb" class="multilang">ئەوانی تر</span>'
html3 = '<div>Some other text</div>'
html4 = 'No span tag here'
html5 = None

df = pd.DataFrame({"raw_html": [html1, html2, html3, html4, html5]})

print("Original DataFrame:")
display(df)

# Apply the function to the 'raw_html' column to create a new column 'extracted_content'
df["extracted_content"] = df["raw_html"].apply(extract_first_span_content)

# Optionally, you can replace the original 'raw_html' column with the extracted content
# Or, if you need to clean the 'raw_html' in place:
df["cleaned_html"] = df["raw_html"].apply(lambda x: extract_first_span_content(x) if x else None)

print("\nDataFrame after extracting and cleaning:")
display(df)

Original DataFrame:


Unnamed: 0,raw_html
0,"<span lang=""en"" class=""multilang"">35 or above<..."
1,"<span lang=""en"" class=""multilang"">Other</span>..."
2,<div>Some other text</div>
3,No span tag here
4,



DataFrame after extracting and cleaning:


Unnamed: 0,raw_html,extracted_content,cleaned_html
0,"<span lang=""en"" class=""multilang"">35 or above<...",35 or above,35 or above
1,"<span lang=""en"" class=""multilang"">Other</span>...",Other,Other
2,<div>Some other text</div>,,
3,No span tag here,,
4,,,


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
"""
SUSTAIN MEL Data Extraction & Documentation Script
====================================================

Purpose: Extract, structure, and document Moodle database data for Monitoring,
         Evaluation, and Learning (MEL) indicators for SUSTAIN training programs.

Author: Seefar Academy Data Team
Date: 2026-02-03
Version: 2.0

Database: Seefar Academy Pathways Moodle (pathways_moodle)
Target: MEL reporting for SUSTAIN, Onboarding, and General Preparatory Training

Key Objectives:
1. Map database schema for training delivery, assessments, users, and roles
2. Link Course ID ↔ Course Name for training track filtering
3. Standardize login dates (First Login, Last Login)
4. Extract and relate pre-test/post-test results to users and courses
5. Extract demographics for disaggregation (name, email, phone, gender, country)
6. Differentiate user types (Participants, Candidates, Employers, Visa Facilitators)
7. Support MEL indicator calculations (completion rates, knowledge increase, etc.)
8. Enable automated reporting and dashboarding
"""

# ============================================================================
# SECTION 1: SETUP & CONFIGURATION
# ============================================================================

# Install required packages
# !pip install pymysql sqlalchemy gspread gspread-dataframe pandas numpy
import os
import re
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
from typing import Dict, List, Tuple, Any, Optional
import warnings
from datetime import datetime
import json
import time

warnings.filterwarnings('ignore')

# Database credentials configuration
class DatabaseConfig:
    """Database connection configuration"""
    DB_USER = "dbuserread"
    # DB_PASSWORD should be set via environment variable or Google Colab userdata
    DB_HOST = "ls-17122ea7e3a528fd292a260b6217b006cb7a0f38.cyhh372xsm2h.ap-southeast-1.rds.amazonaws.com"
    DB_PORT = 3306
    DB_NAME = "pathways_moodle"

    @classmethod
    def get_connection_url(cls, password: str) -> str:
        """Generate database connection URL"""
        return f"mysql+pymysql://{cls.DB_USER}:{password}@{cls.DB_HOST}:{cls.DB_PORT}/{cls.DB_NAME}"


# ============================================================================
# SECTION 2: DATABASE UTILITY FUNCTIONS
# ============================================================================

def create_db_engine(password: str):
    """
    Create SQLAlchemy engine for database connection.

    Args:
        password: Database password

    Returns:
        SQLAlchemy engine object
    """
    connection_url = DatabaseConfig.get_connection_url(password)
    engine = create_engine(connection_url, pool_pre_ping=True)
    print("✓ Database engine created successfully")
    return engine


def fetch_data(engine, query: str, params: Optional[Dict] = None) -> pd.DataFrame:
    """
    Execute SQL query and return results as DataFrame.

    Args:
        engine: SQLAlchemy engine
        query: SQL query string
        params: Optional query parameters

    Returns:
        pandas DataFrame with query results
    """
    try:
        with engine.connect() as conn:
            if params:
                df = pd.read_sql(text(query), conn, params=params)
            else:
                df = pd.read_sql(query, conn)
        print(f"✓ Fetched {df.shape[0]} rows × {df.shape[1]} columns")
        return df
    except Exception as e:
        print(f"✗ Error fetching data: {e}")
        return pd.DataFrame()


def get_table_info(engine, table_name: str) -> pd.DataFrame:
    """
    Get column information for a specific table.

    Args:
        engine: SQLAlchemy engine
        table_name: Name of the table

    Returns:
        DataFrame with column information
    """
    query = f"""
    SELECT
        COLUMN_NAME as column_name,
        DATA_TYPE as data_type,
        IS_NULLABLE as is_nullable,
        COLUMN_KEY as column_key,
        COLUMN_COMMENT as column_comment
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_SCHEMA = '{DatabaseConfig.DB_NAME}'
    AND TABLE_NAME = '{table_name}'
    ORDER BY ORDINAL_POSITION
    """
    return fetch_data(engine, query)


# ============================================================================
# SECTION 3: CORE DATA EXTRACTION FUNCTIONS
# ============================================================================

class MELDataExtractor:
    """Main class for extracting MEL-relevant data from Moodle database"""

    def __init__(self, engine):
        self.engine = engine
        self.extracted_data = {}

    def extract_users(self) -> pd.DataFrame:
        """
        Extract user data with demographic information.

        Returns:
            DataFrame with user information including demographics
        """
        print("\n" + "="*80)
        print("EXTRACTING USER DATA")
        print("="*80)

        query = """
        SELECT
            u.id as user_id,
            u.username,
            u.firstname,
            u.lastname,
            u.email,
            u.phone1 as phone,
            u.country,
            u.city,
            u.institution,
            u.department,
            u.timecreated as user_created_timestamp,
            u.timemodified as user_modified_timestamp,
            u.firstaccess as first_login_timestamp,
            u.lastaccess as last_login_timestamp,
            u.lastlogin as last_login_timestamp_alt,
            u.currentlogin as current_login_timestamp,
            u.confirmed,
            u.suspended,
            u.deleted,
            FROM_UNIXTIME(u.timecreated) as user_created_date,
            FROM_UNIXTIME(u.firstaccess) as first_login_date,
            FROM_UNIXTIME(u.lastaccess) as last_login_date
        FROM mdl_user u
        WHERE u.deleted = 0
        AND u.id > 2  -- Exclude guest and admin system accounts
        ORDER BY u.id
        """

        users_df = fetch_data(self.engine, query)

        # Add custom fields (demographics)
        demographics_query = """
        SELECT
            uid.userid as user_id,
            uif.shortname as field_name,
            uid.data as field_value
        FROM mdl_user_info_data uid
        JOIN mdl_user_info_field uif ON uid.fieldid = uif.id
        WHERE uif.shortname IN ('gender', 'age', 'agegroup', 'nationality', 'ethnicity')
        """

        demographics_df = fetch_data(self.engine, demographics_query)

        # Pivot demographics data
        if not demographics_df.empty:
            demographics_pivot = demographics_df.pivot(
                index='user_id',
                columns='field_name',
                values='field_value'
            ).reset_index()

            users_df = users_df.merge(demographics_pivot, on='user_id', how='left')

        print(f"✓ Extracted {len(users_df)} users with demographics")
        self.extracted_data['users'] = users_df
        return users_df

    def extract_courses(self) -> pd.DataFrame:
        """
        Extract course information with categorization.

        Returns:
            DataFrame with course details
        """
        print("\n" + "="*80)
        print("EXTRACTING COURSE DATA")
        print("="*80)

        query = """
        SELECT
            c.id as course_id,
            c.category as course_category_id,
            c.fullname as course_name,
            c.shortname as course_shortname,
            c.summary as course_description,
            c.startdate as course_start_timestamp,
            c.enddate as course_end_timestamp,
            FROM_UNIXTIME(c.startdate) as course_start_date,
            FROM_UNIXTIME(c.enddate) as course_end_date,
            c.visible as is_visible,
            c.timecreated as course_created_timestamp,
            FROM_UNIXTIME(c.timecreated) as course_created_date,
            cc.name as category_name,
            cc.path as category_path
        FROM mdl_course c
        LEFT JOIN mdl_course_categories cc ON c.category = cc.id
        WHERE c.id > 1  -- Exclude site home course
        ORDER BY c.id
        """

        courses_df = fetch_data(self.engine, query)

        # Categorize courses based on name patterns
        def categorize_course(row):
            """Categorize course into training tracks"""
            course_name = str(row['course_name']).lower()
            course_short = str(row['course_shortname']).lower()

            if 'sustain' in course_name or 'sustain' in course_short:
                return 'SUSTAIN Training'
            elif 'onboard' in course_name or 'onboard' in course_short:
                return 'Onboarding Training'
            elif 'prep' in course_name or 'career' in course_name or 'stem' in course_name:
                return 'General Preparatory Training'
            elif 'employer' in course_name:
                return 'Employer Training'
            elif 'visa' in course_name or 'facilitator' in course_name:
                return 'Visa Facilitator Training'
            else:
                return 'Other'

        courses_df['training_track'] = courses_df.apply(categorize_course, axis=1)

        print(f"✓ Extracted {len(courses_df)} courses")
        print("\nCourse distribution by training track:")
        print(courses_df['training_track'].value_counts())

        self.extracted_data['courses'] = courses_df
        return courses_df

    def extract_user_roles(self) -> pd.DataFrame:
        """
        Extract user role assignments to differentiate user types.

        Returns:
            DataFrame with user roles
        """
        print("\n" + "="*80)
        print("EXTRACTING USER ROLES")
        print("="*80)

        query = """
        SELECT
            ra.id as role_assignment_id,
            ra.userid as user_id,
            ra.roleid as role_id,
            r.shortname as role_shortname,
            r.name as role_name,
            ra.contextid,
            c.contextlevel,
            c.instanceid,
            FROM_UNIXTIME(ra.timemodified) as role_assigned_date
        FROM mdl_role_assignments ra
        JOIN mdl_role r ON ra.roleid = r.id
        JOIN mdl_context c ON ra.contextid = c.id
        ORDER BY ra.userid, ra.roleid
        """

        roles_df = fetch_data(self.engine, query)

        # Categorize user types based on roles
        def categorize_user_type(role_shortname):
            """Map role to user type category"""
            role_lower = str(role_shortname).lower()

            if 'student' in role_lower:
                return 'Participant/Candidate'
            elif 'teacher' in role_lower or 'editingteacher' in role_lower:
                return 'Facilitator/Instructor'
            elif 'employer' in role_lower:
                return 'Employer'
            elif 'manager' in role_lower:
                return 'Manager'
            else:
                return 'Other'

        roles_df['user_type_category'] = roles_df['role_shortname'].apply(categorize_user_type)

        print(f"✓ Extracted {len(roles_df)} role assignments")
        print("\nUser type distribution:")
        print(roles_df['user_type_category'].value_counts())

        self.extracted_data['user_roles'] = roles_df
        return roles_df

    def extract_enrollments(self) -> pd.DataFrame:
        """
        Extract user course enrollments.

        Returns:
            DataFrame with enrollment data
        """
        print("\n" + "="*80)
        print("EXTRACTING ENROLLMENT DATA")
        print("="*80)

        query = """
        SELECT
            ue.id as enrollment_id,
            ue.userid as user_id,
            ue.enrolid,
            e.courseid as course_id,
            e.enrol as enrollment_method,
            ue.status as enrollment_status,
            ue.timestart as enrollment_start_timestamp,
            ue.timeend as enrollment_end_timestamp,
            ue.timecreated as enrollment_created_timestamp,
            ue.timemodified as enrollment_modified_timestamp,
            FROM_UNIXTIME(ue.timecreated) as enrollment_date,
            FROM_UNIXTIME(ue.timestart) as enrollment_start_date,
            FROM_UNIXTIME(ue.timeend) as enrollment_end_date
        FROM mdl_user_enrolments ue
        JOIN mdl_enrol e ON ue.enrolid = e.id
        ORDER BY ue.userid, e.courseid
        """

        enrollments_df = fetch_data(self.engine, query)

        # Add enrollment status label
        enrollments_df['enrollment_status_label'] = enrollments_df['enrollment_status'].map({
            0: 'Active',
            1: 'Suspended'
        })

        print(f"✓ Extracted {len(enrollments_df)} enrollments")

        self.extracted_data['enrollments'] = enrollments_df
        return enrollments_df

    def extract_quiz_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Extract quiz, quiz attempts, and quiz grades for pre/post-test analysis.

        Returns:
            Tuple of (quizzes, attempts, grades) DataFrames
        """
        print("\n" + "="*80)
        print("EXTRACTING QUIZ/ASSESSMENT DATA")
        print("="*80)

        # Quizzes
        quiz_query = """
        SELECT
            q.id as quiz_id,
            q.course as course_id,
            q.name as quiz_name,
            q.intro as quiz_description,
            q.timeopen as quiz_open_timestamp,
            q.timeclose as quiz_close_timestamp,
            q.timelimit as time_limit_seconds,
            q.grade as max_grade,
            q.sumgrades as sum_grades,
            FROM_UNIXTIME(q.timeopen) as quiz_open_date,
            FROM_UNIXTIME(q.timeclose) as quiz_close_date
        FROM mdl_quiz q
        ORDER BY q.course, q.id
        """
        quizzes_df = fetch_data(self.engine, quiz_query)

        # Categorize quizzes as pre-test or post-test
        def categorize_quiz(quiz_name):
            """Identify pre-test vs post-test"""
            name_lower = str(quiz_name).lower()
            if 'pre' in name_lower or 'pre-test' in name_lower or 'pretest' in name_lower:
                return 'Pre-Test'
            elif 'post' in name_lower or 'post-test' in name_lower or 'posttest' in name_lower:
                return 'Post-Test'
            else:
                return 'Assessment'

        quizzes_df['assessment_type'] = quizzes_df['quiz_name'].apply(categorize_quiz)

        # Quiz attempts
        attempts_query = """
        SELECT
            qa.id as attempt_id,
            qa.quiz as quiz_id,
            qa.userid as user_id,
            qa.attempt as attempt_number,
            qa.state as attempt_state,
            qa.timestart as attempt_start_timestamp,
            qa.timefinish as attempt_finish_timestamp,
            qa.timemodified as attempt_modified_timestamp,
            qa.sumgrades as attempt_score,
            FROM_UNIXTIME(qa.timestart) as attempt_start_date,
            FROM_UNIXTIME(qa.timefinish) as attempt_finish_date,
            TIMESTAMPDIFF(SECOND, FROM_UNIXTIME(qa.timestart), FROM_UNIXTIME(qa.timefinish)) as duration_seconds
        FROM mdl_quiz_attempts qa
        WHERE qa.state = 'finished'
        ORDER BY qa.userid, qa.quiz, qa.attempt
        """
        attempts_df = fetch_data(self.engine, attempts_query)

        # Quiz grades
        grades_query = """
        SELECT
            qg.id as grade_id,
            qg.quiz as quiz_id,
            qg.userid as user_id,
            qg.grade as final_grade,
            qg.timemodified as grade_timestamp,
            FROM_UNIXTIME(qg.timemodified) as grade_date
        FROM mdl_quiz_grades qg
        ORDER BY qg.userid, qg.quiz
        """
        grades_df = fetch_data(self.engine, grades_query)

        print(f"✓ Extracted {len(quizzes_df)} quizzes")
        print(f"✓ Extracted {len(attempts_df)} quiz attempts")
        print(f"✓ Extracted {len(grades_df)} quiz grades")
        print("\nQuiz type distribution:")
        print(quizzes_df['assessment_type'].value_counts())

        self.extracted_data['quizzes'] = quizzes_df
        self.extracted_data['quiz_attempts'] = attempts_df
        self.extracted_data['quiz_grades'] = grades_df

        return quizzes_df, attempts_df, grades_df

    def extract_assignment_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Extract assignment, submission, and grading data.

        Returns:
            Tuple of (assignments, submissions, grades) DataFrames
        """
        print("\n" + "="*80)
        print("EXTRACTING ASSIGNMENT DATA")
        print("="*80)

        # Assignments
        assign_query = """
        SELECT
            a.id as assignment_id,
            a.course as course_id,
            a.name as assignment_name,
            a.intro as assignment_description,
            a.duedate as due_timestamp,
            a.allowsubmissionsfromdate as submissions_from_timestamp,
            a.grade as max_grade,
            FROM_UNIXTIME(a.duedate) as due_date,
            FROM_UNIXTIME(a.allowsubmissionsfromdate) as submissions_from_date
        FROM mdl_assign a
        ORDER BY a.course, a.id
        """
        assignments_df = fetch_data(self.engine, assign_query)

        # Submissions
        submissions_query = """
        SELECT
            asub.id as submission_id,
            asub.assignment as assignment_id,
            asub.userid as user_id,
            asub.status as submission_status,
            asub.timecreated as submission_created_timestamp,
            asub.timemodified as submission_modified_timestamp,
            asub.attemptnumber as attempt_number,
            asub.latest as is_latest,
            FROM_UNIXTIME(asub.timecreated) as submission_created_date,
            FROM_UNIXTIME(asub.timemodified) as submission_modified_date
        FROM mdl_assign_submission asub
        ORDER BY asub.userid, asub.assignment
        """
        submissions_df = fetch_data(self.engine, submissions_query)

        # Grades
        assign_grades_query = """
        SELECT
            ag.id as grade_id,
            ag.assignment as assignment_id,
            ag.userid as user_id,
            ag.grade as grade,
            ag.grader as grader_id,
            ag.attemptnumber as attempt_number,
            ag.timecreated as grade_created_timestamp,
            ag.timemodified as grade_modified_timestamp,
            FROM_UNIXTIME(ag.timecreated) as grade_created_date,
            FROM_UNIXTIME(ag.timemodified) as grade_modified_date
        FROM mdl_assign_grades ag
        WHERE ag.grade >= 0  -- Exclude ungraded submissions
        ORDER BY ag.userid, ag.assignment
        """
        assign_grades_df = fetch_data(self.engine, assign_grades_query)

        print(f"✓ Extracted {len(assignments_df)} assignments")
        print(f"✓ Extracted {len(submissions_df)} submissions")
        print(f"✓ Extracted {len(assign_grades_df)} assignment grades")

        self.extracted_data['assignments'] = assignments_df
        self.extracted_data['assignment_submissions'] = submissions_df
        self.extracted_data['assignment_grades'] = assign_grades_df

        return assignments_df, submissions_df, assign_grades_df

    def extract_feedback_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Extract feedback/survey responses for training usefulness assessment.

        Returns:
            Tuple of (feedback_activities, responses) DataFrames
        """
        print("\n" + "="*80)
        print("EXTRACTING FEEDBACK/SURVEY DATA")
        print("="*80)

        # Feedback activities
        feedback_query = """
        SELECT
            f.id as feedback_id,
            f.course as course_id,
            f.name as feedback_name,
            f.intro as feedback_description,
            f.timeopen as feedback_open_timestamp,
            f.timeclose as feedback_close_timestamp,
            FROM_UNIXTIME(f.timeopen) as feedback_open_date,
            FROM_UNIXTIME(f.timeclose) as feedback_close_date
        FROM mdl_feedback f
        ORDER BY f.course, f.id
        """
        feedback_df = fetch_data(self.engine, feedback_query)

        # Completed responses
        responses_query = """
        SELECT
            fc.id as response_id,
            fc.feedback as feedback_id,
            fc.userid as user_id,
            fc.timemodified as response_timestamp,
            FROM_UNIXTIME(fc.timemodified) as response_date
        FROM mdl_feedback_completed fc
        ORDER BY fc.userid, fc.feedback
        """
        responses_df = fetch_data(self.engine, responses_query)

        # Questionnaire data
        questionnaire_query = """
        SELECT
            qr.id as questionnaire_response_id,
            qr.questionnaireid as questionnaire_id,
            qr.userid as user_id,
            qr.submitted as submission_timestamp,
            qr.complete as is_complete,
            FROM_UNIXTIME(qr.submitted) as submission_date
        FROM mdl_questionnaire_response qr
        ORDER BY qr.userid, qr.questionnaireid
        """
        questionnaire_df = fetch_data(self.engine, questionnaire_query)

        print(f"✓ Extracted {len(feedback_df)} feedback activities")
        print(f"✓ Extracted {len(responses_df)} feedback responses")
        print(f"✓ Extracted {len(questionnaire_df)} questionnaire responses")

        self.extracted_data['feedback'] = feedback_df
        self.extracted_data['feedback_responses'] = responses_df
        self.extracted_data['questionnaire_responses'] = questionnaire_df

        return feedback_df, responses_df

    def extract_completion_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Extract course and module completion data.

        Returns:
            Tuple of (course_completions, module_completions, certificates) DataFrames
        """
        print("\n" + "="*80)
        print("EXTRACTING COMPLETION DATA")
        print("="*80)

        # Course completions
        course_completion_query = """
        SELECT
            cc.id as completion_id,
            cc.userid as user_id,
            cc.course as course_id,
            cc.timeenrolled as enrollment_timestamp,
            cc.timestarted as started_timestamp,
            cc.timecompleted as completed_timestamp,
            cc.reaggregate,
            FROM_UNIXTIME(cc.timeenrolled) as enrollment_date,
            FROM_UNIXTIME(cc.timestarted) as started_date,
            FROM_UNIXTIME(cc.timecompleted) as completed_date
        FROM mdl_course_completions cc
        WHERE cc.timecompleted IS NOT NULL
        ORDER BY cc.userid, cc.course
        """
        course_completions_df = fetch_data(self.engine, course_completion_query)

        # Module completions
        module_completion_query = """
        SELECT
            cmc.id as module_completion_id,
            cmc.coursemoduleid as course_module_id,
            cmc.userid as user_id,
            cmc.completionstate as completion_state,
            cmc.timemodified as completion_timestamp,
            FROM_UNIXTIME(cmc.timemodified) as completion_date
        FROM mdl_course_modules_completion cmc
        WHERE cmc.completionstate > 0
        ORDER BY cmc.userid, cmc.coursemoduleid
        """
        module_completions_df = fetch_data(self.engine, module_completion_query)

        # Map completion states
        module_completions_df['completion_state_label'] = module_completions_df['completion_state'].map({
            1: 'Complete',
            2: 'Complete (Pass)',
            3: 'Complete (Fail)'
        })

        # Certificates issued
        certificates_query = """
        SELECT
            ci.id as certificate_issue_id,
            ci.userid as user_id,
            ci.customcertid as certificate_id,
            ci.code as certificate_code,
            ci.timecreated as issued_timestamp,
            FROM_UNIXTIME(ci.timecreated) as issued_date
        FROM mdl_customcert_issues ci
        ORDER BY ci.userid, ci.customcertid
        """
        certificates_df = fetch_data(self.engine, certificates_query)

        print(f"✓ Extracted {len(course_completions_df)} course completions")
        print(f"✓ Extracted {len(module_completions_df)} module completions")
        print(f"✓ Extracted {len(certificates_df)} certificates issued")

        self.extracted_data['course_completions'] = course_completions_df
        self.extracted_data['module_completions'] = module_completions_df
        self.extracted_data['certificates'] = certificates_df

        return course_completions_df, module_completions_df, certificates_df

    def extract_activity_logs(self, limit: int = 100000) -> pd.DataFrame:
        """
        Extract user activity logs for engagement analysis.

        Args:
            limit: Maximum number of log entries to retrieve

        Returns:
            DataFrame with activity log data
        """
        print("\n" + "="*80)
        print("EXTRACTING ACTIVITY LOGS")
        print("="*80)

        query = f"""
        SELECT
            l.id as log_id,
            l.userid as user_id,
            l.courseid as course_id,
            l.eventname,
            l.component,
            l.action,
            l.target,
            l.timecreated as event_timestamp,
            FROM_UNIXTIME(l.timecreated) as event_date,
            l.origin,
            l.ip
        FROM mdl_logstore_standard_log l
        WHERE l.userid > 2  -- Exclude system users
        ORDER BY l.timecreated DESC
        LIMIT {limit}
        """

        logs_df = fetch_data(self.engine, query)

        print(f"✓ Extracted {len(logs_df)} activity log entries")

        self.extracted_data['activity_logs'] = logs_df
        return logs_df

    @staticmethod
    def extract_first_span_content(html_string):
        """
        Extracts the text content from the first <span> tag in an HTML string.
        Returns an empty string if no <span> tag is found.
        """
        if pd.isna(html_string):
            return None
        if not isinstance(html_string, str):
            return html_string
        if not html_string.startswith('<span'):
            return html_string
        match = re.search(r'<span[^>]*>(.*?)</span>', str(html_string))
        if match:
            return match.group(1)
        return ''

    def extract_all_data(self):
        """Execute all extraction functions"""
        print("\n" + "="*80)
        print("STARTING COMPREHENSIVE DATA EXTRACTION")
        print("="*80)

        self.extract_users()
        self.extract_courses()
        self.extract_user_roles()
        self.extract_enrollments()
        self.extract_quiz_data()
        self.extract_assignment_data()
        self.extract_feedback_data()
        self.extract_completion_data()
        self.extract_activity_logs()

        # Clean the columns
        for name, df in self.extracted_data.items():
            self.extracted_data[name] = df.applymap(self.extract_first_span_content)

        print("\n" + "="*80)
        print("DATA EXTRACTION COMPLETE")
        print("="*80)
        print(f"\nTotal datasets extracted: {len(self.extracted_data)}")
        print("\nDataset summary:")
        for name, df in self.extracted_data.items():
            print(f"  - {name}: {len(df)} records")


# ============================================================================
# SECTION 4: MEL INDICATOR CALCULATIONS
# ============================================================================

class MELIndicatorCalculator:
    """Calculate MEL indicators from extracted data"""

    def __init__(self, extracted_data: Dict[str, pd.DataFrame]):
        self.data = extracted_data
        self.indicators = {}

    def calculate_knowledge_increase(self) -> pd.DataFrame:
        """
        Calculate knowledge increase (pre-test vs post-test scores).

        Returns:
            DataFrame with knowledge increase metrics per user and course
        """
        print("\n" + "="*80)
        print("CALCULATING KNOWLEDGE INCREASE INDICATORS")
        print("="*80)

        quizzes = self.data['quizzes']
        grades = self.data['quiz_grades']

        # Merge quiz metadata with grades
        quiz_results = grades.merge(
            quizzes[['quiz_id', 'course_id', 'assessment_type', 'max_grade']],
            on='quiz_id',
            how='left'
        )

        # Separate pre and post tests
        pre_tests = quiz_results[quiz_results['assessment_type'] == 'Pre-Test']
        post_tests = quiz_results[quiz_results['assessment_type'] == 'Post-Test']

        # Merge pre and post test scores
        knowledge_increase = pre_tests.merge(
            post_tests,
            on=['user_id', 'course_id'],
            suffixes=('_pre', '_post'),
            how='outer'
        )

        # Calculate metrics
        knowledge_increase['pre_test_score'] = knowledge_increase['final_grade_pre']
        knowledge_increase['post_test_score'] = knowledge_increase['final_grade_post']
        knowledge_increase['score_increase_absolute'] = (
            knowledge_increase['post_test_score'] - knowledge_increase['pre_test_score']
        )
        knowledge_increase['score_increase_percentage'] = (
            (knowledge_increase['score_increase_absolute'] / knowledge_increase['pre_test_score']) * 100
        )
        knowledge_increase['knowledge_improved'] = knowledge_increase['score_increase_absolute'] > 0

        # Select relevant columns
        result = knowledge_increase[[
            'user_id', 'course_id',
            'pre_test_score', 'post_test_score',
            'score_increase_absolute', 'score_increase_percentage',
            'knowledge_improved'
        ]]

        print(f"✓ Calculated knowledge increase for {len(result)} user-course pairs")
        print(f"  - Average score increase: {result['score_increase_absolute'].mean():.2f}")
        print(f"  - % with improved scores: {result['knowledge_improved'].mean()*100:.1f}%")

        self.indicators['knowledge_increase'] = result
        return result

    def calculate_completion_rates(self) -> pd.DataFrame:
        """
        Calculate course completion rates by training track and user type.

        Returns:
            DataFrame with completion rate metrics
        """
        print("\n" + "="*80)
        print("CALCULATING COMPLETION RATES")
        print("="*80)

        users = self.data['users']
        courses = self.data['courses']
        enrollments = self.data['enrollments']
        completions = self.data['course_completions']
        roles = self.data['user_roles']

        # Merge data
        completion_data = enrollments.merge(
            completions[['user_id', 'course_id', 'completed_timestamp']],
            on=['user_id', 'course_id'],
            how='left'
        )
        completion_data['completed'] = ~completion_data['completed_timestamp'].isna()

        # Add course information
        completion_data = completion_data.merge(
            courses[['course_id', 'course_name', 'training_track']],
            on='course_id',
            how='left'
        )

        # Add user demographics
        completion_data = completion_data.merge(
            users[['user_id', 'gender', 'country']],
            on='user_id',
            how='left'
        )

        # Add user type (primary role)
        user_primary_role = roles.groupby('user_id').first()[['user_type_category']].reset_index()
        completion_data = completion_data.merge(
            user_primary_role,
            on='user_id',
            how='left'
        )

        # Calculate completion rates by different dimensions
        completion_summary = completion_data.groupby(
            ['training_track', 'user_type_category', 'gender']
        ).agg({
            'user_id': 'count',
            'completed': 'sum'
        }).reset_index()

        completion_summary.columns = [
            'training_track', 'user_type', 'gender',
            'total_enrollments', 'completions'
        ]
        completion_summary['completion_rate'] = (
            completion_summary['completions'] / completion_summary['total_enrollments'] * 100
        )

        print(f"✓ Calculated completion rates")
        print("\nCompletion rate summary:")
        print(completion_summary.groupby('training_track')['completion_rate'].mean())

        self.indicators['completion_rates'] = completion_summary
        self.indicators['completion_detail'] = completion_data

        return completion_summary

    def calculate_engagement_metrics(self) -> pd.DataFrame:
        """
        Calculate engagement metrics from activity logs.

        Returns:
            DataFrame with engagement metrics per user
        """
        print("\n" + "="*80)
        print("CALCULATING ENGAGEMENT METRICS")
        print("="*80)

        users = self.data['users']
        logs = self.data.get('activity_logs', pd.DataFrame())

        if logs.empty:
            print("⚠ No activity logs available")
            return pd.DataFrame()

        # Calculate metrics per user
        engagement = logs.groupby('user_id').agg({
            'log_id': 'count',
            'event_timestamp': ['min', 'max']
        }).reset_index()

        engagement.columns = ['user_id', 'total_activities', 'first_activity', 'last_activity']

        # Calculate active days
        engagement['active_days'] = (
            pd.to_datetime(engagement['last_activity'], unit='s') -
            pd.to_datetime(engagement['first_activity'], unit='s')
        ).dt.days + 1

        engagement['avg_activities_per_day'] = (
            engagement['total_activities'] / engagement['active_days']
        )

        # Add user information
        engagement = engagement.merge(
            users[['user_id', 'firstname', 'lastname', 'email']],
            on='user_id',
            how='left'
        )

        print(f"✓ Calculated engagement metrics for {len(engagement)} users")
        print(f"  - Average activities per user: {engagement['total_activities'].mean():.1f}")
        print(f"  - Average active days: {engagement['active_days'].mean():.1f}")

        self.indicators['engagement_metrics'] = engagement
        return engagement

    def generate_mel_report(self) -> Dict[str, Any]:
        """
        Generate comprehensive MEL report.

        Returns:
            Dictionary with all MEL indicators and summary statistics
        """
        print("\n" + "="*80)
        print("GENERATING MEL REPORT")
        print("="*80)

        self.calculate_knowledge_increase()
        self.calculate_completion_rates()
        self.calculate_engagement_metrics()

        # Summary statistics
        report = {
            'generation_date': datetime.now().isoformat(),
            'total_users': len(self.data['users']),
            'total_courses': len(self.data['courses']),
            'total_enrollments': len(self.data['enrollments']),
            'total_completions': len(self.data['course_completions']),
            'indicators': self.indicators
        }

        print("\n✓ MEL Report Generated")
        print(f"  - Total Users: {report['total_users']}")
        print(f"  - Total Courses: {report['total_courses']}")
        print(f"  - Total Enrollments: {report['total_enrollments']}")
        print(f"  - Total Completions: {report['total_completions']}")

        return report


# ============================================================================
# SECTION 5: GOOGLE SHEETS EXPORT
# ============================================================================

# def export_to_google_sheets(
#     extracted_data: Dict[str, pd.DataFrame],
#     spreadsheet_name: str = 'SUSTAIN MEL Data Export (RAW)'
# ):
#     """
#     Export extracted data to Google Sheets.

#     Args:
#         extracted_data: Dictionary of DataFrames to export
#         spreadsheet_name: Name for the Google Spreadsheet
#     """
#     print("\n" + "="*80)
#     print("EXPORTING TO GOOGLE SHEETS")
#     print("="*80)

#     try:
#         # Authenticate
#         from google.colab import auth
#         import gspread
#         from gspread_dataframe import set_with_dataframe
#         from google.auth import default

#         auth.authenticate_user()
#         creds, _ = default()
#         gc = gspread.authorize(creds)

#         # Create spreadsheet
#         spreadsheet = gc.create(spreadsheet_name)
#         print(f"✓ Created spreadsheet: {spreadsheet.url}")

#         # Export each dataset
#         for sheet_name, df in extracted_data.items():
#             if df.empty:
#                 print(f"⚠ Skipping empty dataset: {sheet_name}")
#                 continue

#             # Truncate sheet name to 100 characters (Google Sheets limit)
#             sheet_title = sheet_name[:100]

#             try:
#                 # Create worksheet
#                 worksheet = spreadsheet.add_worksheet(
#                     title=sheet_title,
#                     rows=max(len(df) + 1, 2),
#                     cols=max(len(df.columns), 1)
#                 )

#                 # Write data
#                 set_with_dataframe(worksheet, df)
#                 print(f"✓ Exported {sheet_name}: {len(df)} rows × {len(df.columns)} columns")

#                 # Rate limiting
#                 time.sleep(1)

#             except Exception as e:
#                 print(f"✗ Error exporting {sheet_name}: {e}")

#         # Remove default sheet
#         try:
#             default_sheet = spreadsheet.sheet1
#             spreadsheet.del_worksheet(default_sheet)
#         except:
#             pass

#         print(f"\n✓ Export complete: {spreadsheet.url}")
#         return spreadsheet.url

#     except Exception as e:
#         print(f"✗ Error during export: {e}")
#         return None

# ============================================================================
# SECTION 5: GOOGLE SHEETS EXPORT
# ============================================================================

def export_to_google_sheets(
    extracted_data: Dict[str, pd.DataFrame],
    spreadsheet_name: str = "SUSTAIN MEL Data Export (RAW)",
    folder_name: str = "SUSTAIN_ACADEMY",
):
    import time
    from google.colab import auth
    import gspread
    from gspread_dataframe import set_with_dataframe
    from google.auth import default
    from googleapiclient.discovery import build

    auth.authenticate_user()
    creds, _ = default(scopes=[
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/spreadsheets"
    ])
    gc = gspread.authorize(creds)
    drive = build("drive", "v3", credentials=creds)

    # find or create folder
    q = (
        "mimeType='application/vnd.google-apps.folder' "
        f"and name='{folder_name}' and trashed=false"
    )
    res = drive.files().list(q=q, fields="files(id,name)").execute()
    files = res.get("files", [])
    if files:
        folder_id = files[0]["id"]
    else:
        fld = {"name": folder_name, "mimeType": "application/vnd.google-apps.folder"}
        folder_id = drive.files().create(body=fld, fields="id").execute()["id"]

    # create spreadsheet (default goes to root) and then move it to folder
    spreadsheet = gc.create(spreadsheet_name)
    file_id = spreadsheet.id  # gspread Spreadsheet has .id
    # move: add folder as parent, remove root
    drive.files().update(
        fileId=file_id,
        addParents=folder_id,
        removeParents="root",
        fields="id, parents"
    ).execute()

    # export dataframes
    for sheet_name, df in extracted_data.items():
        if df.empty:
            continue
        title = sheet_name[:100]
        try:
            worksheet = spreadsheet.add_worksheet(
                title=title,
                rows=max(len(df) + 1, 2),
                cols=max(len(df.columns), 1),
            )
            set_with_dataframe(worksheet, df)
            time.sleep(1)
        except Exception as e:
            print(f"Error exporting {sheet_name}: {e}")

    # remove default sheet if present
    try:
        spreadsheet.del_worksheet(spreadsheet.sheet1)
    except Exception:
        pass

    print(f"Export complete: {spreadsheet.url}")
    return spreadsheet.url

# ============================================================================
# SECTION 6: DATA DOCUMENTATION
# ============================================================================

def generate_data_documentation(extracted_data: Dict[str, pd.DataFrame]) -> str:
    """
    Generate comprehensive data documentation.

    Args:
        extracted_data: Dictionary of extracted DataFrames

    Returns:
        Markdown-formatted documentation string
    """
    doc = """
# SUSTAIN MEL Data Documentation
## Seefar Academy - Pathways Moodle Database

**Generated:** {generation_date}

---

## Database Overview

This documentation describes the extracted data from the Seefar Academy Moodle database
for Monitoring, Evaluation, and Learning (MEL) reporting purposes.

### Database Connection
- **Host:** {db_host}
- **Database:** {db_name}
- **Total Tables Extracted:** {num_datasets}

---

## Extracted Datasets

""".format(
        generation_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        db_host=DatabaseConfig.DB_HOST,
        db_name=DatabaseConfig.DB_NAME,
        num_datasets=len(extracted_data)
    )

    # Document each dataset
    for dataset_name, df in extracted_data.items():
        doc += f"""
### {dataset_name}

**Records:** {len(df):,}
**Columns:** {len(df.columns)}

#### Schema

| Column Name | Data Type | Sample Values |
|------------|-----------|---------------|
"""
        for col in df.columns:
            dtype = df[col].dtype
            # Get sample non-null values
            sample_vals = df[col].dropna().head(3).tolist()
            sample_str = ', '.join([str(v)[:30] for v in sample_vals])
            doc += f"| `{col}` | {dtype} | {sample_str} |\n"

        doc += "\n"

    # Add MEL indicator mappings
    doc += """
---

## MEL Indicator Mappings

### Primary MEL Indicators

1. **Number of participants completing onboarding training (disaggregated by gender)**
   - Source Tables: `course_completions`, `courses`, `users`
   - Filter: `training_track = 'Onboarding Training'`
   - Disaggregation: `gender` field from `users`

2. **Number of employers completing training (disaggregated by gender)**
   - Source Tables: `course_completions`, `courses`, `user_roles`, `users`
   - Filter: `user_type_category = 'Employer'`
   - Disaggregation: `gender` field from `users`

3. **Knowledge increase indicators (pre- vs post-test)**
   - Source Tables: `quizzes`, `quiz_grades`
   - Calculation: `post_test_score - pre_test_score`
   - Fields: `assessment_type = 'Pre-Test'` or `'Post-Test'`

4. **Training usefulness indicators (EU RASM5)**
   - Source Tables: `feedback_responses`, `questionnaire_responses`
   - Fields: Response data from feedback forms

5. **Completion of four screening levels**
   - Source Tables: `course_modules_completion`, `courses`
   - Filter: Screening/assessment modules
   - Calculation: Count of completed screening levels per user

---

## Key Relationships

### User → Course Enrollments
```
users.user_id → enrollments.user_id
courses.course_id → enrollments.course_id
```

### User → Assessments
```
users.user_id → quiz_attempts.user_id → quizzes.quiz_id
users.user_id → quiz_grades.user_id → quizzes.quiz_id
```

### User → Completions
```
users.user_id → course_completions.user_id → courses.course_id
```

### User → Demographics
```
users.user_id → user_info_data.user_id
user_info_data.fieldid → user_info_field.id
```

### User → Roles
```
users.user_id → role_assignments.user_id → role.role_id
```

---

## Date Field Standardization

All timestamp fields have been converted to readable dates:

- **Format:** YYYY-MM-DD HH:MM:SS
- **Timezone:** UTC
- **Original Format:** Unix timestamp (seconds since epoch)

### Common Date Fields

| Field Name | Description |
|-----------|-------------|
| `first_login_date` | User's first platform access |
| `last_login_date` | User's most recent platform access |
| `enrollment_date` | Course enrollment date |
| `completed_date` | Course completion date |
| `attempt_start_date` | Quiz/assessment attempt start |
| `attempt_finish_date` | Quiz/assessment attempt completion |

---

## Automated Reporting Recommendations

### Recommended Tools

1. **Google Data Studio / Looker Studio**
   - Connect directly to Google Sheets export
   - Create interactive dashboards
   - Schedule automated reports

2. **Power BI**
   - Connect to MySQL database or Google Sheets
   - Advanced analytics capabilities
   - Robust disaggregation features

3. **Tableau**
   - Visual analytics platform
   - Strong filtering and disaggregation
   - Shareable dashboards

4. **Custom Python Dashboard (Streamlit/Plotly Dash)**
   - Full control over calculations
   - Direct database connection
   - Customizable to exact MEL requirements

### Implementation Strategy

1. **Phase 1:** Manual reporting using extracted Google Sheets
2. **Phase 2:** Set up Data Studio dashboards for real-time visualization
3. **Phase 3:** Implement automated scheduled reports
4. **Phase 4:** Develop custom analytics platform if needed

---

## Usage Notes

### Data Quality Considerations

- **Deleted Users:** Excluded from extraction (`deleted = 0`)
- **System Accounts:** Excluded (user_id > 2)
- **Empty Tables:** Not included in export
- **Missing Values:** Represented as `NaN` or `NULL`

### Performance Considerations

- **Activity Logs:** Limited to recent 100,000 entries (configurable)
- **Large Tables:** May require pagination for full export
- **Rate Limiting:** Google Sheets API has rate limits

---

## Support & Maintenance

For questions or issues with this data extraction:

1. Review this documentation
2. Check the extraction logs for errors
3. Verify database connectivity
4. Contact the data team

**Last Updated:** {last_updated}
""".format(last_updated=datetime.now().strftime('%Y-%m-%d'))

    return doc


# ============================================================================
# SECTION 7: MAIN EXECUTION
# ============================================================================

def main():
    """Main execution function"""

    print("""
╔══════════════════════════════════════════════════════════════════════════╗
║                                                                          ║
║              SUSTAIN MEL DATA EXTRACTION & ANALYSIS SYSTEM               ║
║                         Seefar Academy Pathways                          ║
║                                                                          ║
║  Purpose: Extract and structure Moodle data for MEL reporting            ║
║  Version: 2.0                                                            ║
║  Date: 2026-02-03                                                        ║
║                                                                          ║
╚══════════════════════════════════════════════════════════════════════════╝
""")

    # Step 1: Get database password
    try:
        from google.colab import userdata
        db_password = userdata.get('db_password')
        print("✓ Retrieved database password from Colab secrets")
    except:
        # For local testing, use environment variable
        import os
        db_password = os.getenv('DB_PASSWORD')
        if not db_password:
            print("✗ Database password not found!")
            print("  Set 'db_password' in Colab secrets or DB_PASSWORD environment variable")
            return

    # Step 2: Create database engine
    engine = create_db_engine(db_password)

    # Step 3: Extract all data
    extractor = MELDataExtractor(engine)
    extractor.extract_all_data()

    # Step 4: Calculate MEL indicators
    calculator = MELIndicatorCalculator(extractor.extracted_data)
    mel_report = calculator.generate_mel_report()

    # Step 5: Generate documentation
    documentation = generate_data_documentation(extractor.extracted_data)

    # Save documentation locally
    try:
      from pathlib import Path
      file_path = Path('content/drive/MyDrive/SUSTAIN_ACADEMY/MEL_Data_Documentation.md')
      file_path.mkdir(parents=True, exist_ok=True)
      with open(file_path, 'w') as f:
          f.write(documentation)
      print("\n✓ Saved documentation to MEL_Data_Documentation.md")
    except Exception as e:
      pass # continue
    # Step 6: Export to Google Sheets
    all_data = {**extractor.extracted_data, **calculator.indicators}
    sheets_url = export_to_google_sheets(all_data)

    # Step 7: Print summary
    print("\n" + "="*80)
    print("EXECUTION COMPLETE")
    print("="*80)
    print(f"""
Summary:
  - Total datasets extracted: {len(extractor.extracted_data)}
  - Total indicators calculated: {len(calculator.indicators)}
  - Documentation generated: MEL_Data_Documentation.md
  - Google Sheets URL: {sheets_url if sheets_url else 'Export failed'}

Next Steps:
  1. Review the Google Sheets export
  2. Read the documentation file
  3. Set up automated dashboards
  4. Configure scheduled reporting

For support, refer to the documentation or contact the data team.
""")

    return {
        'extracted_data': extractor.extracted_data,
        'indicators': calculator.indicators,
        'mel_report': mel_report,
        'sheets_url': sheets_url
    }


# ============================================================================
# EXECUTE
# ============================================================================


if __name__ == "__main__":
    results = main()



╔══════════════════════════════════════════════════════════════════════════╗
║                                                                          ║
║              SUSTAIN MEL DATA EXTRACTION & ANALYSIS SYSTEM               ║
║                         Seefar Academy Pathways                          ║
║                                                                          ║
║  Purpose: Extract and structure Moodle data for MEL reporting            ║
║  Version: 2.0                                                            ║
║  Date: 2026-02-03                                                        ║
║                                                                          ║
╚══════════════════════════════════════════════════════════════════════════╝

✓ Retrieved database password from Colab secrets
✓ Database engine created successfully

STARTING COMPREHENSIVE DATA EXTRACTION

EXTRACTING USER DATA
✓ Fetched 2014 rows × 22 columns
✓ Fetched 5019 rows × 3 columns
✓ Extracted 2



Export complete: https://docs.google.com/spreadsheets/d/1gDRS9alGMN_TAKZFs1vJKeN2dv-rBbL1qVmeBOMZYNQ

EXECUTION COMPLETE

Summary:
  - Total datasets extracted: 17
  - Total indicators calculated: 4
  - Documentation generated: MEL_Data_Documentation.md
  - Google Sheets URL: https://docs.google.com/spreadsheets/d/1gDRS9alGMN_TAKZFs1vJKeN2dv-rBbL1qVmeBOMZYNQ

Next Steps:
  1. Review the Google Sheets export
  2. Read the documentation file
  3. Set up automated dashboards
  4. Configure scheduled reporting

For support, refer to the documentation or contact the data team.



In [None]:
"""
SUSTAIN MEL Analysis Helper Functions
=====================================

This module provides specialized analysis functions for specific MEL indicators
and reporting requirements for the SUSTAIN program.

Author: Seefar Academy Data Team
Date: 2026-02-03
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
from datetime import datetime


# ============================================================================
# PARTICIPANT VS CANDIDATE DIFFERENTIATION
# ============================================================================

def differentiate_user_groups(
    users_df: pd.DataFrame,
    enrollments_df: pd.DataFrame,
    roles_df: pd.DataFrame,
    courses_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Differentiate between different user groups for MEL reporting.

    User Categories:
    - Participants: STEM graduates/final-year students in general preparatory training
    - Candidates: Those in international placement/onboarding track
    - Employers: Employer training participants
    - Visa Facilitators: Visa facilitator training participants

    Args:
        users_df: User information DataFrame
        enrollments_df: Enrollment data DataFrame
        roles_df: User roles DataFrame
        courses_df: Course information DataFrame

    Returns:
        DataFrame with user categorization
    """
    # Merge enrollment with course information
    user_courses = enrollments_df.merge(
        courses_df[['course_id', 'training_track']],
        on='course_id',
        how='left'
    )

    # Get primary role for each user
    user_roles = roles_df.groupby('user_id').agg({
        'user_type_category': 'first',
        'role_shortname': 'first'
    }).reset_index()

    # Merge user data
    user_categorization = users_df[['user_id', 'firstname', 'lastname', 'email', 'gender']].merge(
        user_roles,
        on='user_id',
        how='left'
    )

    # Determine primary user group based on enrollment patterns
    def categorize_user(user_id):
        user_tracks = user_courses[user_courses['user_id'] == user_id]['training_track'].unique()

        if 'Employer Training' in user_tracks:
            return 'Employer'
        elif 'Visa Facilitator Training' in user_tracks:
            return 'Visa Facilitator'
        elif 'Onboarding Training' in user_tracks:
            return 'Candidate (Onboarding Track)'
        elif 'General Preparatory Training' in user_tracks:
            return 'Participant (Preparatory Track)'
        elif 'SUSTAIN Training' in user_tracks:
            return 'SUSTAIN Participant'
        else:
            return 'Other'

    user_categorization['primary_user_group'] = user_categorization['user_id'].apply(categorize_user)

    # Add enrollment count by track
    enrollment_summary = user_courses.groupby(['user_id', 'training_track']).size().reset_index(name='enrollments')
    enrollment_pivot = enrollment_summary.pivot(
        index='user_id',
        columns='training_track',
        values='enrollments'
    ).fillna(0).reset_index()

    user_categorization = user_categorization.merge(enrollment_pivot, on='user_id', how='left')

    return user_categorization


# ============================================================================
# SCREENING LEVELS COMPLETION
# ============================================================================

def calculate_screening_completion(
    users_df: pd.DataFrame,
    module_completions_df: pd.DataFrame,
    courses_df: pd.DataFrame,
    course_modules_df: pd.DataFrame = None
) -> pd.DataFrame:
    """
    Calculate completion of four screening levels for onboarding eligibility.

    Screening levels are typically:
    1. Initial Assessment
    2. Technical Skills Test
    3. Language Proficiency
    4. Interview/Behavioral Assessment

    Args:
        users_df: User information DataFrame
        module_completions_df: Module completion data
        courses_df: Course information
        course_modules_df: Optional course module metadata

    Returns:
        DataFrame with screening level completion status
    """
    # Filter for onboarding courses
    onboarding_courses = courses_df[
        courses_df['training_track'] == 'Onboarding Training'
    ]['course_id'].tolist()

    # Get completions for onboarding modules
    screening_completions = module_completions_df[
        module_completions_df['course_module_id'].isin(onboarding_courses)
    ].copy()

    # Count completed screening levels per user
    screening_summary = screening_completions.groupby('user_id').agg({
        'module_completion_id': 'count',
        'completion_state': lambda x: (x == 2).sum()  # Count "Complete (Pass)"
    }).reset_index()

    screening_summary.columns = ['user_id', 'total_modules_completed', 'modules_passed']

    # Merge with user information
    result = users_df[['user_id', 'firstname', 'lastname', 'email', 'gender']].merge(
        screening_summary,
        on='user_id',
        how='left'
    ).fillna(0)

    # Determine if user completed all four screening levels
    # This assumes 4 key screening modules - adjust based on actual structure
    result['completed_all_four_levels'] = result['modules_passed'] >= 4
    result['eligible_for_onboarding'] = result['completed_all_four_levels']

    return result


# ============================================================================
# EMPLOYER-SPECIFIC METRICS
# ============================================================================

def calculate_employer_understanding_improvement(
    users_df: pd.DataFrame,
    quiz_grades_df: pd.DataFrame,
    quizzes_df: pd.DataFrame,
    roles_df: pd.DataFrame,
    courses_df: pd.DataFrame,
    feedback_df: pd.DataFrame = None
) -> pd.DataFrame:
    """
    Calculate improvement in employer understanding of international labour markets.

    MEL Indicator: Employers completing training and reporting improved understanding

    Args:
        users_df: User information
        quiz_grades_df: Quiz grades
        quizzes_df: Quiz metadata
        roles_df: User roles
        courses_df: Course information
        feedback_df: Optional feedback responses

    Returns:
        DataFrame with employer improvement metrics
    """

    # Merge quiz metadata with grades
    quiz_results = quiz_grades_df.merge(
        quizzes_df[['quiz_id', 'course_id', 'assessment_type', 'max_grade']],
        on='quiz_id',
        how='left'
    )
    # Identify employers
    employers = roles_df[roles_df['user_type_category'] == 'Employer']['user_id'].unique()

    # Get employer training courses
    employer_courses = courses_df[
        courses_df['training_track'] == 'Employer Training'
    ]['course_id'].tolist()

    # Get quiz results for employers in employer training
    employer_quiz_results = quiz_results[
        (quiz_results['user_id'].isin(employers)) &
        (quiz_results['course_id'].isin(employer_courses))
    ]

    # Separate pre and post tests
    pre_tests = employer_quiz_results[
        employer_quiz_results['assessment_type'] == 'Pre-Test'
    ][['user_id', 'course_id', 'final_grade']].rename(
        columns={'final_grade': 'pre_test_score'}
    )

    post_tests = employer_quiz_results[
        employer_quiz_results['assessment_type'] == 'Post-Test'
    ][['user_id', 'course_id', 'final_grade']].rename(
        columns={'final_grade': 'post_test_score'}
    )

    # Merge pre and post
    employer_improvement = pre_tests.merge(
        post_tests,
        on=['user_id', 'course_id'],
        how='outer'
    )

    # Calculate improvement
    employer_improvement['score_improvement'] = (
        employer_improvement['post_test_score'] - employer_improvement['pre_test_score']
    )
    employer_improvement['improved_understanding'] = employer_improvement['score_improvement'] > 0

    # Add user information
    result = employer_improvement.merge(
        users_df[['user_id', 'firstname', 'lastname', 'email', 'gender', 'country']],
        on='user_id',
        how='left'
    )

    return result


# ============================================================================
# TRAINING USEFULNESS (EU RASM5)
# ============================================================================

def analyze_training_usefulness(
    users_df: pd.DataFrame,
    feedback_responses_df: pd.DataFrame,
    questionnaire_responses_df: pd.DataFrame,
    courses_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Analyze participants' perceived usefulness of training.

    EU RASM5 Indicator: Participant perception of training usefulness

    Args:
        users_df: User information
        feedback_responses_df: Feedback submissions
        questionnaire_responses_df: Questionnaire responses
        courses_df: Course information

    Returns:
        DataFrame with usefulness ratings by course and user type
    """
    # Combine feedback and questionnaire responses
    all_responses = pd.concat([
        feedback_responses_df[['user_id', 'feedback_id', 'response_date']].assign(
            response_type='feedback'
        ),
        questionnaire_responses_df[['user_id', 'questionnaire_id', 'submission_date']].rename(
            columns={'questionnaire_id': 'feedback_id', 'submission_date': 'response_date'}
        ).assign(response_type='questionnaire')
    ])

    # Count responses per user
    response_summary = all_responses.groupby('user_id').agg({
        'feedback_id': 'count',
        'response_date': 'max'
    }).reset_index()

    response_summary.columns = ['user_id', 'total_feedback_submissions', 'last_feedback_date']

    # Add user demographics
    result = users_df[['user_id', 'firstname', 'lastname', 'email', 'gender']].merge(
        response_summary,
        on='user_id',
        how='left'
    ).fillna(0)

    result['provided_feedback'] = result['total_feedback_submissions'] > 0

    return result


# ============================================================================
# DISAGGREGATION HELPER FUNCTIONS
# ============================================================================

def disaggregate_by_gender(
    data_df: pd.DataFrame,
    metric_column: str,
    users_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Disaggregate any metric by gender.

    Args:
        data_df: DataFrame containing the data to disaggregate
        metric_column: Column name containing the metric
        users_df: User DataFrame with gender information

    Returns:
        DataFrame with gender disaggregation
    """
    # Merge gender information
    with_gender = data_df.merge(
        users_df[['user_id', 'gender']],
        on=('user_id', "gender"),
        how='left'
    )

    # Group by gender and calculate statistics
    disaggregated = with_gender.groupby('gender').agg({
        metric_column: ['count', 'sum', 'mean'],
        'user_id': 'nunique'
    }).reset_index()

    disaggregated.columns = [
        'gender',
        f'{metric_column}_count',
        f'{metric_column}_sum',
        f'{metric_column}_mean',
        'unique_users'
    ]

    return disaggregated


def disaggregate_by_country(
    data_df: pd.DataFrame,
    metric_column: str,
    users_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Disaggregate any metric by country.

    Args:
        data_df: DataFrame containing the data to disaggregate
        metric_column: Column name containing the metric
        users_df: User DataFrame with country information

    Returns:
        DataFrame with country disaggregation
    """
    # Merge country information
    with_country = data_df.merge(
        users_df[['user_id', 'country']],
        on='user_id',
        how='left'
    )

    # Group by country and calculate statistics
    disaggregated = with_country.groupby('country').agg({
        metric_column: ['count', 'sum', 'mean'],
        'user_id': 'nunique'
    }).reset_index()

    disaggregated.columns = [
        'country',
        f'{metric_column}_count',
        f'{metric_column}_sum',
        f'{metric_column}_mean',
        'unique_users'
    ]

    return disaggregated


# ============================================================================
# COMPREHENSIVE MEL DASHBOARD DATA GENERATOR
# ============================================================================

def generate_mel_dashboard_data(
    users_df: pd.DataFrame,
    courses_df: pd.DataFrame,
    enrollments_df: pd.DataFrame,
    completions_df: pd.DataFrame,
    quiz_grades_df: pd.DataFrame,
    quizzes_df: pd.DataFrame,
    roles_df: pd.DataFrame,
    feedback_df: pd.DataFrame = None
) -> Dict[str, pd.DataFrame]:
    """
    Generate all necessary data for MEL dashboard.

    This function creates a comprehensive set of dashboard-ready datasets
    for all key MEL indicators.

    Returns:
        Dictionary of DataFrames ready for dashboard consumption
    """
    dashboard_data = {}

    # 1. User categorization
    print("Generating user categorization...")
    dashboard_data['user_categories'] = differentiate_user_groups(
        users_df, enrollments_df, roles_df, courses_df
    )

    # 2. Completion rates by track and gender
    print("Calculating completion rates...")
    completions_completion = completions_df.merge(
        users_df[['user_id', 'gender', 'country']],
        on='user_id'
    )
    completion_data = completions_completion.merge(
        courses_df[['course_id', 'training_track']],
        on='course_id'
    )

    dashboard_data['completion_by_track_gender'] = completion_data.groupby(
        ['training_track', 'gender']
    ).agg({
        'user_id': 'count',
        'course_id': 'nunique'
    }).reset_index().rename(columns={
        'user_id': 'completions',
        'course_id': 'unique_courses'
    })

    # 3. Knowledge increase summary
    print("Calculating knowledge increase...")
    quiz_results = quiz_grades_df.merge(
        quizzes_df[['quiz_id', 'course_id', 'assessment_type']],
        on='quiz_id'
    )

    pre_tests = quiz_results[quiz_results['assessment_type'] == 'Pre-Test'][
        ['user_id', 'course_id', 'final_grade']
    ].rename(columns={'final_grade': 'pre_score'})

    post_tests = quiz_results[quiz_results['assessment_type'] == 'Post-Test'][
        ['user_id', 'course_id', 'final_grade']
    ].rename(columns={'final_grade': 'post_score'})

    knowledge_increase = pre_tests.merge(post_tests, on=['user_id', 'course_id'], how='outer')
    knowledge_increase['score_increase'] = knowledge_increase['post_score'] - knowledge_increase['pre_score']
    knowledge_increase['improved'] = knowledge_increase['score_increase'] > 0

    # Add demographics
    knowledge_increase = knowledge_increase.merge(
        users_df[['user_id', 'gender', 'country']],
        on='user_id'
    ).merge(
        courses_df[['course_id', 'training_track']],
        on='course_id'
    )

    dashboard_data['knowledge_increase'] = knowledge_increase

    # 4. Summary statistics for quick metrics
    print("Generating summary statistics...")
    dashboard_data['summary_stats'] = pd.DataFrame({
        'metric': [
            'Total Users',
            'Total Courses',
            'Total Enrollments',
            'Total Completions',
            'Onboarding Candidates',
            'Preparatory Participants',
            'Employers',
            'Average Knowledge Increase'
        ],
        'value': [
            len(users_df),
            len(courses_df),
            len(enrollments_df),
            len(completions_df),
            len(dashboard_data['user_categories'][
                dashboard_data['user_categories']['primary_user_group'] == 'Candidate (Onboarding Track)'
            ]),
            len(dashboard_data['user_categories'][
                dashboard_data['user_categories']['primary_user_group'] == 'Participant (Preparatory Track)'
            ]),
            len(dashboard_data['user_categories'][
                dashboard_data['user_categories']['primary_user_group'] == 'Employer'
            ]),
            knowledge_increase['score_increase'].mean() if len(knowledge_increase) > 0 else 0
        ]
    })

    print(f"✓ Generated {len(dashboard_data)} dashboard datasets")
    return dashboard_data


# ============================================================================
# EXPORT HELPER
# ============================================================================

def export_mel_summary_report(dashboard_data: Dict[str, pd.DataFrame], output_path: str = 'mel_summary.xlsx'):
    """
    Export MEL dashboard data to Excel with multiple sheets.

    Args:
        dashboard_data: Dictionary of DataFrames
        output_path: Path for output Excel file
    """
    try:
        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            for sheet_name, df in dashboard_data.items():
                df.to_excel(writer, sheet_name=sheet_name[:31], index=False)

        print(f"✓ Exported MEL summary to {output_path}")
        return output_path
    except Exception as e:
        print(f"✗ Error exporting to Excel: {e}")
        return None


In [None]:


if __name__ == "__main__":

    # Step 1: Get database password
    try:
        from google.colab import userdata
        db_password = userdata.get('db_password')
        print("✓ Retrieved database password from Colab secrets")
    except:
        # For local testing, use environment variable
        import os
        db_password = os.getenv('DB_PASSWORD')
        if not db_password:
            print("✗ Database password not found!")
            print("  Set 'db_password' in Colab secrets or DB_PASSWORD environment variable")
            sys.exit(1)
        else:
            print("✓ Retrieved database password from environment variable")

    # Step 2: Create database engine
    engine = create_db_engine(db_password)

    # Step 3: Extract all data
    extractor = MELDataExtractor(engine)
    extractor.extract_all_data()

✓ Retrieved database password from Colab secrets
✓ Database engine created successfully

STARTING COMPREHENSIVE DATA EXTRACTION

EXTRACTING USER DATA
✓ Fetched 2005 rows × 22 columns
✓ Fetched 4992 rows × 3 columns
✓ Extracted 2005 users with demographics

EXTRACTING COURSE DATA
✓ Fetched 29 rows × 14 columns
✓ Extracted 29 courses

Course distribution by training track:
training_track
Other    29
Name: count, dtype: int64

EXTRACTING USER ROLES
✓ Fetched 1495 rows × 9 columns
✓ Extracted 1495 role assignments

User type distribution:
user_type_category
Participant/Candidate     1460
Other                       16
Manager                     12
Facilitator/Instructor       7
Name: count, dtype: int64

EXTRACTING ENROLLMENT DATA
✓ Fetched 1472 rows × 13 columns
✓ Extracted 1472 enrollments

EXTRACTING QUIZ/ASSESSMENT DATA
✓ Fetched 299 rows × 11 columns
✓ Fetched 6929 rows × 12 columns
✓ Fetched 5328 rows × 6 columns
✓ Extracted 299 quizzes
✓ Extracted 6929 quiz attempts
✓ Extracted 532

In [None]:
# Differentiate user types
user_categories = differentiate_user_groups(
    extractor.extracted_data['users'],
    extractor.extracted_data['enrollments'],
    extractor.extracted_data['user_roles'],
    extractor.extracted_data['courses']
)

# Calculate screening completion
screening_status = calculate_screening_completion(
    extractor.extracted_data['users'],
    extractor.extracted_data['module_completions'],
    extractor.extracted_data['courses']
)

# Calculate employers understanding rate
employers_understanding_rate = calculate_employer_understanding_improvement(
    extractor.extracted_data['users'],
    extractor.extracted_data['quiz_grades'],
    extractor.extracted_data['quizzes'],
    extractor.extracted_data['user_roles'],
    extractor.extracted_data['courses']
)

training_completion = analyze_training_usefulness(
    extractor.extracted_data['users'],
    extractor.extracted_data['feedback_responses'],
    extractor.extracted_data['questionnaire_responses'],
    extractor.extracted_data['courses']
)

gender_disagg = disaggregate_by_gender(
    training_completion,
    'provided_feedback',
    extractor.extracted_data['users']
)

country_disagg = disaggregate_by_country(
    training_completion,
    'provided_feedback',
    extractor.extracted_data['users']
)

dashboard_data = generate_mel_dashboard_data(
    extractor.extracted_data['users'],
    extractor.extracted_data['courses'],
    extractor.extracted_data['enrollments'],
    extractor.extracted_data['course_completions'], # Changed from module_completions
    extractor.extracted_data['quiz_grades'],
    extractor.extracted_data['quizzes'],
    extractor.extracted_data['user_roles'],
    feedback_df=extractor.extracted_data['feedback']
)

In [None]:

display(f"Differentiation between user types:")
display(user_categories)
display(f"✓ {len(user_categories)} user categories calculated")


display(f"Screening completion:")
display(screening_status)
display(f"✓ Screening completion calculated")


display(f"Employers understanding rate:")
display(employers_understanding_rate)
display(f"✓ Employers understanding rate calculated")

display(f"Training usefulness:")
display(training_completion)
display(f"✓ Training usefulness calculated")

display(f"Gender disaggregation:")
display(gender_disagg)
display(f"✓ Gender disaggregation calculated")

display(f"Country disaggregation:")
display(country_disagg)
display(f"✓ Country disaggregation calculated")

display(f"Dashboard data:")
for sheet_name, df in dashboard_data.items():
    display(df.head())
display(f"✓ Dashboard data generated")


'Differentiation between user types:'

Unnamed: 0,user_id,firstname,lastname,email,gender,user_type_category,role_shortname,primary_user_group,Other
0,3,Student,User,nabilah@tyne-solutions.com,Other,,,Other,
1,6,Student,Demo,demo@test-email.com,Female,,,Other,
2,7,Dedy,A,dedy@seefar.org,Male,,,Other,
3,8,Admin,Seefar,sa-administrator@seefaracademy.org,Female,,,Other,
4,9,Gerardo,Molina,gerardo.m@seefar.org,"<span lang=""en"" class=""multilang"">Male</span><...",,,Other,
...,...,...,...,...,...,...,...,...,...
2000,2859,Justine,Baldessin,justine_baldessin_4535@spacemaiil.ru,"<span lang=""en"" class=""multilang"">Other</span>...",,,Other,
2001,2860,Kashish,Suvarna,kashish@seefar.org,"<span lang=""en"" class=""multilang"">Female</span...",,,Other,
2002,2861,Sibyl,Sheil,sibylsheil-3442@smass.store,"<span lang=""en"" class=""multilang"">Other</span>...",,,Other,
2003,2862,Andre,Taber,andre.taber6012@poochta.ru,"<span lang=""en"" class=""multilang"">Other</span>...",,,Other,


'✓ 2005 user categories calculated'

'Screening completion:'

Unnamed: 0,user_id,firstname,lastname,email,gender,total_modules_completed,modules_passed,completed_all_four_levels,eligible_for_onboarding
0,3,Student,User,nabilah@tyne-solutions.com,Other,0.0,0.0,False,False
1,6,Student,Demo,demo@test-email.com,Female,0.0,0.0,False,False
2,7,Dedy,A,dedy@seefar.org,Male,0.0,0.0,False,False
3,8,Admin,Seefar,sa-administrator@seefaracademy.org,Female,0.0,0.0,False,False
4,9,Gerardo,Molina,gerardo.m@seefar.org,"<span lang=""en"" class=""multilang"">Male</span><...",0.0,0.0,False,False
...,...,...,...,...,...,...,...,...,...
2000,2859,Justine,Baldessin,justine_baldessin_4535@spacemaiil.ru,"<span lang=""en"" class=""multilang"">Other</span>...",0.0,0.0,False,False
2001,2860,Kashish,Suvarna,kashish@seefar.org,"<span lang=""en"" class=""multilang"">Female</span...",0.0,0.0,False,False
2002,2861,Sibyl,Sheil,sibylsheil-3442@smass.store,"<span lang=""en"" class=""multilang"">Other</span>...",0.0,0.0,False,False
2003,2862,Andre,Taber,andre.taber6012@poochta.ru,"<span lang=""en"" class=""multilang"">Other</span>...",0.0,0.0,False,False


'✓ Screening completion calculated'

'Employers understanding rate:'

Unnamed: 0,user_id,course_id,pre_test_score,post_test_score,score_improvement,improved_understanding,firstname,lastname,email,gender,country


'✓ Employers understanding rate calculated'

'Training usefulness:'

Unnamed: 0,user_id,firstname,lastname,email,gender,total_feedback_submissions,last_feedback_date,provided_feedback
0,3,Student,User,nabilah@tyne-solutions.com,Other,2.0,2023-11-27 03:39:54,True
1,6,Student,Demo,demo@test-email.com,Female,0.0,0,False
2,7,Dedy,A,dedy@seefar.org,Male,0.0,0,False
3,8,Admin,Seefar,sa-administrator@seefaracademy.org,Female,0.0,0,False
4,9,Gerardo,Molina,gerardo.m@seefar.org,"<span lang=""en"" class=""multilang"">Male</span><...",0.0,0,False
...,...,...,...,...,...,...,...,...
2000,2859,Justine,Baldessin,justine_baldessin_4535@spacemaiil.ru,"<span lang=""en"" class=""multilang"">Other</span>...",0.0,0,False
2001,2860,Kashish,Suvarna,kashish@seefar.org,"<span lang=""en"" class=""multilang"">Female</span...",0.0,0,False
2002,2861,Sibyl,Sheil,sibylsheil-3442@smass.store,"<span lang=""en"" class=""multilang"">Other</span>...",0.0,0,False
2003,2862,Andre,Taber,andre.taber6012@poochta.ru,"<span lang=""en"" class=""multilang"">Other</span>...",0.0,0,False


'✓ Training usefulness calculated'

'Gender disaggregation:'

Unnamed: 0,gender,provided_feedback_count,provided_feedback_sum,provided_feedback_mean,unique_users
0,0,342,0,0.0,342
1,,35,0,0.0,35
2,"<span lang=""en"" class=""multilang"">Female</span...",506,306,0.604743,506
3,"<span lang=""en"" class=""multilang"">Male</span><...",1062,395,0.37194,1062
4,"<span lang=""en"" class=""multilang"">Other</span>...",48,3,0.0625,48
5,Female,5,3,0.6,5
6,Male,5,2,0.4,5
7,Other,2,1,0.5,2


'✓ Gender disaggregation calculated'

'Country disaggregation:'

Unnamed: 0,country,provided_feedback_count,provided_feedback_sum,provided_feedback_mean,unique_users
0,,548,58,0.105839,548
1,AE,1,0,0.000000,1
2,AF,41,11,0.268293,41
3,AL,1,1,1.000000,1
4,AM,1,0,0.000000,1
...,...,...,...,...,...
105,YE,3,0,0.000000,3
106,YT,1,0,0.000000,1
107,ZA,2,0,0.000000,2
108,ZM,1,0,0.000000,1


'✓ Country disaggregation calculated'

'Dashboard data:'

Unnamed: 0,user_id,firstname,lastname,email,gender,user_type_category,role_shortname,primary_user_group,Other
0,3,Student,User,nabilah@tyne-solutions.com,Other,,,Other,
1,6,Student,Demo,demo@test-email.com,Female,,,Other,
2,7,Dedy,A,dedy@seefar.org,Male,,,Other,
3,8,Admin,Seefar,sa-administrator@seefaracademy.org,Female,,,Other,
4,9,Gerardo,Molina,gerardo.m@seefar.org,"<span lang=""en"" class=""multilang"">Male</span><...",,,Other,


Unnamed: 0,training_track,gender,completions,unique_courses
0,Other,"<span lang=""en"" class=""multilang"">Female</span...",170,4
1,Other,"<span lang=""en"" class=""multilang"">Male</span><...",200,10
2,Other,Male,1,1


Unnamed: 0,user_id,course_id,pre_score,post_score,score_increase,improved,gender,country,training_track
0,23,10,22.0,,,False,Female,BH,Other
1,34,38,3.33333,,,False,"<span lang=""en"" class=""multilang"">Male</span><...",,Other
2,117,38,9.16668,,,False,"<span lang=""en"" class=""multilang"">Male</span><...",NG,Other
3,141,10,12.22222,,,False,"<span lang=""en"" class=""multilang"">Male</span><...",TR,Other
4,209,38,10.0,,,False,"<span lang=""en"" class=""multilang"">Female</span...",NG,Other


Unnamed: 0,metric,value
0,Total Users,2005.0
1,Total Courses,29.0
2,Total Enrollments,1472.0
3,Total Completions,371.0
4,Onboarding Candidates,0.0


'✓ Dashboard data generated'

In [None]:
sheet_location = export_mel_summary_report(dashboard_data)
print(f"✓ MEL summary report exported to {sheet_location}")

✓ Exported MEL summary to mel_summary.xlsx
✓ MEL summary report exported to mel_summary.xlsx
