Below is the original Python code. It covers data cleaning, pre-processing, and categorization.

* The code was formatted with AI assistance (added comments, improved readability) for portfolio purposes.
* Note that the code cannot be run as-is because the input file is not included in this repository; it is provided for reference only.

In [None]:
import pandas as pd
import numpy as np
import re

# ────────────────────────────────────────────────
# Configuration flag - controls whether we remove exact duplicate rows
# (same ID + Activity + Development Objective combination)
DEDUPLICATE_INPUT = True

# ────────────────────────────────────────────────
# Load the source Excel file
input_file = "input_file_example.xlsx"

try:
    # The three key columns are passed through ``str`` while parsing so that
    # identifier-like values keep their exact text (leading zeros, etc.).
    df = pd.read_excel(
        input_file,
        sheet_name="Sheet1",
        converters={
            "ID": str,
            "Activity": str,
            "Development Objective": str,
        },
    )
except Exception as e:
    # Top-level boundary: nothing below can run without the data, so report
    # the problem and stop with a non-zero exit status.
    print(f"Error reading Excel file: {e}")
    # ``exit`` is a helper injected by the ``site`` module for interactive
    # sessions and is not guaranteed to exist; raising SystemExit is the
    # portable way to terminate with the same status code.
    raise SystemExit(1)

# ────────────────────────────────────────────────
# Define cleaning functions for each key column

def clean_activity(value):
    """Normalise one raw Activity cell to clean text or the "N/A" placeholder.

    Args:
        value: Raw cell value. Usually a string, but ``None``, NaN and other
            scalars are accepted and converted with ``str()``.

    Returns:
        The text with control characters removed and surrounding whitespace
        trimmed, or ``"N/A"`` when the cell is missing, empty, or spells one
        of the missing markers "nan", "none", "null", "n/a", "na"
        (case-insensitive).
    """
    # Detect genuinely missing scalars *before* str(): once stringified,
    # pd.isna() can never be true, and values such as pd.NA would leak
    # through as the literal text "<NA>".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return "N/A"

    # Remove control characters first and trim afterwards. Trimming first
    # leaves stray whitespace behind whenever a control character sits at
    # the edge of the text (e.g. "\x01 \x01" would come back as " ").
    # str.strip() also removes non-breaking spaces, so "\xa0" needs no
    # separate entry in the marker set below.
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(value)).strip()

    if text.lower() in {"", "nan", "none", "null", "n/a", "na"}:
        return "N/A"
    return text

def clean_id(value):
    """Normalise one raw ID cell to clean text or the "Unknown_ID" placeholder.

    Unlike the Activity / Development Objective cleaners, the markers "n/a"
    and "na" are NOT treated as missing here; they are returned unchanged.

    Args:
        value: Raw cell value. Usually a string, but ``None``, NaN and other
            scalars are accepted and converted with ``str()``.

    Returns:
        The text with control characters removed and surrounding whitespace
        trimmed, or ``"Unknown_ID"`` when the cell is missing, empty, or
        spells "nan", "none" or "null" (case-insensitive).
    """
    # Detect genuinely missing scalars *before* str(): once stringified,
    # pd.isna() can never be true, and values such as pd.NA would leak
    # through as the literal text "<NA>".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return "Unknown_ID"

    # Remove control characters first and trim afterwards, so whitespace next
    # to a leading/trailing control character is trimmed as well.
    # str.strip() also removes non-breaking spaces, so "\xa0" needs no
    # separate entry in the marker set below.
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(value)).strip()

    if text.lower() in {"", "nan", "none", "null"}:
        return "Unknown_ID"
    return text

def clean_dev_objective(value):
    """Normalise one raw Development Objective cell to clean text or "N/A".

    Args:
        value: Raw cell value. Usually a string, but ``None``, NaN and other
            scalars are accepted and converted with ``str()``.

    Returns:
        The text with control characters removed and surrounding whitespace
        trimmed, or ``"N/A"`` when the cell is missing, empty, or spells one
        of the missing markers "nan", "none", "null", "n/a", "na"
        (case-insensitive).
    """
    # Detect genuinely missing scalars *before* str(): once stringified,
    # pd.isna() can never be true, and values such as pd.NA would leak
    # through as the literal text "<NA>".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return "N/A"

    # Remove control characters first and trim afterwards, so whitespace next
    # to a leading/trailing control character is trimmed as well.
    # str.strip() also removes non-breaking spaces, so "\xa0" needs no
    # separate entry in the marker set below.
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(value)).strip()

    if text.lower() in {"", "nan", "none", "null", "n/a", "na"}:
        return "N/A"
    return text

# Run each key column through its cleaner, then backfill anything that is
# still NaN afterwards with that column's placeholder value.
for col_name, cleaner_fn, missing_label in (
    ("Activity", clean_activity, "N/A"),
    ("ID", clean_id, "Unknown_ID"),
    ("Development Objective", clean_dev_objective, "N/A"),
):
    df[col_name] = df[col_name].apply(cleaner_fn).fillna(missing_label)

# ────────────────────────────────────────────────
# Headline counts for the cleaned data, taken before any rows are dropped
record_total = len(df)
distinct_ids = df["ID"].nunique()
distinct_activities = df["Activity"].nunique()
distinct_objectives = df["Development Objective"].nunique()

print(f"Initial input records: {record_total}")
print(f"Unique IDs: {distinct_ids}")
print(f"Unique Activities: {distinct_activities}")
print(f"Unique Development Objectives: {distinct_objectives}")

# ────────────────────────────────────────────────
# Count rows per business key (ID + Activity + Dev Objective) and report
# every key that occurs more than once
dup_key_columns = ["ID", "Activity", "Development Objective"]

id_activity_dev_counts = (
    df.groupby(dup_key_columns)
    .size()
    .reset_index(name="count")
)

repeated_key_mask = id_activity_dev_counts["count"] > 1
duplicates_id_activity_dev = id_activity_dev_counts.loc[
    repeated_key_mask, dup_key_columns + ["count"]
]

# Stay silent when every key is unique; otherwise show the first ten
if len(duplicates_id_activity_dev) > 0:
    print(f"Found {len(duplicates_id_activity_dev)} duplicate ID-Activity-Development Objective pairs:")
    print(duplicates_id_activity_dev.head(10))

# ────────────────────────────────────────────────
# Deduplication is opt-in via the DEDUPLICATE_INPUT flag
if DEDUPLICATE_INPUT:
    initial_count = len(df)
    # Rows sharing all three key columns collapse to their first occurrence
    df = df.drop_duplicates(
        subset=["ID", "Activity", "Development Objective"],
        keep="first",
    )
    remaining_count = len(df)
    print(f"Deduplicated input: {initial_count} → {remaining_count} records")

# ────────────────────────────────────────────────
# Create a surrogate key that should be unique per logical record
# Format: ID_Activity_DevObjective_rowindex
# (falls back to Unknown_rowindex when any of the three parts is missing)
#
# Built with vectorised string concatenation rather than a row-wise
# ``df.apply(..., axis=1)``: on an empty frame the row-wise apply returns a
# DataFrame instead of a Series, which cannot be assigned to a single column.
id_source_columns = ["ID", "Activity", "Development Objective"]

# Index labels rendered as text, aligned to df's own index
row_labels = pd.Series(df.index.astype(str), index=df.index)

# True where all three key parts are present
key_is_complete = df[id_source_columns].notna().all(axis=1)

composite_ids = (
    df["ID"].astype(str)
    + "_" + df["Activity"].astype(str)
    + "_" + df["Development Objective"].astype(str)
    + "_" + row_labels
)

df["Unique_Record_ID"] = composite_ids.where(key_is_complete, "Unknown_" + row_labels)

print(f"Unique Unique_Record_IDs after deduplication: {df['Unique_Record_ID'].nunique()}")

# ────────────────────────────────────────────────
# Dump the current frame and the distinct Activity values for inspection
df.to_csv("input_debug.csv", index=False)

# Distinct activities in order of first appearance, as a one-column frame
unique_activities = pd.DataFrame({"Activity": df["Activity"].unique()})
unique_activities.to_csv("unique_activities_debug.csv", index=False)

print("Raw input saved to 'input_debug.csv'")
print("Unique activities saved to 'unique_activities_debug.csv'")

# ────────────────────────────────────────────────
# For each key column, list the records that carry its placeholder value
placeholder_report_columns = ["ID", "Activity", "Development Objective", "Unique_Record_ID"]

# Activities that were blank or marked as not applicable
blank_activities = df.loc[df["Activity"] == "N/A", placeholder_report_columns]
if len(blank_activities) > 0:
    print(f"Found {len(blank_activities)} N/A or blank activities:")
    print(blank_activities.head(10))

# Records whose ID had to be replaced by the placeholder
blank_ids = df.loc[df["ID"] == "Unknown_ID", placeholder_report_columns]
if len(blank_ids) > 0:
    print(f"Found {len(blank_ids)} Unknown_ID assignments:")
    print(blank_ids.head(10))

# Development objectives that were blank or marked as not applicable
blank_dev_objectives = df.loc[df["Development Objective"] == "N/A", placeholder_report_columns]
if len(blank_dev_objectives) > 0:
    print(f"Found {len(blank_dev_objectives)} N/A Development Objectives:")
    print(blank_dev_objectives.head(10))

# ────────────────────────────────────────────────
# Keyword-based classification rules (multi-label possible)
#
# Maps each category label to a list of lower-case keywords. classify_activity()
# lower-cases the activity text and assigns a category when ANY of its keywords
# occurs in that text as a plain substring.
#
# Things to keep in mind when editing this table:
#   * Matching is by substring, not by whole word, so short keywords such as
#     "nec", "lead", "event", "class" or "visit" also match inside longer words.
#   * Some keywords appear under more than one category (e.g. "collaboration",
#     "cross-department", "handover process"); an activity containing them is
#     assigned to every such category.
#   * Misspellings such as "couse" are listed as keywords in their own right.
#   * Activities matching no keyword at all are labelled "Other" by
#     classify_activity(); "Other" is therefore not a key of this table.
activity_categories = {
    "Training / Workshop": ["training", "workshop", "course", "enroll", "structured", "class", "program", "internal courses", "external training", "leadership training", "management training", "ai course", "bim training", "business writing course", "project management training", "safety training", "communication training", "technical training", "refresher training", "in-house training", "classroom training", "job rotation", "couse", "technovation talk", "business talk series"],
    "Seminar": ["seminar", "session", "sharing session", "conference", "webinar", "forum", "exhibition", "cpd seminar", "vendor demonstration", "case study sharing", "e-talks", "symposium", "town hall", "internal talks", "technology briefings", "briefing sessions", "panel discussions", "6g summit", "expo critical communication", "summit", "expo"],
    "Self-Study": ["self-study", "self-learning", "youtube", "books", "mobile app", "reading", "online learning", "e-learning", "self-paced", "web-based", "mylearning", "self-development", "audio books", "self reading", "update oneself", "chat gpt", "chatgpt", "deepseek", "comfyui", "navisworks", "rhino", "intranet", "icao", "redhat", "primavera"],
    "Language Exchange": ["language exchange", "conversational", "practice conversational", "language practice", "language partner", "toastmasters", "impromptu speaking", "public speaking", "cantonese lessons", "writing practice"],
    "Professional Development": ["professional development", "enhance skills", "technical knowledge", "problem solving", "leadership", "management skills", "people management", "people mindset", "culture change", "emotional intelligence", "change management", "time management", "communication skill", "technical report writing", "minto pyramid principle", "intercultural communication", "business acumen", "strategic thinking", "commercial awareness", "creative problem-solving", "corporate perspective", "growth mindset", "confidence", "persuasiveness", "delegation", "negotiation", "trust", "rapport", "high-quality work", "presentation skills", "business analytics", "industry acumen", "decision-making"],
    "Involvement in Projects": ["project", "implement", "project work", "hands-on", "practical application", "involve", "administrative work", "project assignments", "nec", "special project", "operation model review", "handover process", "attachment specialist", "design development", "construction management", "system development", "lidar drone", "bim clash"],
    "Mentoring / Coaching": ["mentoring", "coaching", "guidance", "mentor", "tutoring", "one-on-one", "feedback", "supervisor", "learn from", "knowledge sharing", "facilitator", "support assistance", "transfer know-how", "develop subordinates", "guide team"],
    "Certification": ["certification", "certificate", "accreditation", "credential", "exam preparation", "register electrical worker", "intercultural communication certification", "chartered engineer", "nace coating", "cissp", "rew grade", "cic ccbm"],
    "Networking": ["networking", "meetup", "industry event", "professional network", "collaboration", "professional events", "expand connections", "knowledge and experience exchange", "build relationships", "stakeholder", "public consultation", "business meetings", "cross-department", "event", "supplier meetings", "cross-industry exchange", "professional organizations", "senior management discussions"],
    "Research": ["research", "study", "analysis", "investigation", "data analysis", "mini research project", "product features", "business model", "new technologies", "market research", "system assessment", "code requirements", "literature review"],
    "Safety / Compliance": ["safety", "emergency response", "business continuity", "statutory requirement", "safety forums", "safety workshops", "risk management", "regulatory compliance"],
    "Site Visit": ["visit", "data center", "site visit", "facility tour", "travel technology", "site inspection"],
    "Team Building": ["team building", "teamwork", "collaboration", "interpersonal relationship", "people meetings", "cross-department", "collaborative decision-making", "team efficiency", "team discussions"],
    "General Development": ["to be update", "general development", "showcase", "general practices", "continuous engagement", "any suitable involvement", "explore industrial options"],
    "On-the-Job Learning": ["on-the-job", "on job", "practical experience", "learn from bosses", "practical learning", "duty attachment", "role rotation"],
    "Operational Duties": ["routine duty", "daily operations", "monitor progress", "site supervision", "operational requirements", "handover process", "contractor coordination", "works monitoring", "document preparation"],
    "Leadership / Management": ["lead", "manage", "supervise", "team leadership", "strategic initiatives", "business decision", "set goals", "manage capex", "team coordination"],
    "Event Planning": ["arrange events", "organise activities", "event planning", "promotion activities", "launch event", "marketing activities"]
}

def classify_activity(activity):
    """Return every category whose keywords occur in *activity*.

    The text is lower-cased, stripped of control characters and of leading or
    trailing dashes/spaces, then checked against each keyword list in
    ``activity_categories`` using plain substring matching.

    Args:
        activity: Activity description; converted with ``str()``.

    Returns:
        A non-empty list of category labels, in the order the categories are
        declared in ``activity_categories``. ``["Other"]`` is returned for the
        "n/a" placeholder and for text that matches no keyword.
    """
    # Drop control characters before trimming so that dashes/spaces sitting
    # next to a leading or trailing control character are trimmed too.
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', str(activity).lower()).strip("- ")

    if text == "n/a":
        return ["Other"]

    # Collect matches in table order rather than through a set: iterating a
    # set of strings can yield a different order on every run, which made the
    # order of the generated output rows non-reproducible.
    matching_categories = [
        category
        for category, keywords in activity_categories.items()
        if any(keyword in text for keyword in keywords)
    ]

    return matching_categories or ["Other"]

# ────────────────────────────────────────────────
# Create one row per category (exploding multi-category records)
#
# Done with DataFrame.explode instead of copying one Series per output row:
#   * column dtypes are preserved (row-wise Series copies turn every column
#     into object dtype),
#   * an empty input still yields a frame that has all columns, including
#     "Activity Category", so the statistics that follow do not fail,
#   * classify_activity() always returns a non-empty list, so no extra
#     "Other" fallback is needed here.
explode_key_columns = ["ID", "Activity", "Development Objective", "Unique_Record_ID"]

# One representative (first) row per business key + surrogate key, ordered by
# that key - the same rows and order a sorted group-by over these columns yields.
keyed_records = (
    df.drop_duplicates(subset=explode_key_columns)
    .sort_values(by=explode_key_columns, kind="stable")
)

# Distinct key tuples seen in the input (used for statistics / debug output)
unique_inputs = set(
    keyed_records[explode_key_columns].itertuples(index=False, name=None)
)

# Attach the list of categories to every record, then expand each list so that
# every (record, category) pair becomes its own row. Index labels are kept,
# so a record with several categories repeats its label.
new_df = keyed_records.assign(
    **{"Activity Category": keyed_records["Activity"].map(classify_activity)}
).explode("Activity Category")

# ────────────────────────────────────────────────
# Summarise the exploded result
input_key_total = len(unique_inputs)
output_row_total = len(new_df)
output_record_total = new_df["Unique_Record_ID"].nunique()

print(f"Unique ID-Activity-Development Objective-Unique_Record_ID pairs in input: {input_key_total}")
print(f"Total output records (with multi-category duplicates): {output_row_total}")
print(f"Unique Unique_Record_IDs in output: {output_record_total}")

# Rows that matched no keyword (or were the N/A placeholder)
is_other = new_df["Activity Category"] == "Other"
other_activities = new_df.loc[
    is_other,
    ["ID", "Activity", "Development Objective", "Unique_Record_ID"],
]
print(f"Total activities classified as Other: {len(other_activities)}")
if len(other_activities) > 0:
    print("Sample of activities classified as Other:")
    print(other_activities.head(10))

# ────────────────────────────────────────────────
# Save results and debug files
new_df.to_excel("output_file_example.xlsx", index=False)

# unique_inputs is a set, and a set of strings can iterate in a different
# order on every run. Sorting the key tuples keeps the debug CSV identical
# between runs on the same data, so it can be diffed.
unique_pairs = pd.DataFrame(
    sorted(unique_inputs),
    columns=["ID", "Activity", "Development Objective", "Unique_Record_ID"],
)
unique_pairs.to_csv("unique_pairs_debug.csv", index=False)

# Number of distinct records per category (a record counts once per category
# it was assigned to)
category_counts = (
    new_df.groupby("Activity Category")["Unique_Record_ID"]
    .nunique()
    .reset_index(name="Unique_Record_Count")
)
category_counts.to_csv("category_counts_debug.csv", index=False)

print("Output saved to 'output_file_example.xlsx'")
print("Unique pairs saved to 'unique_pairs_debug.csv'")
print("Category counts saved to 'category_counts_debug.csv'")

# ────────────────────────────────────────────────
# Print the first 20 rows of the categorized result, key columns only
preview_columns = [
    "ID",
    "Activity",
    "Development Objective",
    "Activity Category",
    "Unique_Record_ID",
]
print("\nSample of categorized DataFrame:")
print(new_df[preview_columns].head(20))