# Dataset Cleaning Notebook - V4 Refined

This notebook processes `Hackathon Challenge #1 Datasets Cleaned.csv` to produce a high-quality `Hackathon_Datasets_Refined_v4.csv`.

### Improvements in V4:
-   **Standardized Case for Acronyms:** `HR`, `IT`, `HSE` are now properly capitalized.
-   **Job Level Extraction:** Extracts terms like "Senior", "Junior", "Lead" into a separate `Job Level` column.
-   **Global/Regional Tags:** Identifies "Global", "Regional", "Local", or "Site" roles in a `Scope` column.
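-   **Cleaner Titles:** Strips date prefixes and noise words (e.g. "Posting", "JD", "Internal") from the `Unified Job Title`. Level and scope terms are kept in the title for readability and are also copied into their own columns for filtering.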

In [None]:
import pandas as pd
import re
import os

# Configuration
INPUT_FILE = 'Hackathon Challenge #1 Datasets Cleaned.csv'
OUTPUT_FILE = 'Hackathon_Datasets_Refined_v4.csv'

In [None]:
def clean_filename_v4(path):
    """
    Turn a raw file path into a readable, Title Cased job title.

    The base filename (without directory or extension) is taken, a leading
    date stamp is dropped, underscores are treated as spaces, the text is
    Title Cased, known noise words are removed, and whitespace plus any
    dangling separators are tidied up. Acronyms are NOT restored here
    (e.g. "HR" comes back as "Hr").

    Args:
        path: File path or bare filename. Forward and backward slashes are
            both accepted as directory separators.

    Returns:
        The cleaned title, or "" when `path` is not a string (e.g. NaN from
        an empty CSV cell).
    """
    if not isinstance(path, str):
        return ""

    # 1. Base filename. Backslashes are normalised by hand so Windows-style
    #    paths are also split correctly when running on POSIX.
    filename = os.path.basename(path).replace('\\', '/')
    filename = filename.split('/')[-1]
    filename = os.path.splitext(filename)[0]

    # 2. Remove date prefixes (e.g., "202203 ", "2024-05-", "2024-05-15_").
    #    Up to two extra 1-2 digit groups (month / day) joined by '-' or '_'
    #    are consumed; the lookahead stops us from eating the start of a
    #    real word such as "3D".
    filename = re.sub(
        r'^\d{4,8}(?:[-_]\d{1,2}(?![A-Za-z\d])){0,2}[-_\s]*', '', filename
    )

    # 3. Underscores count as word characters for \b, so turn them into
    #    spaces before any word-boundary matching happens.
    filename = filename.replace('_', ' ')

    # 4. Convert to Title Case; acronyms are fixed by the caller afterwards.
    filename = filename.title()

    # 5. Remove common noise words (case insensitive)
    noise_words = [
        r'\bPosting\b', r'\bJob Description\b', r'\bJd\b',
        r'\bExternal\b', r'\bInternal\b', r'\bExpression Of Interest\b',
        r'\bSecondment\b', r'\bActing\b', r'\bTerm\b', r'\bContract\b',
    ]
    for word in noise_words:
        filename = re.sub(word, '', filename, flags=re.IGNORECASE)

    # 6. Collapse whitespace and drop separators left dangling at either end
    #    once a leading/trailing noise word has been removed.
    filename = re.sub(r'\s+', ' ', filename).strip(' -')

    return filename

# Acronyms that Title Casing turns into "Hr", "It", ...; matched as whole
# words so "Hr/It" or "(Hse)" are fixed too, not only space-delimited tokens.
_ACRONYM_PATTERN = re.compile(r'\b(?:Hr|It|Hse|Vp|Ceo|Cfo)\b')

# Internal-posting markers. Letter lookarounds (instead of plain substring
# tests) keep "Contracting" or "Interacting" from matching "acting", while
# still allowing '_' / '-' separated filenames such as "Internal_Posting".
_INTERNAL_PATTERN = re.compile(
    r'(?<![A-Za-z])'
    r'(?:internal|expression[\s_-]+of[\s_-]+interest|secondment|acting)'
    r'(?![A-Za-z])',
    re.IGNORECASE,
)


def _cell_text(value):
    """Return a cell value as text, mapping None/NaN to an empty string."""
    if value is None or pd.isna(value):
        return ""
    return str(value)


def extract_metadata(row):
    """
    Build the unified title and derived metadata for one dataset row.

    The title comes from the cleaned `filename`; when that is shorter than
    three characters the Title Cased `job_title` is used instead. Level and
    scope terms are detected but left in the title.

    Args:
        row: A mapping-like row (e.g. a pandas Series from `df.apply`)
            that may provide 'filename' and 'job_title'.

    Returns:
        pd.Series with four positional values:
        [Unified Job Title, Job Level, Scope, Internal Posting], where
        Job Level / Scope are "" when nothing matched and Internal Posting
        is 'Yes' or 'No'.
    """
    # Missing CSV cells arrive as NaN; str(NaN) would otherwise leak the
    # literal text "nan" into the title and the internal-posting check.
    filename_text = _cell_text(row.get('filename', ''))
    job_title_text = _cell_text(row.get('job_title', ''))

    # Get initial cleaned name, falling back to the job title when the
    # filename yields nothing usable.
    full_title = clean_filename_v4(row.get('filename', ''))
    if len(full_title) < 3:
        full_title = job_title_text.title()

    # --- Extraction Logic ---

    # 1. Job Level: first match in list order wins, so "Senior Manager" is
    #    reported as "Senior". The term stays in the title for readability.
    levels = ['Senior', 'Junior', 'Lead', 'Principal', 'Chief', 'Head Of',
              'Director', 'Manager', 'Vice President', 'VP']
    found_level = next(
        (level for level in levels
         if re.search(r'\b' + re.escape(level) + r'\b', full_title, re.IGNORECASE)),
        "",
    )

    # 2. Scope: same first-match rule.
    scopes = ['Global', 'Regional', 'Local', 'Site']
    found_scope = next(
        (scope for scope in scopes
         if re.search(r'\b' + re.escape(scope) + r'\b', full_title, re.IGNORECASE)),
        "",
    )

    # 3. Internal Status (using original raw data for safety, because the
    #    cleaned title has already had these words stripped out)
    raw_text = filename_text + " " + job_title_text
    is_internal = 'Yes' if _INTERNAL_PATTERN.search(raw_text) else 'No'

    # 4. Final Polish (Acronyms): "Hr" -> "HR", then normalise whitespace.
    polished = _ACRONYM_PATTERN.sub(lambda m: m.group(0).upper(), full_title)
    final_title = " ".join(polished.split())

    return pd.Series([final_title, found_level, found_scope, is_internal])

In [None]:
# --- Read the cleaned input dataset ---
print(f"Loading data from {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE)

# --- Derive title, level, scope and internal flag for every row ---
print("Processing titles and metadata...")
metadata_cols = ['Unified Job Title', 'Job Level', 'Scope', 'Internal Posting']
df[metadata_cols] = df.apply(extract_metadata, axis=1)

# --- Replace missing free-text fields with empty strings ---
text_columns = ['position_summary', 'responsibilities', 'qualifications']
for col in (name for name in text_columns if name in df.columns):
    df[col] = df[col].fillna("")

# --- The raw filename is no longer needed once the title is derived ---
df = df.drop(columns=['filename'], errors='ignore')

# --- Put the derived metadata columns first, everything else after ---
leading = set(metadata_cols)
final_cols = metadata_cols + [name for name in df.columns if name not in leading]
df = df.loc[:, final_cols]

# --- Write the refined dataset ---
print(f"Saving refined dataset to {OUTPUT_FILE}...")
df.to_csv(OUTPUT_FILE, index=False)

# --- Preview (rendered by the notebook as the cell's last expression) ---
print("\nFirst 5 rows of V4 refined data:")
df.head()