# Dataset Cleaning Notebook

This notebook processes the `Hackathon Challenge #1 Datasets Cleaned.csv` dataset.
It performs the following operations:
1.  Cleans filenames to extract a readable "Unified Job Title".
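2.  Flags each posting as "Internal" (`Yes`/`No`) when its filename or job title contains keywords such as "internal", "expression of interest", "secondment", or "acting".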
3.  Removes the original `filename` column.
4.  Exports the result to `Hackathon_Datasets_Refined_v3.csv`.

In [None]:
import pandas as pd
import re
import os

# Configuration
# INPUT_FILE is the CSV read by the load step (pd.read_csv);
# OUTPUT_FILE is the CSV the refined DataFrame is written to (DataFrame.to_csv).
# Both are relative paths, so they resolve against the current working directory.
INPUT_FILE = 'Hackathon Challenge #1 Datasets Cleaned.csv'
OUTPUT_FILE = 'Hackathon_Datasets_Refined_v3.csv'

In [None]:
# Leading date stamp: "YYYY-MM" / "YYYY_MM_DD" style, or a bare run of 4-8
# digits (e.g. "202203"), followed by any mix of '-', '_' and whitespace.
_DATE_PREFIX_RE = re.compile(
    r'^(?:\d{4}[-_]\d{1,2}(?:[-_]\d{1,2})?|\d{4,8})[-_\s]*'
)

# Administrative words that are not part of the job title itself.
# Compiled once at definition time instead of on every call.
_NOISE_WORDS_RE = re.compile(
    r'\b(?:posting|job\s+description|jd|external|internal|'
    r'expression\s+of\s+interest|secondment|acting|term|contract)\b',
    flags=re.IGNORECASE,
)

# Brackets left empty once a noise word inside them has been removed.
_EMPTY_BRACKETS_RE = re.compile(r'\(\s*\)|\[\s*\]')

# Separator characters that may be left dangling at either end of the title.
_EDGE_SEPARATORS = ' -_,;:'


def clean_filename(path):
    """
    Extracts a clean job title from a file path.

    Steps: take the last path component (Windows or Unix separators), drop
    the file extension, strip a leading date stamp, turn underscores into
    spaces, remove administrative noise words, and tidy up the leftovers
    (empty brackets, repeated whitespace, dangling separators).

    Args:
        path: File path or bare filename. Anything that is not a ``str``
            (e.g. ``None`` or a NaN from pandas) is treated as missing.

    Returns:
        The cleaned title, or ``""`` when the input is missing or nothing
        remains after cleaning.
    """
    if not isinstance(path, str):
        return ""

    # 1. Get base filename. Normalising backslashes first makes Windows
    #    paths work even when this runs on a POSIX system.
    name = os.path.basename(path.replace('\\', '/'))

    # 2. Remove extension. A "suffix" that contains whitespace is not a real
    #    extension (e.g. "Sr. Analyst" must not be truncated to "Sr").
    stem, ext = os.path.splitext(name)
    if not re.search(r'\s', ext):
        name = stem

    # 3. Remove date prefixes (e.g., "202203 ", "2024-05-", "2024_05_17_").
    name = _DATE_PREFIX_RE.sub('', name)

    # 4. Underscores are word separators in filenames. Convert them before
    #    the noise-word pass, because '_' counts as a word character and
    #    would otherwise stop \b from matching around e.g. "_JD".
    name = name.replace('_', ' ')

    # 5. Remove common noise words (case insensitive).
    name = _NOISE_WORDS_RE.sub('', name)

    # 6. Tidy up what the removals left behind.
    name = _EMPTY_BRACKETS_RE.sub('', name)
    name = re.sub(r'\s+', ' ', name)

    return name.strip(_EDGE_SEPARATORS)

# Keywords that mark a posting as internal. The look-arounds require that the
# keyword is not embedded in a longer word (so "Contracting" does not count as
# "acting"), while still accepting digits, '_' or '-' as neighbours, which are
# common in filenames such as "202203_Acting_Manager.pdf".
_INTERNAL_KEYWORD_RE = re.compile(
    r'(?<![a-z])'
    r'(?:internal|expression[\s_-]+of[\s_-]+interest|secondments?|acting)'
    r'(?![a-z])',
    flags=re.IGNORECASE,
)


def determine_internal(row):
    """
    Determines if a role is an internal posting based on keywords
    in the original filename or job title.

    Args:
        row: Mapping-like record (a pandas row or a dict) that supports
            ``.get``; the 'filename' and 'job_title' entries are inspected
            and either may be absent.

    Returns:
        'Yes' if any internal-posting keyword appears as a whole word in
        either field (case-insensitive), otherwise 'No'.
    """
    # Missing values are stringified ("nan"/"None"), which match no keyword.
    text_to_search = f"{row.get('filename', '')} {row.get('job_title', '')}"
    return 'Yes' if _INTERNAL_KEYWORD_RE.search(text_to_search) else 'No'

# str.title() upper-cases the letter after an apostrophe ("Children'S").
# This matches that possessive "'S" at the end of a word so it can be undone.
_POSSESSIVE_S_RE = re.compile(r"(?<=[^\W\d_])(['\u2019])S\b")


def _title_case(text):
    """Title-case *text* like str.title(), but keep possessive "'s" lowercase."""
    return _POSSESSIVE_S_RE.sub(r'\1s', text.title())


def create_unified_title(row):
    """
    Creates the final 'Unified Job Title'.

    Prioritizes the cleaned filename. Falls back to the original job_title
    if the cleaned filename is too short (3 characters or fewer), and to a
    fixed placeholder if that is too short as well.

    Args:
        row: Mapping-like record (a pandas row or a dict) that supports
            ``.get``; the 'filename' and 'job_title' entries are used.

    Returns:
        The title-cased job title, or "Unknown Position" when neither field
        yields more than 3 characters.
    """
    cleaned_name = clean_filename(row.get('filename', ''))

    # Only real strings count as a title. Stringifying a missing value would
    # produce text like "None", which is long enough to pass the length check
    # below and end up as the job title.
    raw_title = row.get('job_title', '')
    original_title = raw_title.strip() if isinstance(raw_title, str) else ""

    if len(cleaned_name) > 3:
        return _title_case(cleaned_name)
    if len(original_title) > 3:
        return _title_case(original_title)
    return "Unknown Position"

In [None]:
# --- Load the raw dataset ---
print(f"Loading data from {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE)

# --- Derive the two new columns, one row at a time ---
# Each entry: (progress message, new column name, row-level function).
# They run in this order, so the second function sees the first new column.
derived_columns = (
    ("Generating Unified Job Titles...", 'Unified Job Title', create_unified_title),
    ("Determining Internal Posting status...", 'Internal Posting', determine_internal),
)
for progress_message, column_name, row_function in derived_columns:
    print(progress_message)
    df[column_name] = df.apply(row_function, axis=1)

# --- Replace missing values with "" in the free-text columns that exist ---
text_columns = ['position_summary', 'responsibilities', 'qualifications']
for col in [name for name in text_columns if name in df.columns]:
    df[col] = df[col].fillna("")

# --- The raw filename is no longer needed once the columns above exist ---
if 'filename' in df.columns:
    print("Dropping original filename column...")
    df = df.drop('filename', axis=1)

# --- Put the new columns first, keeping the rest in their existing order ---
leading_columns = ['Unified Job Title', 'Internal Posting']
cols = leading_columns + [c for c in df.columns if c not in leading_columns]
df = df[cols]

# --- Write the refined dataset without the index column ---
print(f"Saving refined dataset to {OUTPUT_FILE}...")
df.to_csv(OUTPUT_FILE, index=False)

# --- Show a quick preview (last expression, so the notebook renders it) ---
print("\nFirst 5 rows of refined data:")
df.head(5)