In [None]:
from google.colab import drive

import pandas as pd
import numpy as np
import re

from matplotlib import pyplot as plt
import seaborn as sns

# Project folder on Google Drive; change this to wherever the data files live.
FILE_PATH = "/content/drive/MyDrive/BT4012 Group 16!!/"

# Attach Google Drive to the Colab runtime at /content/drive.
drive.mount("/content/drive")

# Remove pandas' display limits so rows, columns and long sequences print in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_seq_items", None)

In [None]:
import pandas as pd
import numpy as np

def preprocess_job_df(
    df,
    cols_to_drop=None,
    cols_to_merge=None,
    education_col="required_education",
    new_text_col="text"
):
    """
    Prepare a job-postings dataframe for text modelling.

    Steps, in order:
    1. Drop the columns named in ``cols_to_drop`` (absent names are ignored).
    2. In ``education_col``, replace the placeholder "unspecified" (any casing,
       surrounding whitespace ignored) with NaN. Other values are untouched.
    3. Join the columns named in ``cols_to_merge`` into a single
       space-separated string column ``new_text_col``; NaN contributes an
       empty string. The merged source columns are then dropped.

    Parameters:
        df (pd.DataFrame): Input dataframe. It is copied, never modified.
        cols_to_drop (list[str] | None): Columns to remove before anything else.
        cols_to_merge (list[str] | None): Columns to concatenate, in this order.
            Names that are not in the dataframe are skipped instead of raising,
            matching the ``errors="ignore"`` behaviour of the drops.
        education_col (str): Column whose "unspecified" placeholder becomes NaN.
        new_text_col (str): Name of the merged text column.

    Returns:
        pd.DataFrame: A new dataframe with the steps above applied.
    """

    df = df.copy()

    # 1. Drop unwanted columns
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop, errors='ignore')

    # 2. Normalise the "unspecified" placeholder to a real missing value
    if education_col in df.columns:
        df[education_col] = df[education_col].apply(
            lambda x: np.nan if isinstance(x, str) and x.strip().lower() == "unspecified" else x
        )

    # 3. Concatenate text columns
    if cols_to_merge:
        # Only merge columns that actually exist (some may have been dropped
        # above or be missing from this particular file).
        present = [col for col in cols_to_merge if col in df.columns]

        if present:
            # NaN -> "" so missing fields add nothing but a separator
            merged = df[present].fillna("").astype(str)
            df[new_text_col] = merged.agg(" ".join, axis=1)

            # 4. Drop the merged source columns, but never the column we just
            #    wrote (it may share its name with one of the sources).
            df = df.drop(columns=[col for col in present if col != new_text_col])
        else:
            # Nothing to merge: still provide the text column callers expect.
            df[new_text_col] = ""

    return df


In [None]:
# Columns handed to preprocess_job_df to be removed outright.
cols_to_drop = ["job_id", "salary_range", "telecommuting", "has_company_logo", "has_questions"]

# Columns handed to preprocess_job_df to be joined, in this order, into one text column.
cols_to_merge = [
    "title", "location", "department",
    "company_profile", "description", "requirements", "benefits",
    "employment_type", "required_experience", "required_education",
    "industry", "function",
]



In [None]:
import re
import numpy as np
import pandas as pd

def clean(df, text_col="text"):
    """
    Cleans a dataframe by:
    - stripping whitespace for all string-like (object / pandas string) columns
    - converting empty or whitespace-only strings to NaN
    - removing URLs (anything starting with "http" up to the next whitespace)
      from the specified text column

    The text column is always returned as strings: a missing value there
    becomes "" rather than the literal text "nan", so the value can be passed
    to a tokenizer without injecting a fake word.

    Parameters:
        df (pd.DataFrame): Input dataframe (not modified)
        text_col (str): Column to apply URL cleaning to; ignored if absent

    Returns:
        pd.DataFrame: Cleaned dataframe
    """

    def _strip_if_text(col):
        # Object columns (the classic string container) and pandas' dedicated
        # string dtype both expose the .str accessor used here.
        if col.dtype == "object" or isinstance(col.dtype, pd.StringDtype):
            return col.str.strip()
        return col

    # 1. Strip leading/trailing spaces from all string-like columns
    df = df.apply(_strip_if_text)

    # 2. Convert empty strings to NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    # 3. Remove URLs from the main text column
    if text_col in df.columns:
        # Fill NaN *before* casting: astype(str) alone would turn NaN into "nan".
        as_text = df[text_col].fillna("").astype(str)
        df[text_col] = as_text.map(lambda x: re.sub(r"http\S+", "", x))

    return df


# Base BERT

In [None]:
# BASE BERT 5-FOLDS AND FINE TUNING

import re, time, gc
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score


# BASE BERT CLASSIFIER MODEL
class TextDetector(nn.Module):
    """
    Binary text classifier: a pretrained transformer encoder followed by
    dropout and a single-logit linear head.

    Parameters:
        model_name (str): Checkpoint name passed to ``AutoModel.from_pretrained``.
        dropout (float): Dropout probability applied to the pooled sentence
            vector before the linear head.
    """

    def __init__(self, model_name="bert-base-uncased", dropout=0.2):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_size = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """
        Return raw (pre-sigmoid) logits, one per input row, with a trailing
        dimension of size 1.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        # BERT exposes a pooled [CLS] vector as outputs.pooler_output. Encoders
        # loaded through AutoModel that have no pooler either lack the
        # attribute or set it to None; fall back to the hidden state of the
        # first token so the classifier still works with them.
        pooled = getattr(outputs, "pooler_output", None)
        if pooled is None:
            pooled = outputs.last_hidden_state[:, 0]
        pooled = self.dropout(pooled)

        logits = self.classifier(pooled)
        return logits

    def predict(self, tokenizer, texts, device='cuda', batch_size=4):
        """
        Score texts and return the sigmoid probability for each one.

        The model is switched to eval mode and moved to ``device`` (CPU is used
        when CUDA is unavailable); it is left in that state afterwards.

        Parameters:
            tokenizer: Callable tokenizer; called with a list of strings and
                ``return_tensors='pt'``, and must return an object that
                supports ``.to(device)`` and key access to ``input_ids`` and
                ``attention_mask``.
            texts: Iterable of strings to score.
            device (str | torch.device): Preferred device.
            batch_size (int): Number of texts tokenized and scored per step.

        Returns:
            np.ndarray: 1-D array of probabilities, in the order of ``texts``.
        """
        self.eval()
        device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.to(device)

        # Materialise once so slicing works for any iterable (tuple, Series, ...)
        texts = list(texts)

        all_probs = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(
                    batch, return_tensors='pt',
                    truncation=True, padding=True, max_length=512
                ).to(device)

                # Pass only what forward() accepts; extra tokenizer outputs
                # such as token_type_ids are deliberately not forwarded.
                logits = self(inputs['input_ids'], inputs['attention_mask'])
                probs = torch.sigmoid(logits).cpu().numpy().flatten()
                all_probs.extend(probs)

                del inputs, logits, probs
                # Release cached GPU blocks between batches; pointless on CPU.
                if device.type == 'cuda':
                    torch.cuda.empty_cache()

        return np.array(all_probs)

# TRAINING FUNCTION
def train_detector(model, tokenizer, train_texts, train_labels,
                   val_texts, val_labels,
                   lr=1e-5, epochs=5, batch_size=16, patience=2):
    """
    Fine-tune ``model`` on a binary task with early stopping on validation PR-AUC.

    Training uses AdamW with linear warmup/decay, a class-weighted
    BCE-with-logits loss, gradient-norm clipping at 1.0 and, on CUDA only,
    mixed precision. After every epoch the validation metrics are printed and,
    when PR-AUC improves, the weights are written to ``best_model.pt`` in the
    current working directory.

    Parameters:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning one logit per row.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        train_texts, val_texts: Lists of strings.
        train_labels, val_labels: Sequences of 0/1 labels aligned with the texts.
        lr (float): Peak learning rate.
        epochs (int): Maximum number of epochs.
        batch_size (int): Batch size for both loaders.
        patience (int): Number of consecutive non-improving epochs tolerated
            before stopping.

    Returns:
        The same ``model`` object, carrying the weights of its best epoch
        whenever at least one checkpoint was written during this call.

    Raises:
        ValueError: If ``train_labels`` contains no positive (1) examples, so
            the positive-class weight cannot be computed.
    """

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Mixed precision only makes sense on the GPU; on CPU the scaler and
    # autocast context are created disabled and behave as plain fp32.
    use_amp = device.type == 'cuda'

    def tokenize_batch(texts, labels):
        enc = tokenizer(texts, truncation=True, padding=True,
                        max_length=512, return_tensors='pt')
        return enc['input_ids'], enc['attention_mask'], torch.tensor(labels, dtype=torch.float)

    train_ids, train_mask, train_y = tokenize_batch(train_texts, train_labels)
    val_ids, val_mask, val_y = tokenize_batch(val_texts, val_labels)

    train_loader = DataLoader(TensorDataset(train_ids, train_mask, train_y),
                              batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(val_ids, val_mask, val_y),
                            batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    # class imbalance handling: weight positives by the negative/positive ratio
    label_arr = np.asarray(train_labels)
    n_pos = int((label_arr == 1).sum())
    n_neg = int((label_arr == 0).sum())
    if n_pos == 0:
        raise ValueError(
            "train_labels contains no positive (1) examples; cannot compute pos_weight"
        )
    pos_weight = torch.tensor([n_neg / n_pos], device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # scheduler
    num_training_steps = len(train_loader) * epochs
    warmup_steps = int(0.1 * num_training_steps)

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
    )

    best_auc = 0
    epochs_no_improve = 0
    # Tracks whether *this call* wrote best_model.pt, so a leftover file from
    # an earlier run/fold is never loaded by mistake.
    checkpoint_saved = False

    for epoch in range(epochs):
        model.train()

        for input_ids, mask, labels in train_loader:
            input_ids, mask, labels = input_ids.to(device), mask.to(device), labels.to(device)
            optimizer.zero_grad()

            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(input_ids, mask).squeeze(-1)
                loss = criterion(logits, labels)

            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale here; unscale
            # them first so the 1.0 threshold applies to their true norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        # validation
        model.eval()
        preds, probs, gold = [], [], []

        with torch.no_grad():
            for input_ids, mask, labels in val_loader:
                input_ids, mask = input_ids.to(device), mask.to(device)
                logits = model(input_ids, mask).squeeze(-1)
                p = torch.sigmoid(logits)

                preds.extend((p > 0.5).int().cpu().numpy())
                probs.extend(p.cpu().numpy())
                gold.extend(labels.cpu().numpy())

        acc = accuracy_score(gold, preds)
        prec = precision_score(gold, preds)
        rec = recall_score(gold, preds)
        f1 = f1_score(gold, preds)
        roc_auc = roc_auc_score(gold, probs)
        pr_auc = average_precision_score(gold, probs)

        print(
            f"Epoch {epoch+1}: "
            f"Acc={acc:.4f} Prec={prec:.4f} Rec={rec:.4f} "
            f"F1={f1:.4f} ROC_AUC={roc_auc:.4f} PR_AUC={pr_auc:.4f}"
        )

        if pr_auc > best_auc:
            best_auc = pr_auc
            epochs_no_improve = 0
            torch.save(model.state_dict(), "best_model.pt")
            checkpoint_saved = True
        else:
            epochs_no_improve += 1
            if epoch > 0 and epochs_no_improve >= patience:
                print(f"Early stopping. Best PR_AUC={best_auc:.4f}")
                break

    # Restore the best epoch whether training stopped early or ran to the end;
    # otherwise a run that finishes all epochs would return last-epoch weights
    # even if an earlier epoch scored higher.
    if checkpoint_saved:
        model.load_state_dict(torch.load("best_model.pt", map_location=device))

    print(f"Training complete. Best PR_AUC={best_auc:.4f}")
    return model

# K-FOLD TRAINING
def run_kfold_training(train_df, tokenizer, model_name="bert-base-uncased",
                       lr=1e-5, epochs=5, batch_size=16, patience=2, n_splits=5,
                       seed=42):
    """
    Train one TextDetector per stratified fold and report per-fold PR-AUC.

    For each fold a fresh model is fine-tuned with ``train_detector``, scored
    on the held-out part with average precision, saved to
    ``bert_fold{k}.pt`` in the current working directory and collected.

    Parameters:
        train_df (pd.DataFrame): Must contain a ``text`` column (strings) and a
            ``fraudulent`` column (0/1 labels).
        tokenizer: Tokenizer matching ``model_name``.
        model_name (str): Checkpoint name for the encoder.
        lr, epochs, batch_size, patience: Forwarded to ``train_detector``.
        n_splits (int): Number of stratified folds.
        seed (int | None): Seed for the fold split and, offset by the fold
            index, for torch's global RNG before each model is built. Pass
            ``None`` to leave both unseeded.

    Returns:
        tuple: ``(models, mean_score)`` where ``models`` is the list of trained
        fold models and ``mean_score`` the mean of the fold PR-AUC values.
        The models are returned on the CPU; ``TextDetector.predict`` moves a
        model to its target device itself.
    """

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    fold_aucs = []
    models = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df['text'], train_df['fraudulent'])):
        print(f"\nFold {fold+1}/{n_splits}")

        tr_df = train_df.iloc[train_idx]
        val_df = train_df.iloc[val_idx]

        # Make head initialisation, dropout and batch shuffling repeatable,
        # with a different stream for every fold.
        if seed is not None:
            torch.manual_seed(seed + fold)

        model = TextDetector(model_name=model_name)

        trained_model = train_detector(
            model, tokenizer,
            tr_df['text'].tolist(), tr_df['fraudulent'].tolist(),
            val_df['text'].tolist(), val_df['fraudulent'].tolist(),
            lr=lr, epochs=epochs, batch_size=batch_size, patience=patience
        )

        val_probs = trained_model.predict(tokenizer, val_df['text'].tolist(), device=device)

        fold_auc = average_precision_score(val_df["fraudulent"], val_probs)
        fold_aucs.append(fold_auc)

        print(f"Fold {fold+1} PR_AUC={fold_auc:.4f}")

        torch.save(trained_model.state_dict(), f"bert_fold{fold+1}.pt")

        # Park the finished model on the CPU: the list keeps it alive, so
        # leaving it on the GPU would shrink the memory available to every
        # later fold and make the empty_cache() below ineffective.
        trained_model.to('cpu')
        models.append(trained_model)

        del tr_df, val_df, model, trained_model
        gc.collect()
        torch.cuda.empty_cache()

    print("\nCV Results")
    print("Fold AUCs:", [round(x, 4) for x in fold_aucs])
    print(f"Mean AUC = {np.mean(fold_aucs):.4f}")

    return models, np.mean(fold_aucs)


# Main


In [None]:
def main():
    """
    Run the whole pipeline: load and preprocess the train/test CSVs, train the
    5-fold BERT ensemble, average the fold predictions on the test set and
    write a submission CSV.

    All files are read from and written to the folder given by the notebook's
    ``FILE_PATH`` setting, so changing that one constant relocates the run.
    """
    import os  # local import: only this function builds filesystem paths

    # keep_default_na=True: pandas' standard null markers are read as NaN
    train_df = pd.read_csv(os.path.join(FILE_PATH, "train_df.csv"), keep_default_na=True)
    test_df = pd.read_csv(os.path.join(FILE_PATH, "test_df.csv"), keep_default_na=True)

    train_df = preprocess_job_df(train_df, cols_to_drop, cols_to_merge)
    test_df = preprocess_job_df(test_df, cols_to_drop, cols_to_merge)
    train_df = clean(train_df, text_col="text")
    test_df = clean(test_df, text_col="text")

    # The test frame must not carry the target into prediction.
    if 'fraudulent' in test_df.columns:
        test_df = test_df.drop(columns=['fraudulent'])

    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Strip any "org/" prefix so the name is safe to embed in a file name.
    model_shortname = model_name.split("/")[-1]

    start = time.time()
    models, mean_auc = run_kfold_training(
        train_df, tokenizer, model_name=model_name,
        lr=1e-5, epochs=5, batch_size=16, patience=2, n_splits=5
    )
    print(f"\nTraining done in {(time.time()-start)/60:.1f} min")

    test_texts = test_df['text'].tolist()
    all_test_probs = []
    for i, model in enumerate(models):
        print(f"Predicting with fold {i+1} model...")
        probs = model.predict(tokenizer, test_texts)
        all_test_probs.append(probs)

    # Ensemble: plain average of the fold models' probabilities.
    final_probs = np.mean(all_test_probs, axis=0)

    submission = pd.DataFrame({'id': range(len(final_probs)), 'fraudulent': final_probs})
    path = os.path.join(FILE_PATH, f"submission_{model_shortname}_5fold.csv")
    submission.to_csv(path, index=False)
    print(f"\n Submission saved to {path}")


# Entry point: run the full pipeline when this code is executed directly
# (as a script or, presumably, as a notebook cell) rather than imported.
if __name__ == "__main__":
    main()
