In [1]:
!pip install pandas numpy scikit-learn nltk xgboost



In [3]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline 
from xgboost import XGBClassifier

# --- 1. Essential Setup and NLTK Fix ---

# The stop-word corpus is needed by the text-cleaning step further down.
# Look it up first and download it only when the lookup fails.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # The lookup raised LookupError, so fetch the corpus now.
    print("Downloading NLTK stopwords...")
    nltk.download('stopwords')
    print("Download complete.")
else:
    print("NLTK stopwords data is already downloaded.")


## --- 2. Data Loading and Cleaning ---

# Read the raw postings from the current working directory.
# NOTE: Replace 'fake_job_postings.csv' with your actual file name
try:
    df = pd.read_csv('fake_job_postings.csv')
except FileNotFoundError:
    print("ERROR: Please ensure 'fake_job_postings.csv' is in the same directory as this notebook.")
    # Re-raise so execution stops here instead of continuing without data.
    raise

# Rename the label column ('fraudulent' -> 'is_fake') and discard the two
# columns that are not used as features. errors='ignore' makes the drop a
# no-op for any of those columns that is absent.
df = (
    df
    .rename(columns={'fraudulent': 'is_fake'})
    .drop(columns=['salary_range', 'job_id'], errors='ignore')
)

# Separate the label (y) from the feature frame (X).
TARGET = 'is_fake'
y = df[TARGET]
X = df.drop(columns=[TARGET])

print(f"\nOriginal Dataset Shape: {df.shape}")
print(f"Class Distribution (0=Real, 1=Fake):\n{y.value_counts()}")
print("-" * 40)

# English stop words, kept in a set for the membership test in clean_text.
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one raw text field into a space-separated token string.

    Steps, in order:
      1. Missing values (``pd.isna``) become the empty string.
      2. ``<...>`` spans (HTML-style tags) are removed.
      3. Every character that is not an ASCII letter or whitespace is
         deleted (not replaced by a space, so "full-time" -> "fulltime").
      4. The text is lower-cased.
      5. Stop words and one-character tokens are dropped.

    Returns the surviving tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    without_tags = re.sub(r'<.*?>', '', str(text))
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags).lower()
    kept_tokens = [
        token
        for token in letters_only.split()
        if len(token) > 1 and token not in STOPWORDS
    ]
    return ' '.join(kept_tokens)

# Clean every free-text column: values are read from `df` and the cleaned
# result is stored under the same column name on `X`.
text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
for text_col in text_cols:
    X[text_col] = df[text_col].map(clean_text)

# Concatenate the cleaned columns row by row (space separated) into the
# single text feature that the TF-IDF step consumes.
X['combined_text'] = X[text_cols].astype(str).apply(' '.join, axis=1)


## --- 3. Feature Engineering and Preprocessing Pipeline ---

# Column groups consumed by the ColumnTransformer built below.
text_feature = 'combined_text'
binary_features = ['telecommuting', 'has_company_logo', 'has_questions']
# Note: 'location' is often too high-cardinality for OHE but we use it here as per standard approach.
categorical_features = ['location', 'employment_type', 'required_experience', 'required_education', 'function', 'industry']

# Missing categorical values become the explicit level 'Missing'.
X[categorical_features] = X[categorical_features].fillna('Missing')

# Missing binary flags default to 0; the columns are stored as integers.
X[binary_features] = X[binary_features].fillna(0).astype(int)


# Create the Column Transformer (Preprocessor)
# Each step is a (name, transformer, columns) triple.

# 1. TF-IDF over unigrams and bigrams of the combined text; max_features
#    caps the vocabulary at 10,000 terms.
tfidf_step = (
    'text_vec',
    TfidfVectorizer(ngram_range=(1, 2), max_features=10000),
    text_feature,
)

# 2. One-hot encoding of the categorical columns, kept sparse; categories
#    unseen during fit are ignored instead of raising.
onehot_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=True),
    categorical_features,
)

# 3. The 0/1 flags are forwarded unchanged.
passthrough_step = ('bin', 'passthrough', binary_features)

# Columns not named in any step are dropped.
preprocessor = ColumnTransformer(
    transformers=[tfidf_step, onehot_step, passthrough_step],
    remainder='drop',
)


## --- 4. Splitting Data and Handling Imbalance (SMOTE) ---

# Hold out 20% of the rows for testing, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train/Test Split: {X_train.shape[0]} / {X_test.shape[0]}")
print(f"Train Fake Ratio: {y_train.mean():.4f}")
print("-" * 40)

# The sampler is placed inside the model pipeline (after the preprocessor)
# so that it is applied to the transformed training data only.
smote_sampler = SMOTE(random_state=42)

## --- 5. Model Training: Logistic Regression (Baseline) ---

# Baseline: preprocess -> SMOTE -> logistic regression, chained with the
# imblearn pipeline so the sampler can sit between the two other steps.
baseline_classifier = LogisticRegression(C=1.0, solver='liblinear', random_state=42)
logreg_model = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote_sampler),  # oversamples the vectorized, one-hot encoded training data
    ('classifier', baseline_classifier),
])

# Fit on the training partition.
print("Training Logistic Regression Model...")
logreg_model.fit(X_train, y_train)
print("Training complete.")

# --- Evaluation on the held-out partition ---
y_pred_logreg = logreg_model.predict(X_test)

print("\n## üéØ Logistic Regression Model Evaluation (with SMOTE) ##")
print(classification_report(y_test, y_pred_logreg))
print(f"F1 Score (Fake Jobs, class 1): {f1_score(y_test, y_pred_logreg, pos_label=1):.4f}")

# Confusion matrix as a labelled table (rows = actual, columns = predicted).
cm_logreg = confusion_matrix(y_test, y_pred_logreg)
cm_logreg_table = pd.DataFrame(
    cm_logreg,
    index=['Actual Real (0)', 'Actual Fake (1)'],
    columns=['Predicted Real (0)', 'Predicted Fake (1)'],
)
print("\nConfusion Matrix:")
print(cm_logreg_table)
print("-" * 40)


## --- 6. Model Training: XGBoost (Recommended Performer) ---

# No SMOTE here: the imbalance is handled by XGBoost's class weighting.
# ratio = total_negative_samples / total_positive_samples
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# NOTE: `preprocessor` is the same object used by `logreg_model`; fitting
# this pipeline refits it in place (on the same X_train).
xgb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        # `use_label_encoder` is deliberately not passed: XGBoost reports
        # 'Parameters: { "use_label_encoder" } are not used.' when it is.
        eval_metric='logloss',
        random_state=42,
        # Crucial for imbalance: gives more importance to the fake class.
        scale_pos_weight=scale_pos_weight
    ))
])

# Train the model
print("Training XGBoost Model...")
xgb_model.fit(X_train, y_train)
print("Training complete.")

# --- Evaluation on the held-out partition ---
y_pred_xgb = xgb_model.predict(X_test)

print("\n## üöÄ XGBoost Model Evaluation (with Class Weighting) ##")
print(classification_report(y_test, y_pred_xgb))
print(f"F1 Score (Fake Jobs, class 1): {f1_score(y_test, y_pred_xgb, pos_label=1):.4f}")

# Confusion matrix as a labelled table (rows = actual, columns = predicted).
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print("\nConfusion Matrix:")
print(pd.DataFrame(cm_xgb, index=['Actual Real (0)', 'Actual Fake (1)'], columns=['Predicted Real (0)', 'Predicted Fake (1)']))
print("-" * 40)

# Final step: Choose the model with the higher F1 Score for class 1 (Fake Jobs).

Downloading NLTK stopwords...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\daree\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Download complete.

Original Dataset Shape: (17880, 16)
Class Distribution (0=Real, 1=Fake):
is_fake
0    17014
1      866
Name: count, dtype: int64
----------------------------------------
Train/Test Split: 14304 / 3576
Train Fake Ratio: 0.0484
----------------------------------------
Training Logistic Regression Model...
Training complete.

## üéØ Logistic Regression Model Evaluation (with SMOTE) ##
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3403
           1       0.76      0.90      0.82       173

    accuracy                           0.98      3576
   macro avg       0.88      0.94      0.91      3576
weighted avg       0.98      0.98      0.98      3576

F1 Score (Fake Jobs, class 1): 0.8223

Confusion Matrix:
                 Predicted Real (0)  Predicted Fake (1)
Actual Real (0)                3354                  49
Actual Fake (1)                  18                 155
----------------------------------------
Training XGBoost Model...

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.

## üöÄ XGBoost Model Evaluation (with Class Weighting) ##
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3403
           1       0.83      0.82      0.82       173

    accuracy                           0.98      3576
   macro avg       0.91      0.90      0.91      3576
weighted avg       0.98      0.98      0.98      3576

F1 Score (Fake Jobs, class 1): 0.8222

Confusion Matrix:
                 Predicted Real (0)  Predicted Fake (1)
Actual Real (0)                3374                  29
Actual Fake (1)                  32                 141
----------------------------------------


In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from scipy.stats import randint as sp_randint, uniform as sp_uniform
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import pandas as pd

# 
# --- PREREQUISITES: this cell relies on names defined by the training cell above: ---
#   preprocessor, X_train, y_train, X_test, y_test, scale_pos_weight
#
# To run this cell on its own, define those names first, for example:
#   scale_pos_weight = 19.66  # example value for a roughly 5% fake-job rate
#   preprocessor = ...
# -------------------------------------------------------------------------------------------------

# 1. Define the full pipeline structure
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        # `use_label_encoder` is deliberately not passed: XGBoost reports
        # 'Parameters: { "use_label_encoder" } are not used.' when it is.
        eval_metric='logloss',
        random_state=42,
        # Set the imbalance weight here
        scale_pos_weight=scale_pos_weight
    ))
])

# 2. Define the parameter search space for the classifier step
# scipy conventions: uniform(loc, scale) takes a start and a width;
# randint(low, high) takes integer bounds.
classifier_space = {
    'n_estimators': sp_randint(100, 500),
    'max_depth': sp_randint(3, 10),
    'learning_rate': sp_uniform(0.01, 0.3),
    'colsample_bytree': sp_uniform(0.5, 0.5),
    'subsample': sp_uniform(0.6, 0.4),
}

# Prefix each name with '<step>__' so it addresses the pipeline's
# 'classifier' step.
param_dist = {
    f'classifier__{name}': distribution
    for name, distribution in classifier_space.items()
}

# 3. Setup Randomized Search Cross-Validation
# Candidates are ranked by 'f1' so the search prioritises performance on
# the minority class (Fake Jobs).
search_settings = dict(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=50,        # parameter settings sampled (adjust to time/resources)
    scoring='f1',
    cv=3,             # cross-validation folds
    verbose=2,
    random_state=42,
    n_jobs=-1,        # use all available CPU cores
)
random_search = RandomizedSearchCV(**search_settings)

# 4. Run the search
print("Starting Randomized Search for XGBoost Hyperparameters...")
random_search.fit(X_train, y_train)
print("Randomized Search complete.")


# 5. Final Model Evaluation

# Pull the winning pipeline, its parameters and its CV score from the search.
best_xgb_model = random_search.best_estimator_
best_params = random_search.best_params_
best_score = random_search.best_score_

# Score the held-out test set with the winning pipeline.
y_pred_tuned = best_xgb_model.predict(X_test)

banner = "=" * 50
print("\n" + banner)
print("## üèÜ FINAL TUNED XGBOOST MODEL EVALUATION ##")
print(banner)
print(f"Best Cross-Validation F1-Score: {best_score:.4f}")
print("Best Hyperparameters Found:")
# Only classifier parameters are listed, with the step prefix removed.
classifier_params = {
    name: value
    for name, value in best_params.items()
    if name.startswith('classifier__')
}
for k, v in classifier_params.items():
    print(f"  {k.split('__')[1]}: {v}")

print("-" * 50)
print(classification_report(y_test, y_pred_tuned))
print(f"F1 Score (Fake Jobs, class 1): {f1_score(y_test, y_pred_tuned, pos_label=1):.4f}")

# Confusion matrix as a labelled table (rows = actual, columns = predicted).
cm_tuned = confusion_matrix(y_test, y_pred_tuned)
cm_tuned_table = pd.DataFrame(
    cm_tuned,
    index=['Actual Real (0)', 'Actual Fake (1)'],
    columns=['Predicted Real (0)', 'Predicted Fake (1)'],
)
print("\nConfusion Matrix:")
print(cm_tuned_table)
print("-" * 50)

Starting Randomized Search for XGBoost Hyperparameters...
Fitting 3 folds for each of 50 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Randomized Search complete.

## üèÜ FINAL TUNED XGBOOST MODEL EVALUATION ##
Best Cross-Validation F1-Score: 0.7997
Best Hyperparameters Found:
  colsample_bytree: 0.6943386448447411
  learning_rate: 0.09140470953216877
  max_depth: 7
  n_estimators: 379
  subsample: 0.7427013306774357
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3403
           1       0.92      0.84      0.88       173

    accuracy                           0.99      3576
   macro avg       0.96      0.92      0.94      3576
weighted avg       0.99      0.99      0.99      3576

F1 Score (Fake Jobs, class 1): 0.8795

Confusion Matrix:
                 Predicted Real (0)  Predicted Fake (1)
Actual Real (0)                3390                  13
Actual Fake (1)                  27                 146
--------------------------------------------------


In [5]:
import pickle
import time

# Persist the tuned pipeline produced by the RandomizedSearchCV cell.
#
# The lookups of `best_xgb_model`, `y_test` and `y_pred_tuned` happen INSIDE
# the try block so that a missing name actually reaches the NameError
# handler (with the lookups outside it, that handler could never run).
try:
    final_model = best_xgb_model
    # Test-set F1 for the fake class. Despite the historical name this is a
    # float; it is formatted to four decimals in the filename below.
    f1_score_str = f1_score(y_test, y_pred_tuned, pos_label=1)
except NameError:
    print("\n‚ö†Ô∏è ERROR: The variable 'best_xgb_model' or necessary data (y_test, y_pred_tuned) was not found.")
    print("Please ensure you run the Hyperparameter Tuning block completely before attempting to save the model.")
else:
    # Unique filename: F1 score plus the current Unix timestamp in seconds.
    filename = f'xgb_fake_job_detector_f1_{f1_score_str:.4f}_{int(time.time())}.pkl'

    # Save the model to disk. Any failure is reported rather than raised,
    # keeping the original best-effort behaviour of this cell.
    try:
        with open(filename, 'wb') as file:
            pickle.dump(final_model, file)
    except Exception as e:
        print(f"\n‚ùå An unexpected error occurred while saving: {e}")
    else:
        # Reported only after the file has been written and closed.
        print(f"\n‚úÖ Successfully saved the final model to: {filename}")
        print("The model can now be loaded and used for new predictions.")


‚úÖ Successfully saved the final model to: xgb_fake_job_detector_f1_0.8795_1765280759.pkl
The model can now be loaded and used for new predictions.


In [10]:
import pandas as pd
import re
from nltk.corpus import stopwords
import numpy as np
import time

# --- ASSUMPTION: The following variables/objects must be defined in the current kernel: ---
# random_search (the result of the RandomizedSearchCV run)
# ---------------------------------------------------------------------------------------

# --- 1. Define required functions (copied from the training script) ---

# English stop words, kept in a set for the membership test in clean_text.
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one raw text field exactly as the training cell does.

    Missing values (``pd.isna``) become ``""``. Otherwise ``<...>`` spans are
    removed, every character that is not an ASCII letter or whitespace is
    deleted, the text is lower-cased, and stop words plus one-character
    tokens are dropped. The remaining tokens are joined by single spaces.

    This is a copy of the training-time cleaner and must stay in sync with it.
    """
    if pd.isna(text):
        return ""
    without_tags = re.sub(r'<.*?>', '', str(text))
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags).lower()
    kept_tokens = [
        token
        for token in letters_only.split()
        if len(token) > 1 and token not in STOPWORDS
    ]
    return ' '.join(kept_tokens)

# --- 2. Retrieve the Best Model ---

# The model is taken from the in-kernel `random_search` object (it is not
# read back from the pickle file). A missing object is reported and the
# NameError is re-raised so the cell stops here.
try:
    loaded_model = random_search.best_estimator_
except NameError:
    print("\n‚ùå ERROR: The 'random_search' object was not found in the kernel.")
    raise
else:
    print("‚úÖ Best model successfully retrieved from the 'random_search' object.\n")

# --- 3. Create New Job Posting DataFrames ---

# Each sample is a one-row record: every key maps to a single-element list
# so that pd.DataFrame builds a frame with exactly one row.

# Sample 1: Looks REAL
real_sample = dict(
    title=['Senior Data Scientist'],
    location=['London, UK'],
    department=['Analytics'],
    company_profile=['A multinational technology company with over 10,000 employees globally, specializing in AI software development. We offer competitive salaries and excellent benefits.'],
    description=['Lead a team focused on developing next-generation predictive models. Must be proficient in Python, SQL, and cloud platforms.'],
    requirements=['PhD in Computer Science or related field. 5+ years experience in a senior role.'],
    benefits=['Healthcare, 401K match, remote flexibility, annual bonus.'],
    employment_type=['Full-time'],
    required_experience=['Director'],
    required_education=['Doctorate'],
    industry=['Information Technology and Services'],
    function=['Information Technology'],
    telecommuting=[0],
    has_company_logo=[1],
    has_questions=[1],
)

# Sample 2: Looks FAKE (Contains NaNs in department and required_education)
fake_sample = dict(
    title=['WORK FROM HOME - AMAZING CASH'],
    location=['US, Remote'],
    department=[np.nan],
    company_profile=['Be your own boss! Financial independence guaranteed. No experience necessary. Apply now to get rich quick!'],
    description=['Simple data entry job that requires NO upfront fee. Earn $500 per day doing simple tasks from your smartphone or tablet!'],
    requirements=['Must have internet access.'],
    benefits=['Fast cash, zero commute.'],
    employment_type=['Part-time'],
    required_experience=['Entry level'],
    required_education=[np.nan],
    industry=['Accounting'],
    function=['Administrative'],
    telecommuting=[1],
    has_company_logo=[0],
    has_questions=[0],
)

df_test_real, df_test_fake = pd.DataFrame(real_sample), pd.DataFrame(fake_sample)


# --- 4. Process and Predict Function (FIXED) ---

def predict_job_fraud(df_new, model=None):
    """Classify the first posting in ``df_new`` as real or fake.

    The frame is prepared the same way as the training data (missing-value
    fills, text cleaning, ``combined_text`` construction) and then scored.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Must contain the categorical, binary and text columns listed in the
        body and at least one row. It is NOT modified; all preparation is
        done on an internal copy.
    model : object, optional
        Fitted estimator exposing ``predict`` and ``predict_proba``.
        Defaults to the module-level ``loaded_model``.

    Returns
    -------
    tuple
        ``(result, confidence)`` where ``result`` is the string
        ``"FAKE/FRAUDULENT (1)"`` or ``"REAL/LEGITIMATE (0)"`` and
        ``confidence`` is the predicted probability of that label.
        Only the first row is reported.

    Raises
    ------
    KeyError
        If a required column is absent from ``df_new``.
    """
    if model is None:
        model = loaded_model

    # Work on a private copy so the caller's frame is left untouched.
    features = df_new.copy()

    # Same categorical columns and 'Missing' fill value as the training cell.
    # 'department' is not listed because it is not a model feature, so
    # callers are not forced to supply it.
    categorical_features = ['location', 'employment_type', 'required_experience',
                            'required_education', 'function', 'industry']
    for col in categorical_features:
        features[col] = features[col].fillna('Missing')

    # Same binary columns and 0 fill value as the training cell.
    binary_features = ['telecommuting', 'has_company_logo', 'has_questions']
    for col in binary_features:
        features[col] = features[col].fillna(0).astype(int)

    # Text cleaning and combining
    text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
    for col in text_cols:
        features[col] = features[col].apply(clean_text)

    # Create the combined_text feature which the model pipeline expects
    features['combined_text'] = features[text_cols].astype(str).agg(' '.join, axis=1)

    prediction = model.predict(features)
    probability = model.predict_proba(features)

    result = "FAKE/FRAUDULENT (1)" if prediction[0] == 1 else "REAL/LEGITIMATE (0)"
    # The predicted label (0 or 1) doubles as the column index into the
    # probability row.
    confidence = probability[0][prediction[0]]

    return result, confidence


# --- 5. Run Tests and Output Results ---

# Each sample is scored on a copy of its frame, so the originals defined
# above remain available unchanged.

print("--- Testing REAL Sample (Expected 0) ---")
real_input = df_test_real.copy()
result_real, confidence_real = predict_job_fraud(real_input)
print(f"Prediction: {result_real}")
print(f"Confidence: {confidence_real:.4f}\n")

print("--- Testing FAKE Sample (Expected 1) ---")
fake_input = df_test_fake.copy()
result_fake, confidence_fake = predict_job_fraud(fake_input)
print(f"Prediction: {result_fake}")
print(f"Confidence: {confidence_fake:.4f}")

‚úÖ Best model successfully retrieved from the 'random_search' object.

--- Testing REAL Sample (Expected 0) ---
Prediction: REAL/LEGITIMATE (0)
Confidence: 0.9927

--- Testing FAKE Sample (Expected 1) ---
Prediction: FAKE/FRAUDULENT (1)
Confidence: 0.9995
