# **Random Forest Classifier Model - Fake Job Postings detector**
This model is trained to detect the patterns and keywords that are used in job postings to determine whether its fake or real.

### **step 1: loading dataset and libraries** 

In [2]:
# loading all the necessary libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re

In [4]:
# Load the dataset
file_path = "C:\\Users\\dm590\\Desktop\\fake_job_postings.csv"
dat1 = pd.read_csv(file_path)

### **Step 2:Cleaning and preprocessing of the data**

In [5]:
# data cleaning and preprocessing

dat1.fillna('Unknown', inplace = True)
dat1['text'] = (dat1['title']+' '+dat1['location']+' '+dat1['company_profile']+' '+dat1['description']+' '+dat1['requirements']+' '+dat1['benefits']).str.lower()
dat1 = dat1[['text', 'fraudulent']]

### **Step 3: Defining the keywords and feature engineering.**
     
Both job related and spam related words are defined in this step.

In [6]:
# Define a list of professional job-related words
professional_words = ['Job', 'Hiring', 'Position', 'Role', 'Opportunity', 'Company', 'Industry', 'Experience', 'Skills', 'Requirements', 'Responsibilities', 'Qualifications', 'Professional', 'Career', 'Employment', 'Salary', 'Benefits', 'Compensation', 'Perks', 'Work', 'Full-time', 'Part-time', 'Contract', 'Internship', 'Remote', 'On-site', 'Hybrid', 'Office', 'Location', 'Schedule', 'Flexible', 'Hours', 'Team', 'Collaboration', 'Leadership', 'Manager', 'Director', 'Executive', 'Analyst', 'Engineer', 'Developer', 'Designer', 'Consultant', 'Specialist', 'Coordinator', 'Assistant', 'Entry-level', 'Mid-level', 'Senior-level', 'Training', 'Learning', 'Development', 'Growth', 'Promotion', 'Career Path', 'Certification', 'Degree', 'Bachelor’s', 'Master’s', 'PhD', 'Diploma', 'Associate', 'Communication', 'Problem-solving', 'Initiative', 'Creativity', 'Analytical', 'Strategy', 'Planning', 'Execution', 'Project Management', 'Deadline', 'Deliverables', 'Client', 'Customer', 'Stakeholder', 'Vendor', 'Networking', 'Negotiation', 'Reporting', 'Data', 'Analysis', 'Research', 'Metrics', 'KPIs', 'Forecasting', 'Budget', 'Finance', 'Accounting', 'Marketing', 'Sales', 'Operations', 'Supply Chain', 'Human Resources', 'IT', 'Software', 'Security', 'Compliance', 'Regulations', 'Ethics']

# Define a list of suspicious words often found in fake job postings
suspicious_words = ['work from home', 'quick money', 'startup fee', 'cash payment', 'easy money', 'no experience needed', 'immediate hire', 'click here', 'urgent requirement']


# Function to count occurrences of professional and suspicious words
def count_keywords(text, keywords):
    text = text.lower() # changing all the string in text to lower
    return sum(text.count(word) for word in keywords)# count of words


### **Step 4: creating features so that it helps to train and develop more accuracy.**

some of them are : text length, contact information, salary information, etc.

In [7]:
# Create features
dat1['text_length'] = dat1['text'].apply(len)  # Length of job posting
dat1['professional_word_count'] = dat1['text'].apply(lambda x: count_keywords(x, professional_words))#professional word count
dat1['suspicious_word_count'] = dat1['text'].apply(lambda x: count_keywords(x, suspicious_words))#suspicious word count
dat1['has_salary_info'] = dat1['text'].apply(lambda x: 1 if 'salary' in x or 'compensation' in x else 0)# salary information
dat1['has_contact_info'] = dat1['text'].apply(lambda x: 1 if re.search(r'\b\w+@\w+\.\w+\b', x) or re.search(r'\d{10}', x) else 0)


In [8]:


#this extracts the word count, unique word count and sentence count from dat1['text']
def job_detail_level(text):
    words = re.findall(r'\b\w+\b', text)  # Extract words using regex
    unique_words = set(words)
    sentences = re.split(r'[.!?]', text)  # Split sentences by punctuation
    sentences = [s for s in sentences if s.strip()]  # Remove empty sentences
    return len(words), len(unique_words), len(sentences), len(unique_words) / len(words)

# Apply function
dat1[['word_count', 'unique_word_count', 'sentence_count', 'unique_word_ratio']] = dat1['text'].apply(
    lambda x: pd.Series(job_detail_level(x))
)


### **Step 5: TF - IDF, combining all features and converting all columns to string**
           
 TF - IDF: it helps the machine to process the data more efficiently.

In [9]:
# Convert text to numerical features with TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2), vocabulary=professional_words)
X_text = vectorizer.fit_transform(dat1['text'])



In [10]:
# Combine all features
X = pd.concat([
    pd.DataFrame(X_text.toarray()),
    dat1[['text_length', 'professional_word_count', 'suspicious_word_count', 'has_salary_info', 'has_contact_info']].reset_index(drop=True)
], axis=1)

y = dat1['fraudulent']

In [11]:
# Ensure all column names are strings
X.columns = X.columns.astype(str)

### **Step 6: training data using** :
 1. Scikit's : Model Selection ( Train test split ) and Metrics (Accuracy score and Classification Report)
 2. Imbalanced - learn (SMOTE)
 3. Random Forest Classifier
 

In [12]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2, random_state =42 )

# Handle class imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)



### **Using Random Forest classifier to increase accuracy.**


In [13]:
# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

### **Step 7: Predictions**

  Using Random Forest to predict the result.

In [14]:
# Predictions
rf_pred = rf_model.predict(X_test)

### **Step 8: Evaluating the Models and checking the accuracy score.**

In [15]:
# Evaluate Models
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Report:\n", classification_report(y_test, rf_pred))


Random Forest Accuracy: 0.7175615212527964
Random Forest Report:
               precision    recall  f1-score   support

           0       0.97      0.73      0.83      3395
           1       0.09      0.54      0.16       181

    accuracy                           0.72      3576
   macro avg       0.53      0.63      0.50      3576
weighted avg       0.92      0.72      0.80      3576



### **Step 9: Making predictions**

This part of code takes a user-entered job posting description, processes it into features, and then predicts whether it's real or fake using the trained models.



In [16]:
def predict_job_posting(job_text, model):
    job_text = job_text.lower()
    job_vectorized = vectorizer.transform([job_text])
    job_features = pd.DataFrame(job_vectorized.toarray())

    additional_features = pd.DataFrame([[
        len(job_text),
        count_keywords(job_text, professional_words),
        count_keywords(job_text, suspicious_words),
        1 if 'salary' in job_text or 'compensation' in job_text else 0,
        1 if re.search(r'\b\w+@\w+\.\w+\b', job_text) or re.search(r'\d{10}', job_text) else 0
    ]], columns=['text_length', 'professional_word_count', 'suspicious_word_count', 'has_salary_info', 'has_contact_info'])

    # Ensure all column names are strings
    job_features.columns = job_features.columns.astype(str)

    # Combine features
    job_input = pd.concat([job_features, additional_features], axis=1)

    # Predict
    prediction = model.predict(job_input)
    return "Fake Job Posting" if prediction[0] == 1 else "Real Job Posting"

### **Step 10: User input and Real-time Prediction**

In [18]:
# User Input for Prediction
user_input = input("Enter a job posting description: ")
print("Prediction : ", predict_job_posting(user_input, rf_model))



Prediction :  Real Job Posting
