In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import seaborn as sns
from sklearn.model_selection import cross_val_score

from joblib import Memory
from scipy.sparse import hstack
from scipy.sparse import save_npz,load_npz

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Set up memory caching
memory = Memory('./cachedir', verbose=0)

## Given:
### [Real or Fake] : Fake Job Description Prediction
### This dataset contains 18K job descriptions out of which about 800 are fake. The data consists of both textual information and meta-information about the jobs. The dataset can be used to create classification models which can learn the job descriptions which are fraudulent.

## Goal:

Process the dataset to prepare features for a classification model that predicts whether job descriptions are fraudulent or real.

## Load and explore the data:

In [3]:
@memory.cache
def load_data():
    return pd.read_csv("../data/raw/fake_job_postings.csv", index_col=0)

# Load the DataFrame (will use cache if available)
df = load_data()

In [4]:
# Display the first two rows
df.head(2)

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [5]:
df[df['fraudulent']==1].head(2)

Unnamed: 0_level_0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
99,IC&E Technician,"US, , Stocton, CA",Oil & Energy,95000-115000,...,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...","QualificationsKnowledge, Skills &amp; Abilitie...",BENEFITSWhat is offered:Competitive compensati...,0,1,1,Full-time,Mid-Senior level,High School or equivalent,Oil & Energy,Other,1
145,Forward Cap.,,,,,The group has raised a fund for the purchase o...,,,0,0,0,,,,,,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17880 entries, 1 to 17880
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                17880 non-null  object
 1   location             17534 non-null  object
 2   department           6333 non-null   object
 3   salary_range         2868 non-null   object
 4   company_profile      14572 non-null  object
 5   description          17879 non-null  object
 6   requirements         15184 non-null  object
 7   benefits             10668 non-null  object
 8   telecommuting        17880 non-null  int64 
 9   has_company_logo     17880 non-null  int64 
 10  has_questions        17880 non-null  int64 
 11  employment_type      14409 non-null  object
 12  required_experience  10830 non-null  object
 13  required_education   9775 non-null   object
 14  industry             12977 non-null  object
 15  function             11425 non-null  object
 16  fraudulen

## Handle Missing Values

- drop high missing value columns
- drop rows depending on the importance of the columns and how much data is missing.
- fill missning values with placeholder or mode

### Drop rows/cols

In [7]:
# check for NaN values
df.isna().sum().sort_values(ascending=False)

salary_range           15012
department             11547
required_education      8105
benefits                7212
required_experience     7050
function                6455
industry                4903
employment_type         3471
company_profile         3308
requirements            2696
location                 346
description                1
title                      0
telecommuting              0
has_questions              0
has_company_logo           0
fraudulent                 0
dtype: int64

In [8]:
# check for NaN values - in percentage:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False)

salary_range           83.959732
department             64.580537
required_education     45.329978
benefits               40.335570
required_experience    39.429530
function               36.101790
industry               27.421700
employment_type        19.412752
company_profile        18.501119
requirements           15.078300
location                1.935123
description             0.005593
title                   0.000000
telecommuting           0.000000
has_questions           0.000000
has_company_logo        0.000000
fraudulent              0.000000
dtype: float64

In [9]:
# drop columns with high percentage missing values
df.drop(columns=['salary_range', 'department'], inplace=True)

In [10]:
# drop rows with missing 'description'
df.dropna(subset=['description'], inplace=True)

### Impute missing values

In [11]:
# Get list of columns with missing values
missing_values_columns = df.columns[df.isna().sum() != 0].tolist()
missing_values_columns

['location',
 'company_profile',
 'requirements',
 'benefits',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function']

In [12]:
# check uniqueness values
df[missing_values_columns].nunique().sort_values()

employment_type            5
required_experience        7
required_education        13
function                  37
industry                 131
company_profile         1709
location                3105
benefits                6204
requirements           11967
dtype: int64

In [13]:
df['benefits'].value_counts()

benefits
See job description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       726
Career prospects.                                                                                                                

In [14]:
# Impute categorical and text columns with placeholders
placeholder_map = {
    'location': 'Other',
    'company_profile': 'Missing',
    'requirements': 'Missing',
    'benefits':'See job description', # mode
    'employment_type':'Other',
    'required_experience': 'Not Applicable',
    'required_education':'Unspecified',
    'industry':'Other',
    'function': 'Other'
}

df.fillna(value=placeholder_map, inplace=True)

In [15]:
df.isna().sum().sum()

np.int64(0)

In [16]:
df.shape

(17879, 15)

## Encode Categorical Variables:

Apply appropriate encoding techniques:
- **Binary columns**: Use direct mapping (0/1).
- **Ordinal columns**: Use label encoding.
- **Nominal columns**: Use One-Hot Encoding, target encoding or frequency encoding 


In [17]:
# Define lists for different types of columns
binary_columns = ['has_company_logo', 'has_questions', 'telecommuting', 'fraudulent']
ordinal_columns = ['employment_type', 'required_experience', 'required_education']
nominal_columns = ['function', 'industry', 'location', 'benefits']
text_columns = ['title', 'requirements', 'description', 'company_profile']

In [18]:
df[binary_columns].dtypes

has_company_logo    int64
has_questions       int64
telecommuting       int64
fraudulent          int64
dtype: object

### Encode Ordinal columns using Label Encoding

In [19]:
ordinal_encoder = LabelEncoder()
for column in ordinal_columns:
    df[column] = ordinal_encoder.fit_transform(df[column])


### Encode Nominal columns

In [20]:
df[nominal_columns].nunique()

function      37
industry     132
location    3106
benefits    6204
dtype: int64

In [21]:
def frequency_encode(df, nominal_columns):
    for column in nominal_columns:
        # Calculate frequency of each category
        frequency = df[column].value_counts() / len(df)
        # Map frequencies to the column
        df[column] = df[column].map(frequency)
    return df

df = frequency_encode(df, nominal_columns)

## Select features

In [22]:
# use all columns, except fraudulent
X = df.drop(columns=['fraudulent'])
y = df['fraudulent']

## Text Data Cleaning
   - Remove special characters and unnecessary whitespace from text columns.
   - Convert text to lowercase for uniformity.
   - Remove leading/trainling whitespaces
   - Optionally, perform lemmatization or stemming on text data.

In [23]:
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^a-zA-Z0-9_\s]', '', text)
        text = text.lower()
        text = text.strip()
    return text

In [24]:
text_columns

['title', 'requirements', 'description', 'company_profile']

In [25]:
# Apply the clean_text function to each specified text column
for column in text_columns:
    X[column] = X[column].apply(clean_text)

## Split the Data

In [26]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [27]:
X.info()
text_columns

<class 'pandas.core.frame.DataFrame'>
Index: 17879 entries, 1 to 17880
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                17879 non-null  object 
 1   location             17879 non-null  float64
 2   company_profile      17879 non-null  object 
 3   description          17879 non-null  object 
 4   requirements         17879 non-null  object 
 5   benefits             17879 non-null  float64
 6   telecommuting        17879 non-null  int64  
 7   has_company_logo     17879 non-null  int64  
 8   has_questions        17879 non-null  int64  
 9   employment_type      17879 non-null  int64  
 10  required_experience  17879 non-null  int64  
 11  required_education   17879 non-null  int64  
 12  industry             17879 non-null  float64
 13  function             17879 non-null  float64
dtypes: float64(4), int64(6), object(4)
memory usage: 2.0+ MB


['title', 'requirements', 'description', 'company_profile']

## Converting text data into TF-IDF features

In [28]:
# Define the TF-IDF vectorization function
@memory.cache
def fit_and_transform_tfidf(X_train_raw, X_test_raw, features):
    # Initialize vectorizers for each text feature
    vectorizers = {
        feature: TfidfVectorizer(stop_words='english', min_df=5, max_df=0.8, ngram_range=(1, 2))
        for feature in features
    }

    # List to store TF-IDF matrices
    tfidf_matrices_train = []
    tfidf_matrices_test = []

    # Loop through features to fit and transform
    for feature in features:
        # Fit on training data and transform both train and test data
        vectorizers[feature].fit(X_train_raw[feature])
        tfidf_matrix_train = vectorizers[feature].transform(X_train_raw[feature])
        tfidf_matrix_test = vectorizers[feature].transform(X_test_raw[feature])

        tfidf_matrices_train.append(tfidf_matrix_train)
        tfidf_matrices_test.append(tfidf_matrix_test)

    # Combine all TF-IDF matrices into one
    X_train = hstack(tfidf_matrices_train)
    X_test = hstack(tfidf_matrices_test)

    return X_train, X_test

# List of features to vectorize
features = ['description', 'requirements', 'benefits']

# Fit and transform the TF-IDF vectors (this will be cached)
X_train, X_test = fit_and_transform_tfidf(X_train, X_test, text_columns)


In [29]:
print(X_train.shape)
print(X_test.shape)

(14303, 137613)
(3576, 137613)


In [30]:
# check for imbalaced dataset - calculate percentages:
y_train_counts = y_train.value_counts()
percentages = (y_train_counts/y_train_counts.sum())*100
percentages

fraudulent
0    95.161854
1     4.838146
Name: count, dtype: float64

## Oversampling because the dataset is imbalanced

In [31]:
# Creating an instance of SMOTE
smote = SMOTE()

# Balancing the data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(X_resampled.shape)
print(y_resampled.shape)

(27222, 137613)
(27222,)


In [32]:
# check for imbalaced dataset - calculate percentages:
y_resampled_counts = y_resampled.value_counts()
percentages = (y_resampled_counts/y_resampled_counts.sum())*100
percentages

fraudulent
0    50.0
1    50.0
Name: count, dtype: float64

## Save processed data

In [33]:
# Save the sparse matrix
save_npz('../data/processed/X_resampled.npz', X_resampled)

# Save y_resampled as a DataFrame (you can also use Series)
pd.Series(y_resampled).to_csv('../data/processedy_resampled.csv', index=False)

##  Load processed data

In [34]:
# Load the sparse matrix
X_resampled_loaded = load_npz('../data/processed/X_resampled.npz')

# Load the target variable
y_resampled_loaded = pd.read_csv('../data/processedy_resampled.csv').values.flatten()  # Use .values.flatten() if you need it as a 1D array