In [46]:
import numpy as np
import pandas as pd

In [47]:
df=pd.read_csv("job_postings.csv")

In [48]:
df.head(3)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,7531,Contact Center Representatives,"US, VA, Virginia Beach",,,Tidewater Finance Co. was established in 1992 ...,"Tidewater Finance Company, located in Virginia...",The position requires the following qualificat...,Our company offers a competitive salary plus B...,0,1,0,Full-time,Entry level,Unspecified,Financial Services,Customer Service,0
1,130,Customer Service Associate,"US, TX, Dallas",,,"Novitex Enterprise Solutions, formerly Pitney ...",The Customer Service Associate will be based i...,QualificationsMinimum of 1 year customer servi...,,0,1,0,Full-time,Entry level,High School or equivalent,Telecommunications,Customer Service,0
2,4641,Automated Test Analyst,"NZ, , Auckland",Permanent,,SilverStripe CMS &amp; Framework is an open so...,We are looking for a dedicated and passionate ...,,,0,1,1,Full-time,Mid-Senior level,,Information Technology and Services,,0


In [49]:
df.isnull().sum()

Unnamed: 0,0
job_id,0
title,0
location,280
department,9275
salary_range,12021
company_profile,2672
description,1
requirements,2132
benefits,5803
telecommuting,0


In [50]:
# prompt: do a word cloud of the text data in the set

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def generate_word_cloud(df):
  # Combine text from relevant columns
  text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
  text = " ".join(df[col].astype(str).str.cat(sep=' ') for col in text_columns)

  # Generate a word cloud image
  wordcloud = WordCloud(random_state=42, width = 800, height = 450).generate(text)

  # Display the generated image:
  # the matplotlib way:
  plt.figure(figsize=(10, 8))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis("off")
  plt.show()

In [51]:
# prompt: do pca on these 3 cols 'telecommuting', 'has_company_logo', 'has_questions'

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Select the columns for PCA
features = ['telecommuting', 'has_company_logo', 'has_questions']
x = df[features].values

# Standardize the data
x = StandardScaler().fit_transform(x)

# Apply PCA
pca = PCA(n_components=3) # You can choose the number of components
principal_components = pca.fit_transform(x)

# Create a new DataFrame with the principal components
pca_df = pd.DataFrame(data = principal_components, columns = ['principal_component_1', 'principal_component_2', 'principal_component_3'])

# You can optionally add the target variable if you want to visualize
# pca_df['fraudulent'] = df['fraudulent'].values

print(pca_df.head())
print(f'Explained variance ratio: {pca.explained_variance_ratio_}')


   principal_component_1  principal_component_2  principal_component_3
0              -0.326663              -0.343341               1.025672
1              -0.326663              -0.343341               1.025672
2               1.084059              -0.128682              -0.375964
3              -0.326663              -0.343341               1.025672
4              -0.326663              -0.343341               1.025672
Explained variance ratio: [0.41494342 0.33442034 0.25063624]


In [52]:
df.drop(['job_id'],axis=1,inplace = True)
X = df.drop(columns=['fraudulent'])
y = df['fraudulent']

In [53]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize NLP tools
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Clean text features
def clean_text(text):
    if isinstance(text, str):
        words = word_tokenize(text.lower())
        words = [ps.stem(word) for word in words if word.isalnum() and word not in stop_words]
        return ' '.join(words)
    return ''

text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']
for col in text_cols:
    X[col] = X[col].apply(clean_text)

# Create text_specified binary feature
X['text_specified'] = X[text_cols].apply(lambda x: 1 if any(x.dropna().astype(str).str.strip() != '') else 0, axis=1)

# Decompose location
def split_location(loc):
    if isinstance(loc, str) and ',' in loc:
        parts = [part.strip() for part in loc.split(',')]
        return [parts[0] if len(parts) > 0 else 'Unspecified',
                parts[1] if len(parts) > 1 else 'Unspecified',
                parts[2] if len(parts) > 2 else 'Unspecified']
    return ['Unspecified', 'Unspecified', 'Unspecified']

loc_data = X['location'].apply(split_location).apply(pd.Series)
X['country'] = loc_data[0]
X['state'] = loc_data[1]
X['city'] = loc_data[2]
X = X.drop(columns=['location'])

# Decompose salary_range
def split_salary(salary):
    if isinstance(salary, str) and '-' in salary:
        try:
            min_sal, max_sal = map(int, salary.split('-'))
            return [min_sal, max_sal, 1]
        except:
            return [0, 0, 0]
    return [0, 0, 0]

sal_data = X['salary_range'].apply(split_salary).apply(pd.Series)
X['min_salary'] = sal_data[0]
X['max_salary'] = sal_data[1]
X['salary_specified'] = sal_data[2]
X = X.drop(columns=['salary_range'])

# Handle missing categorical values
cat_cols = ['department', 'employment_type', 'required_experience', 'required_education',
            'industry', 'function', 'country', 'state', 'city']
for col in cat_cols:
    X[col] = X[col].fillna('Unspecified')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
X.head(3)

Unnamed: 0,title,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,...,required_education,industry,function,text_specified,country,state,city,min_salary,max_salary,salary_specified
0,contact center repres,Unspecified,tidewat financ establish 1992 initi purpos pur...,tidewat financ compani locat virginia beach va...,posit requir follow qualif minimum 1 year call...,compani offer competit salari plu bonus well c...,0,1,0,Full-time,...,Unspecified,Financial Services,Customer Service,1,US,VA,Virginia Beach,0,0,0
1,custom servic associ,Unspecified,novitex enterpris solut formerli pitney bow ma...,custom servic associ base dalla tx right candi...,qualificationsminimum 1 year custom servic rel...,,0,1,0,Full-time,...,High School or equivalent,Telecommunications,Customer Service,1,US,TX,Dallas,0,0,0
2,autom test analyst,Permanent,silverstrip cm amp framework open sourc platfo...,look dedic passion softwar test analyst team p...,,,0,1,1,Full-time,...,Unspecified,Information Technology and Services,Unspecified,1,NZ,,Auckland,0,0,0


In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Text features
tfidf = TfidfVectorizer(max_features=500)
text_cols = ['title', 'company_profile', 'description',
              'requirements', 'benefits']
text_features = tfidf.fit_transform(X[text_cols].agg(' '.join, axis=1)).toarray()

# Numerical and binary features (exclude job_id)
num_cols = ['min_salary', 'max_salary', 'telecommuting', 'has_company_logo',
            'has_questions', 'salary_specified']
scaler = StandardScaler()
num_features = scaler.fit_transform(X[num_cols])

cat_cols = ['country', 'state', 'city']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_features = encoder.fit_transform(X[cat_cols])

# Combine features
features = np.hstack([text_features, cat_features, num_features])

In [56]:
from imblearn.under_sampling import RandomUnderSampler

under_sampler = RandomUnderSampler()
X_res, y_res = under_sampler.fit_resample(features, y)

In [57]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(features, y)

In [58]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3,
                                                    stratify=y_sm, random_state=42)
rf = RandomForestClassifier(class_weight='balanced', max_depth=None, min_samples_split=2, n_estimators=200)
%time rf.fit(X_train, y_train)
y_pred_class = rf.predict(X_test)
print("Classification Accuracy:", accuracy_score(y_test, y_pred_class))
print("Classification Report\n")
print(classification_report(y_test, y_pred_class))
print("Confusion Matrix\n")
print(confusion_matrix(y_test, y_pred_class))

CPU times: user 47.2 s, sys: 457 ms, total: 47.7 s
Wall time: 44.1 s
Classification Accuracy: 0.9976735643443124
Classification Report

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4084
           1       1.00      1.00      1.00      4083

    accuracy                           1.00      8167
   macro avg       1.00      1.00      1.00      8167
weighted avg       1.00      1.00      1.00      8167

Confusion Matrix

[[4080    4]
 [  15 4068]]


In [60]:
import joblib

joblib.dump((rf, tfidf, scaler, encoder), 'rf_smote.joblib')

['rf_smote.joblib']