In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

In [None]:
!pip install word2number
!pip install contractions

In [None]:
df = pd.read_csv('/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv')
df

In [None]:
df.info()

In [None]:
df['fraudulent'].value_counts()
#heavily imbalanced

In [None]:
# Remove exact duplicate rows, then renumber the index 0..n-1.
# reset_index() returns a new frame, so the original bare call had no effect;
# drop=True keeps the old labels from being added as an extra 'index' column.
# Label-based lookups further down refer to this renumbered index.
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
len(df)

In [None]:
df.isna().sum()
# salary col has max null vals, makes sense

## DATA EXPLORATION ##

In [None]:
df_na = df.fillna('na')

In [None]:
real = df_na[df_na['fraudulent'] == 0]
real.head()

In [None]:
fake = df_na[df_na['fraudulent'] == 1]
fake.sample(5)

In [None]:
fake.loc[5516, :]['employment_type']

In [None]:
## too many missing vals, so lets look into that

# Missing values

In [None]:
# Share of missing values per column, split by class. Missing entries were
# replaced by the string 'na' in df_na, so a row counts as missing when the
# value equals 'na'. Columns are named so the table is readable on display.
na_rates = pd.DataFrame({
    'column': list(df.columns),
    'real_na_rate': [(real[col] == 'na').mean() for col in df.columns],
    'fake_na_rate': [(fake[col] == 'na').mean() for col in df.columns],
})

In [None]:
na_rates
#excluding salary range, all of the remaining columns have higher ratio of nulls for fake ads

# Examining the features 

In [None]:
df.columns

In [None]:
len(df_na['title'].unique())

In [None]:
len(df_na['location'].unique())

In [None]:
len(df_na['department'].unique())

In [None]:
df_na[df_na['salary_range'] != 'na'].head()
#not a categorical variable. its in a range. and a very small portion of rows have salary listed (most jobs usually dont have salary listed)

In [None]:
df_na['company_profile']
#text

In [None]:
df['description']
#text
# benefits and requirements are textual columns too

In [None]:
df_na['telecommuting'].unique()
# categorical variable

In [None]:
df_na['has_company_logo'].unique()
# categorical variable

In [None]:
df_na['has_questions'].unique()
# categorical variable

In [None]:
df_na['employment_type'].unique()
# categorical variable

In [None]:
df_na['required_education'].unique()
# categorical variable

In [None]:
df_na['required_experience'].unique()
# categorical variable

In [None]:
df_na['industry'].unique()
print(len(df_na['industry'].unique()))

# categorical variable

In [None]:
print(df_na['function'].unique())
# categorical variable

In [None]:
textual_cols = ['company_profile', 'description', 'requirements', 'benefits']
categorical_cols = ['telecommuting', 'has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education','industry', 'function','title', 'location', 'department']

# Examining text fields

In [None]:
def len_fun(x):
    """Return the number of items in ``x`` (characters, when ``x`` is a string)."""
    size = len(x)
    return size

In [None]:
# Character count of each free-text field. Missing values were filled with
# the string 'na' when df_na was built, so they contribute a length of 2.
for field in ['company_profile', 'description', 'requirements', 'benefits', 'title']:
    df_na[field + '_length'] = df_na[field].apply(len_fun)

# Combined length of the four body fields (the title length is not included).
body_length_cols = ['company_profile_length', 'description_length',
                    'requirements_length', 'benefits_length']
df_na['text_length'] = df_na[body_length_cols].sum(axis=1)

df_na.head()

In [None]:
ax=sb.displot(df_na, x='description_length', hue='fraudulent', kind='kde', common_norm=False)

In [None]:
ax=sb.displot(df_na, x='company_profile_length', hue='fraudulent', kind='kde', common_norm=False)

In [None]:
ax=sb.displot(df_na, x='requirements_length', hue='fraudulent', kind='kde', common_norm=False)

In [None]:
ax=sb.displot(df_na, x='benefits_length', hue='fraudulent', kind='kde', common_norm=False)

In [None]:
ax=sb.displot(df_na, x='title_length', hue='fraudulent', kind='kde', common_norm=False)

In [None]:
ax=sb.displot(df_na, x='text_length', hue='fraudulent', kind='kde', common_norm=False)

### Conclusion: fake ads are generally shorter, as the length distributions for these ads peak at lower values than those of the real ones

# Examining categorical features

In [None]:
categorical_cols

In [None]:
g = sb.catplot(data=df_na, y='fraudulent', x='telecommuting', kind='bar')
g.set_axis_labels("", "Fraud Rate")

In [None]:
g = sb.catplot(data=df_na, x='has_company_logo', y='fraudulent', kind='bar')
g.set_axis_labels("", "Fraud Rate")

In [None]:
g = sb.catplot(data=df_na, x='has_questions', y='fraudulent', kind='bar')
g.set_axis_labels("", "Fraud Rate")

In [None]:
g = sb.catplot(data=df_na, x='employment_type', y='fraudulent', kind='bar')
g.set_axis_labels("", "Fraud Rate")

In [None]:
g = sb.catplot(data=df_na, y='required_experience', x='fraudulent', kind='bar')
g.set_axis_labels("Fraud Rate", "")

In [None]:
g = sb.catplot(data=df_na, y='required_education', x='fraudulent', kind='bar')
g.set_axis_labels("Fraud Rate", "")

In [None]:
g = sb.catplot(data=df_na, y='function', x='fraudulent', kind='bar')
g.set_axis_labels("Fraud Rate", "")

In [None]:
too_many_cats = ['industry', 'title', 'location', 'department']

In [None]:
# Fraud share per industry. The two count Series are named explicitly so the
# column labels do not depend on the pandas version: value_counts() only
# names its result 'count' from pandas 2.0 on.
industry_all = df_na['industry'].value_counts().rename('count').to_frame()
industry_fake = fake['industry'].value_counts().rename('count1').to_frame()
industry_fake_ratio = pd.concat([industry_all, industry_fake], axis=1)
# Industries without any fraudulent ad have NaN in 'count1' and are dropped.
industry_fake_ratio.dropna(inplace=True)
industry_fake_ratio['ratio'] = industry_fake_ratio['count1'] / industry_fake_ratio['count']
industry_fake_ratio.sort_values('count', ascending=False)

In [None]:
# Fraud share per job title. The count Series are named explicitly so the
# column labels do not depend on the pandas version (value_counts() only
# names its result 'count' from pandas 2.0 on).
d_all = df_na['title'].value_counts().rename('count').to_frame()
d_fake = fake['title'].value_counts().rename('count1').to_frame()
d_fake_ratio = pd.concat([d_all, d_fake], axis=1)
# Titles without any fraudulent ad have NaN in 'count1' and are dropped.
d_fake_ratio.dropna(inplace=True)
d_fake_ratio['ratio'] = d_fake_ratio['count1'] / d_fake_ratio['count']
# Leave out titles whose ads are all fraudulent (count == count1).
d_fake_ratio = d_fake_ratio[d_fake_ratio['count'] != d_fake_ratio['count1']]
d_fake_ratio.sort_values('ratio', ascending=False)

In [None]:
# Fraud share per location string. The count Series are named explicitly so
# the column labels do not depend on the pandas version (value_counts() only
# names its result 'count' from pandas 2.0 on).
d_all = df_na['location'].value_counts().rename('count').to_frame()
d_fake = fake['location'].value_counts().rename('count1').to_frame()
d_fake_ratio = pd.concat([d_all, d_fake], axis=1)
# Locations without any fraudulent ad have NaN in 'count1' and are dropped.
d_fake_ratio.dropna(inplace=True)
d_fake_ratio['ratio'] = d_fake_ratio['count1'] / d_fake_ratio['count']
# Leave out locations whose ads are all fraudulent (count == count1).
d_fake_ratio = d_fake_ratio[d_fake_ratio['count'] != d_fake_ratio['count1']]
d_fake_ratio.sort_values('ratio', ascending=False)

In [None]:
# Fraud share per department. The count Series are named explicitly so the
# column labels do not depend on the pandas version (value_counts() only
# names its result 'count' from pandas 2.0 on).
d_all = df_na['department'].value_counts().rename('count').to_frame()
d_fake = fake['department'].value_counts().rename('count1').to_frame()
d_fake_ratio = pd.concat([d_all, d_fake], axis=1)
# Departments without any fraudulent ad have NaN in 'count1' and are dropped.
d_fake_ratio.dropna(inplace=True)
d_fake_ratio['ratio'] = d_fake_ratio['count1'] / d_fake_ratio['count']
# Leave out departments whose ads are all fraudulent (count == count1).
d_fake_ratio = d_fake_ratio[d_fake_ratio['count'] != d_fake_ratio['count1']]
d_fake_ratio.sort_values('ratio', ascending=False)

In [None]:
def fun(x):
    """Split a comma-separated location string.

    Returns a two-item list with the first two comma-separated pieces (not
    stripped) when the string contains at least one comma; otherwise returns
    the whole string unchanged.
    """
    parts = x.split(',')
    return parts[0] if len(parts) <= 1 else parts[:2]
d = pd.DataFrame(df_na['location'].apply(fun))

def funn(x):
    """Return the state part of a location parsed by ``fun``, or '#' if absent.

    ``x`` is either a ``[country, state]`` list or, for locations that had no
    comma, a plain string. The state is returned with surrounding whitespace
    removed; an empty or whitespace-only state counts as missing.
    """
    # A plain string means there was no state field at all. Indexing it would
    # return its second character instead (e.g. 'a' for the 'na' placeholder).
    if isinstance(x, str):
        return '#'
    state = x[1].strip()
    return state if state else '#'
d['state'] = d['location'].apply(funn)
print(len(d))
d
d = d[d['state'] != '#']
print(len(d['state'].unique()))


### Best to group by country: there are too many unique state values, and about 3k state values are missing

### categorical features so far: 
* location: segregate by country only, as the US accounts for around 50% of the points
* department: not examined yet (TODO)
* salary_range: better to check if value exists or not
* telecommuting: works the way it is
* has_company_logo: works the way it is
* has_questions: works the way it is
* employment_type: works the way it is
* experience: works the way it is
* education: works the way it is
* industry: 4k missing values, and the remaining ones span a wide variety of categories; not much can be done about it.
* function: works the way it is

### textual features so far: 
* company_profile
* description
* requirements
* benefits

#### Combine all of these into a single text field; its length is one candidate feature. The texts might also contain URLs, emails or numbers, and those could be features too.

# Textual Analysis

In [None]:
real_text=real.title+' '+real.company_profile+' '+real.description+' '+real.requirements+' '+real.benefits

In [None]:
real_text_frame=real_text.to_frame(name='text')

In [None]:
fake_text=fake.title+' '+fake.company_profile+' '+fake.description+' '+fake.requirements+' '+fake.benefits

In [None]:
fake_text_frame=fake_text.to_frame(name='text')

In [None]:
print(fake_text.str.contains('#URL', regex=False).value_counts(normalize=True))
print()
print(real_text.str.contains('#URL', regex=False).value_counts(normalize=True))

In [None]:
print(fake_text.str.contains('#EMAIL', regex=False).value_counts(normalize=True))
print()
print(real_text.str.contains('#EMAIL', regex=False).value_counts(normalize=True))

In [None]:
print(fake_text.str.contains('#PHONE', regex=False).value_counts(normalize=True))
print()
print(real_text.str.contains('#PHONE', regex=False).value_counts(normalize=True))

In [None]:
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions

nlp = spacy.load("en_core_web_sm")

# exclude words from spacy stopwords list
deselect_stop_words = ['no', 'not']
for w in deselect_stop_words:
    nlp.vocab[w].is_stop = False


def strip_html_tags(text):
    """Return the text content of an HTML fragment, joining text nodes with a space."""
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")


def remove_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    # split() with no argument already discards leading/trailing whitespace.
    words = text.split()
    return " ".join(words)


def remove_accented_chars(text):
    """Pass ``text`` through ``unidecode.unidecode`` (e.g. to turn café into cafe)."""
    return unidecode.unidecode(text)

def expand_contractions(text):
    """Expand contracted forms via ``contractions.fix``, e.g. "don't" -> "do not"."""
    return contractions.fix(text)


def text_preprocessing(text, accented_chars=True, contractions=True, 
                       convert_num=True, extra_whitespace=True, 
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True, 
                       stop_words=True):
    """Clean ``text`` and return its remaining tokens as a list of strings.

    String-level steps run first (HTML stripping, whitespace collapsing,
    accent transliteration, contraction expansion, lower-casing), then the
    text is tokenised with the module-level spaCy pipeline ``nlp`` and each
    token is filtered or rewritten. Every step is enabled by default.

    Parameters
    ----------
    text : str
        Raw input text.
    accented_chars, contractions, extra_whitespace, lowercase, remove_html : bool
        Toggle the corresponding string-level step.
    stop_words, punctuations, special_chars, remove_num : bool
        Drop stop words (unless tagged NUM), PUNCT tokens, SYM tokens, and
        numeric tokens (tagged NUM or ``str.isnumeric()``) respectively.
    convert_num : bool
        Replace number words by their digits. Only has an effect when
        ``remove_num`` is False, since numbers are otherwise dropped.
    lemmatization : bool
        Replace every other kept token by its lemma.

    Returns
    -------
    list of str
        The kept tokens, in order.
    """
    # Note: the ``contractions`` flag shadows the module of the same name
    # inside this function; expand_contractions() resolves the module itself.
    if remove_html:
        text = strip_html_tags(text)
    if extra_whitespace:
        text = remove_whitespace(text)
    if accented_chars:
        text = remove_accented_chars(text)
    if contractions:
        text = expand_contractions(text)
    if lowercase:
        text = text.lower()

    doc = nlp(text)  # tokenise text

    clean_text = []

    for token in doc:
        is_number = token.pos_ == 'NUM'

        # Filters: the first one that matches drops the token.
        if stop_words and token.is_stop and not is_number:
            continue
        if punctuations and token.pos_ == 'PUNCT':
            continue
        if special_chars and token.pos_ == 'SYM':
            continue
        if remove_num and (is_number or token.text.isnumeric()):
            continue

        edit = token.text
        if convert_num and is_number:
            # word_to_num raises ValueError for NUM tokens it cannot parse
            # (e.g. "10k"); keep the original text in that case. The result
            # is stored as a string so callers can " ".join() the tokens.
            try:
                edit = str(w2n.word_to_num(token.text))
            except ValueError:
                edit = token.text
        elif lemmatization and token.lemma_ != "-PRON-":
            edit = token.lemma_

        if edit != "":
            clean_text.append(edit)
    return clean_text


In [None]:
# Clean every real ad and store its tokens re-joined into one
# space-separated string.
real_text_frame['clean'] = real_text_frame['text'].apply(
    lambda raw: " ".join(text_preprocessing(raw))
)

In [None]:
# Produces the same 'clean' column as the previous cell (each 'text' cleaned
# and re-joined), so running both is redundant. The original loop wrote via
# chained indexing (frame['clean'][i] = ...), which pandas warns about and
# which does not update the frame under copy-on-write; assigning the whole
# column at once avoids that and also works when 'clean' does not exist yet.
real_text_frame['clean'] = [
    " ".join(text_preprocessing(raw)) for raw in real_text_frame['text']
]

In [None]:
real_text_frame

In [None]:
from collections import Counter

# Word frequencies over all cleaned real ads.
real_total_text = ' '.join(real_text_frame['clean']).split()
real_counts = Counter(real_total_text)

# Top 20 (word, count) pairs, split into parallel lists for plotting.
real_top20 = real_counts.most_common(20)
real_common_words = [word for word, _ in real_top20]
real_common_counts = [freq for _, freq in real_top20]

fig = plt.figure(figsize=(18,6))
sb.barplot(x=real_common_words, y=real_common_counts)
plt.title('Most Common Words used in Real Job Ads')
plt.show()

In [None]:
# Clean every fake ad and store its tokens re-joined into one
# space-separated string.
fake_text_frame['clean'] = fake_text_frame['text'].apply(
    lambda raw: " ".join(text_preprocessing(raw))
)

In [None]:
from collections import Counter

# Word frequencies over all cleaned fake ads.
fake_total_text = ' '.join(fake_text_frame['clean']).split()
fake_counts = Counter(fake_total_text)

# Top 20 (word, count) pairs, split into parallel lists for plotting.
fake_top20 = fake_counts.most_common(20)
fake_common_words = [word for word, _ in fake_top20]
fake_common_counts = [freq for _, freq in fake_top20]

fig = plt.figure(figsize=(18,6))
sb.barplot(x=fake_common_words, y=fake_common_counts)
plt.title('Most Common Words used in Fake Job Ads')
plt.show()

### Can't identify any commonly used words that set fake ads apart from real ones

# Features

In [None]:
finaldf = df.copy()

In [None]:
# Missing-value indicators; they must be computed before the NaNs are
# replaced by 'na' below.
finaldf['missing_company_profile'] = finaldf['company_profile'].isnull().astype(int)
# Fix: this flag used to be derived from company_profile as well, which made
# it an exact copy of missing_company_profile.
finaldf['missing_salary_range'] = finaldf['salary_range'].isnull().astype(int)
finaldf.fillna('na', inplace=True)

In [None]:
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
# Replace the 'na' placeholder in the free-text fields by a single space.
for column in text_columns:
    is_placeholder = finaldf[column] == 'na'
    finaldf[column] = finaldf[column].mask(is_placeholder, ' ')

In [None]:
def location_transform(location):
    """Reduce a location string to a coarse country bucket.

    The first two characters of ``location`` are returned when they are one
    of the listed country codes (or the 'na' placeholder); every other value
    maps to 'other'.
    """
    known_prefixes = ('US', 'GB', 'CA', 'DE', 'NZ', 'AU', 'IN', 'MY', 'na')
    prefix = location[:2]
    return prefix if prefix in known_prefixes else 'other'
    
finaldf.location=finaldf.apply(lambda x: location_transform(x['location']), axis=1)


In [None]:
finaldf['text']=finaldf.title+' '+finaldf.company_profile+' '+finaldf.description+' '+finaldf.requirements+' '+finaldf.benefits
finaldf['email_link']=finaldf.text.str.contains('#EMAIL', regex=False).astype(int)
finaldf['phone_link']=finaldf.text.str.contains('#PHONE', regex=False).astype(int)

In [None]:
import math
# Number of characters in the combined text of each ad.
finaldf['text_length'] = finaldf['text'].str.len()

In [None]:
# Clean the combined text of every ad and store the tokens re-joined into
# one space-separated string.
finaldf['cleaned_text'] = finaldf['text'].apply(
    lambda raw: " ".join(text_preprocessing(raw))
)

In [None]:
finaldf

In [None]:
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y):
    """Return mutual-information scores of the columns of ``X`` against ``y``.

    Object/category columns are label-encoded with ``factorize`` and boolean
    columns are cast to int on a copy of ``X``; every integer column is then
    passed to ``mutual_info_classif`` as a discrete feature.

    Returns a Series named "MI Scores", indexed by column name and sorted in
    descending order.
    """
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # Boolean columns (e.g. pd.get_dummies output on pandas 2.x) are discrete
    # as well, but is_integer_dtype() is False for bool, so they would be
    # handed to the estimator as continuous features. Cast them to int first.
    for colname in X.select_dtypes(["bool"]):
        X[colname] = X[colname].astype(int)
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_classif(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores``, one bar per index label.

    Bars are plotted in ascending score order on the current matplotlib
    figure, with the index labels as y-axis ticks.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")

In [None]:
feature_set=['location','missing_salary_range','missing_company_profile','telecommuting','has_company_logo','has_questions',
             'employment_type','required_experience','required_education','industry','function',
             'email_link','phone_link','text_length']
scores=make_mi_scores(finaldf[feature_set], finaldf['fraudulent'])
plot_mi_scores(scores)

In [None]:
finaldf_getdummy=pd.get_dummies(data=finaldf, columns=['location','employment_type','required_experience',
                                                       'required_education','industry','function'])

In [None]:
finaldf_getdummy

In [None]:
# Feature matrix: drop the target plus the raw columns that are either
# already folded into 'text' or not used as features.
non_feature_cols = ['fraudulent', 'title', 'department', 'salary_range',
                    'company_profile', 'description', 'requirements', 'benefits']
X = finaldf_getdummy.drop(columns=non_feature_cols)
# NOTE(review): the source CSV is expected to carry a 'job_id' row identifier;
# confirm against df.columns. An identifier must not be used as a model
# feature, so it is removed when present (errors='ignore' makes this a no-op
# otherwise).
X = X.drop(columns=['job_id'], errors='ignore')
y = finaldf_getdummy['fraudulent']

In [None]:
X

In [None]:
scores=make_mi_scores(X,y)[2:32]
plot_mi_scores(scores)

# Model Stuff

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=50, stratify=y)

In [None]:
# Build three training subsets: each takes every third majority-class row
# (y == 0, interleaved so the thirds are disjoint) plus all minority-class
# rows (y == 1). pd.concat replaces the private DataFrame._append method.
X_train_neg = X_train[y_train == 0]
X_train_pos = X_train[y_train == 1]
X_train1 = pd.concat([X_train_neg.iloc[0::3], X_train_pos])
X_train2 = pd.concat([X_train_neg.iloc[1::3], X_train_pos])
X_train3 = pd.concat([X_train_neg.iloc[2::3], X_train_pos])

In [None]:
# Labels for the three training subsets, selected exactly like the feature
# rows: every third majority-class label (y == 0) plus all minority-class
# labels (y == 1). pd.concat replaces the private Series._append method.
y_train_neg = y_train[y_train == 0]
y_train_pos = y_train[y_train == 1]
y_train1 = pd.concat([y_train_neg.iloc[0::3], y_train_pos])
y_train2 = pd.concat([y_train_neg.iloc[1::3], y_train_pos])
y_train3 = pd.concat([y_train_neg.iloc[2::3], y_train_pos])

In [None]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy=0.6, random_state= 42)
X_train1_resampled, y_train1_resampled = ros.fit_resample(X_train1, y_train1)
X_train2_resampled, y_train2_resampled = ros.fit_resample(X_train2, y_train2)
X_train3_resampled, y_train3_resampled = ros.fit_resample(X_train3, y_train3)

# SVM BoW Model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

vectorizer = CountVectorizer(ngram_range=(1,2)) # looking for both unigrams and bigrams
clf = LinearSVC(C=0.01, class_weight='balanced', random_state=42)
bowpipe = Pipeline([('vectorizer', vectorizer), ('clf', clf)])
bowpipe.fit(X_train1_resampled['cleaned_text'], y_train1_resampled)
bow_predict = bowpipe.predict(X_test['cleaned_text'])

In [None]:
from sklearn.metrics import accuracy_score, classification_report
print("accuracy:", accuracy_score(y_test, bow_predict))
print(classification_report(y_test, bow_predict))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm_bow=confusion_matrix(y_test, bow_predict)
disp=ConfusionMatrixDisplay(confusion_matrix=cm_bow)
disp.plot()
plt.show()

# Random Forest

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

t_vectorizer = TfidfVectorizer(ngram_range=(1,2))
t_clf = RandomForestClassifier(class_weight='balanced', random_state=42)
t_bowpipe = Pipeline([('vectorizer', t_vectorizer), ('clf', t_clf)])
t_bowpipe.fit(X_train2_resampled['cleaned_text'], y_train2_resampled)
t_bow_predict = t_bowpipe.predict(X_test['cleaned_text'])

In [None]:
print("accuracy:", accuracy_score(y_test, t_bow_predict))
print(classification_report(y_test, t_bow_predict))

In [None]:
cm_t_bow=confusion_matrix(y_test, t_bow_predict)
disp=ConfusionMatrixDisplay(confusion_matrix=cm_t_bow)
disp.plot()
plt.show()


# XGBoost

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

num_cols = ['text_length']
ct = ColumnTransformer([('num', StandardScaler(), num_cols)], remainder='passthrough') 
# Standardize the value of numerical features to avoid overweighted in training

In [None]:
Xtr3=X_train3_resampled.drop(['text','cleaned_text'], axis=1)
Xte=X_test.drop(['text','cleaned_text'], axis=1)


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
xgbc=XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='error',
                   eta=0.7, gamma=0,
                  max_depth=7, min_child_weight=1, random_state=42)
xgbcpipe = Pipeline([('preprocessor', ct), ('clf', xgbc)])
xgbcpipe.fit(Xtr3, y_train3_resampled)
xgbc_predict = xgbcpipe.predict(Xte)

In [None]:
print("accuracy:", accuracy_score(y_test, xgbc_predict))
print(classification_report(y_test, xgbc_predict))

In [None]:
cm_xgbc=confusion_matrix(y_test, xgbc_predict)
disp=ConfusionMatrixDisplay(confusion_matrix=cm_xgbc)
disp.plot()
plt.show()

# ENSEMBLE MODEL

In [None]:
# Majority vote of the three classifiers on the test set: the score is the
# number of models predicting class 1, and the ensemble predicts 1 when at
# least two of them agree.
ensemble_test_score = [
    svm_vote + rf_vote + xgb_vote
    for svm_vote, rf_vote, xgb_vote in zip(bow_predict, t_bow_predict, xgbc_predict)
]
ensemble_test_predict = [1 if votes > 1 else 0 for votes in ensemble_test_score]

In [None]:
print("accuracy:", accuracy_score(y_test, ensemble_test_predict))
print(classification_report(y_test, ensemble_test_predict))

In [None]:
cm_ensemble_test=confusion_matrix(y_test, ensemble_test_predict)
disp=ConfusionMatrixDisplay(confusion_matrix=cm_ensemble_test)
disp.plot()
plt.show()


In [None]:
#apply the ensemble models for the whole dataset
# NOTE(review): X is the full feature frame that X_train/X_test were split
# from, so these predictions cover rows the models were fitted on. Metrics
# computed from them are not a held-out estimate; use the test-set results
# for that.
bow_svm_predict_full=bowpipe.predict(X['cleaned_text'])
bow_rf_predict_full=t_bowpipe.predict(X['cleaned_text'])
xgbc_predict_full = xgbcpipe.predict(X.drop(['text','cleaned_text'], axis=1))

In [None]:
finaldf['bow_svm_predict_full']=bow_svm_predict_full
finaldf['bow_rf_predict_full']=bow_rf_predict_full
finaldf['xgbc_predict_full']=xgbc_predict_full

In [None]:
# Number of models voting "fraudulent" per row, and the majority decision
# (1 when at least two of the three models predict class 1).
vote_columns = ['bow_svm_predict_full', 'bow_rf_predict_full', 'xgbc_predict_full']
finaldf['ensemble_total'] = finaldf[vote_columns].sum(axis=1)
finaldf['ensemble_predict'] = (finaldf['ensemble_total'] > 1).astype('int64')

In [None]:
print("accuracy:", accuracy_score(y, finaldf['ensemble_predict']))
print(classification_report(y, finaldf['ensemble_predict']))


In [None]:
cm_ensemble=confusion_matrix(y, finaldf['ensemble_predict'])
disp=ConfusionMatrixDisplay(confusion_matrix = cm_ensemble)
disp.plot()
plt.show()