In [63]:
# general
import pandas as pd
import numpy as np
import itertools
import warnings
import json
import pickle
import argparse

# nlp
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import gensim
import string

# sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
def preprocess(data, col):
    """Preprocess text"""
    st = PorterStemmer()
    stopwords_dict = stopwords.words('english') # all stopwords in English
    data[col] = data[col].apply(lambda x: " ".join([st.stem(i) for i in x.split() if i not in stopwords_dict]).lower().strip(string.punctuation))

def create_ngrams(text, n=2):
    """Create n-grams given a review"""
    unigrams = nltk.word_tokenize(text) # text should already be cleaned
    unigrams_joined = ' '.join(unigrams)

    if len(unigrams) > 1:
        bigrams = list(map(lambda x: '_'.join(x), zip(unigrams, unigrams[1:])))
    else:
        bigrams = []

    bigrams_joined = ' '.join(bigrams)

    if n == 2:
        return bigrams_joined
    elif n == 1.5:
        return unigrams_joined + ' ' + bigrams_joined
def ohe(data, cols=['required_education', 'required_experience', 'employment_type']):
    """One hot encode non-textual features"""

    dummies = pd.get_dummies(data[cols])

    data_new = pd.concat([data, dummies], axis=1)      
    data_new.drop(cols, inplace=True, axis=1)
    return data_new
def impute(data, feature='text'):
    """Imputation"""

    # categorical features
    unspecified_impute_features = ['employment_type', 'required_education', 'industry',
                                   'required_experience', 'function', 'location', 'department']
    data[unspecified_impute_features] = data[unspecified_impute_features].fillna('Unspecified')

    # textual features
    data['company_profile'] = data['company_profile'].fillna('')
    data['description'] = data['description'].fillna('')
    data['requirements'] = data['requirements'].fillna('')
    data['text'] = data['company_profile'] + data['description'] + data['requirements']
    data['text'] = data['text'].replace({'':'Missing'})

    # regrouping
    data['benefits'] = np.where(data['benefits'].isna(), 1, 0) # missing 1
    data['required_education'] = np.where(data['required_education'].str.contains("Vocational"), 'Vocational', data['required_education'])

    # salary extraction and imputation
    data[['salary_lower', 'salary_upper']] = data['salary_range'].str.split('-', 1, expand=True)
    data['salary_lower'] = np.where(data['salary_lower'].str.isnumeric(), data['salary_lower'], np.nan)
    data['salary_lower'] = data['salary_lower'].astype(float)
    data['salary_lower'] = data['salary_lower'].fillna(data['salary_lower'].median())
    data['salary_upper'] = np.where(data['salary_upper'].str.isnumeric(), data['salary_upper'], np.nan)
    data['salary_upper'] = data['salary_upper'].astype(float)
    data['salary_upper'] = data['salary_upper'].fillna(data['salary_upper'].median())

In [4]:
warnings.filterwarnings("ignore")

# settings
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 100)

# load the news dataset
path = '../data/fake_job_postings.csv'
df = pd.read_csv(path)
print('Shape of dataframe is {}'.format(df.shape))
# df = open(path, encoding='utf8').readlines()
# df = [json.loads(x) for x in df]
# df = pd.DataFrame(df)

print('----- percentage for each class -----')
print(pd.DataFrame(df.groupby('fraudulent').description.count()/len(df)).rename(columns={'description':'percentage'}))

# concat title and content
feature = 'text'; target = 'fraudulent'

# upsampling
neg = df[df.fraudulent == 0]
pos = df[df.fraudulent == 1].sample(n=len(neg), replace=True, random_state=42)
df = pd.concat([neg, pos])
df = df.sample(frac=1).reset_index(drop=True) # shuffle rows
print('----- percentage for each class after upsampling -----')
print(pd.DataFrame(df.groupby('fraudulent').description.count()/len(df)).rename(columns={'description':'percentage'}))

# print(df.head())
# print(df.tail())

# data cleaning, remove stopwords and perform stemming
impute(df, feature) # this will automatically create text col
preprocess(df, feature)
print('----- preprocess finished -----')

Shape of dataframe is (17880, 18)
----- percentage for each class -----
            percentage
fraudulent            
0             0.951566
1             0.048378
----- percentage for each class after upsampling -----
            percentage
fraudulent            
0             0.500000
1             0.499295
----- preprocess finished -----


In [5]:
df = ohe(df) # one hot encoding 
add_features = ['benefits', 'has_company_logo', 'has_questions', 'telecommuting', # binary
                        'salary_lower', 'salary_upper', # numerical
                        'required_education.*', 'required_experience.*', 'employment_type.*'] # categorical
add_features = df.filter(regex='|'.join(add_features)).columns.to_list()

In [96]:
def svm_multimodal_wrapper(data, add_features, feature='text', target='fraudulent', k=1000, max_iter=500, min_df=3, loss='hinge', alpha=1e-4, path='svm_mult.pickle'):
    """SVM multimodal pipeline"""
    X = data[[feature]+add_features]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    vec = TfidfVectorizer(min_df=min_df, stop_words='english', sublinear_tf=True, ngram_range=(1,2))
    score_func = SelectKBest(chi2, k=k)
    mod = SGDClassifier(loss=loss, penalty='l2', alpha=alpha, random_state=42, max_iter=max_iter, tol=None)
    
    # save tf-idf to dataframe
    X_train1 = vec.fit_transform(X_train[feature])
    X_train1 = score_func.fit_transform(X_train1, y_train)
    train_data = pd.DataFrame(X_train1.toarray(), columns=sel.get_support(indices=True))
    train_data = pd.concat([y_train, X_train[add_features], train_data], axis=1) # append tf-idf features to meda features and target
    print(train_data.head())
    model = mod.fit(train_data[1:], train_data[target]) # model training

    # tranform test data for prediction and evaluation
    X_test1 = vec.transform(X_test1[feature])
    X_test1 = score_func.transform(X_test1)
    test_data = pd.DataFrame(X_test1.toarray(), columns=sel.get_support(indices=True))
    test_data = pd.concat([y_test, X_test[add_features], test_data], axis=1) # append tf-idf features to meda features and target

    print(classification_report(np.array(y_test), model.predict(test_data)))
    print(confusion_matrix(np.array(y_test), model.predict(test_data)))


In [77]:
svm_multimodal_wrapper(data=df, add_features=add_features, k=1000, loss='hinge', path='svm_mult0.pickle')

In [88]:
data = df; k=1000; max_iter=500; min_df=3; loss='hinge'; alpha=1e-4; feature='text'
X = data[[feature]+add_features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

vec = TfidfVectorizer(min_df=min_df, stop_words='english', sublinear_tf=True, ngram_range=(1,2))
score_func = SelectKBest(chi2, k=k)
# mod = SGDClassifier(loss=loss, penalty='l2', alpha=alpha, random_state=42, max_iter=max_iter, tol=None)
mod = LogisticRegression(random_state=42, max_iter=max_iter, C=1, solver='saga', l1_ratio=0.5)

In [89]:
# save tf-idf to dataframe
X_train1 = vec.fit_transform(X_train[feature])
X_train1 = score_func.fit_transform(X_train1, y_train)
train_data = pd.DataFrame(X_train1.toarray(), columns=score_func.get_support(indices=True))
# train_data.head()

In [90]:
y_train = y_train.reset_index(); y_train.drop(columns={'index'}, inplace=True)
X_train = X_train.reset_index(); X_train.drop(columns={'index'}, inplace=True)
train_data = train_data.reset_index(); train_data.drop(columns={'index'}, inplace=True)
train_data = pd.concat([y_train, X_train[add_features], train_data], axis=1) # append tf-idf features to meda features and target
# train_data.head()

In [91]:
model = mod.fit(train_data.iloc[:,1:], train_data[target]) # model training

In [92]:
# tranform test data for prediction and evaluation
X_test1 = vec.transform(X_test[feature])
X_test1 = score_func.transform(X_test1)
test_data = pd.DataFrame(X_test1.toarray(), columns=score_func.get_support(indices=True))

In [93]:
y_test = y_test.reset_index(); y_test.drop(columns={'index'}, inplace=True)
X_test = X_test.reset_index(); X_test.drop(columns={'index'}, inplace=True)
test_data = test_data.reset_index(); test_data.drop(columns={'index'}, inplace=True)
test_data = pd.concat([y_test, X_test[add_features], test_data], axis=1) # append tf-idf features to meda features and target

In [94]:
print(classification_report(np.array(y_test), model.predict(test_data.iloc[:,1:])))
print(confusion_matrix(np.array(y_test), model.predict(test_data.iloc[:,1:])))

              precision    recall  f1-score   support

           0       0.50      0.98      0.67      5074
           1       0.71      0.04      0.08      5135

    accuracy                           0.51     10209
   macro avg       0.61      0.51      0.37     10209
weighted avg       0.61      0.51      0.37     10209

[[4989   85]
 [4929  206]]
