In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from funs import *

In [3]:
# Locally
# data_url = 'https://github.com/dmika1234/ml_uwr_22/blob/Project/Project/data/fake_job_postings.csv'
data_path = 'data/fake_job_postings.csv'
raw_data = pd.read_csv(data_path)

# For colab
# data_url = '/content/fake_job_postings.csv'
# raw_data = pd.read_csv(data_url, error_bad_lines=False, engine="python")
#straszny problem miałem, żeby wczytać te dane tak ja ty to robiłeś. dziwne błędy mi wyskakiwały

### Text data preprocessing

In [4]:
text_colnames = ['company_profile', 'description', 'requirements', 'benefits']
DataPrep = DataPreprocessor()

text_data_ls = DataPrep.preprocess_data(text_data=raw_data, column_names=text_colnames, vectorize_fun=list)
text_data_np = DataPrep.preprocess_data(text_data=raw_data, column_names=text_colnames, vectorize_fun=np.array)
text_data_str = DataPrep.preprocess_data(text_data=raw_data, column_names=text_colnames, vectorize_fun=join_fun)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dmika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dmika\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dmika\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Preparing other columns

In [5]:
working_df = raw_data.copy()

Spliting location into country, state, city

In [6]:
working_df[['country', 'state', 'city']] = working_df['location'].str.split(',', expand=True).iloc[:,0:3]

Splitting salary range into min, max salary

In [7]:
working_df[['salary_min', 'salary_max']] = working_df['salary_range'].str.split('-', expand=True)
working_df[['salary_min', 'salary_max']] = working_df[['salary_min', 'salary_max']].apply(pd.to_numeric, errors='coerce').fillna(0)

In [8]:
target_colname = 'fraudulent'
# Getting numerical colnames and deleting not useful
numerical_colnames = list(working_df.select_dtypes(include='int64').columns)
numerical_colnames = list(set(numerical_colnames) - set(['job_id', target_colname]))
numerical_colnames = numerical_colnames + ['salary_min', 'salary_max']
# Getting other text colnames and deleting not useful
other_text_colnames = list(set(working_df.select_dtypes(include='object').columns) - set(text_colnames))
other_text_colnames = list(set(other_text_colnames) - set(['location', 'salary_range']))
print(numerical_colnames)
print(text_colnames)
print(other_text_colnames)

['telecommuting', 'has_questions', 'has_company_logo', 'salary_min', 'salary_max']
['company_profile', 'description', 'requirements', 'benefits']
['required_experience', 'employment_type', 'required_education', 'title', 'function', 'country', 'department', 'state', 'industry', 'city']


Filling missing values

In [9]:
working_df[text_colnames + other_text_colnames] = working_df[text_colnames + other_text_colnames].fillna('')
working_df[numerical_colnames] = working_df[numerical_colnames].fillna(0)

In [10]:
working_df[other_text_colnames].apply(lambda x: np.unique(x).shape[0]).sort_values()

employment_type            6
required_experience        8
required_education        14
function                  38
country                   91
industry                 132
state                    326
department              1338
city                    2336
title                  11231
dtype: int64

We will only use those with not so much levels(<50 for start)

In [11]:
final_other_text_colnames = ['employment_type', 'required_experience', 'required_education', 'function']

In [72]:
X = pd.get_dummies(working_df[final_other_text_colnames], columns=final_other_text_colnames)
X[numerical_colnames] = working_df[numerical_colnames]
y = working_df[target_colname]
X

Unnamed: 0,employment_type_,employment_type_Contract,employment_type_Full-time,employment_type_Other,employment_type_Part-time,employment_type_Temporary,required_experience_,required_experience_Associate,required_experience_Director,required_experience_Entry level,...,function_Science,function_Strategy/Planning,function_Supply Chain,function_Training,function_Writing/Editing,telecommuting,has_questions,has_company_logo,salary_min,salary_max
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,0.0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,0.0
2,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0.0,0.0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.0,0.0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0.0,0.0
17876,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0.0,0.0
17877,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
17878,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0.0,0.0


Splitting data

In [13]:
train_indcs, test_indcs = get_train_test_indcs(raw_data, raw_data['fraudulent'],
 test_size=.1, random_state=42, stratify=raw_data['fraudulent'])

# Word2vec for text columns

In [76]:
X_w2v = pd.DataFrame()
for colname in text_colnames:
    TxtTrans = TextTransformer(text_data_ls[colname][train_indcs], vector_size=100, min_count=1)
    df_transformed = TxtTrans.transform_data(column_name=colname, data=text_data_ls[colname])
    print(f'{colname} data successfuly transformed!')
    X_w2v = pd.concat((X_w2v, df_transformed), axis=1)
X_w2v.shape
X_w2v_final = pd.concat((X, X_w2v), axis=1)
X_w2v_final.shape

Transforming company_profile data should take around 3.311111 minutes


  return _methods._mean(a, axis=axis, dtype=dtype,


company_profile data successfuly transformed!
Transforming description data should take around 3.311111 minutes


  return _methods._mean(a, axis=axis, dtype=dtype,


description data successfuly transformed!
Transforming requirements data should take around 3.311111 minutes


  return _methods._mean(a, axis=axis, dtype=dtype,


requirements data successfuly transformed!
Transforming benefits data should take around 3.311111 minutes


  return _methods._mean(a, axis=axis, dtype=dtype,


benefits data successfuly transformed!


(17880, 400)

In [113]:
X_w2v.to_csv('data/X_w2v.csv', index=False)
X_w2v = pd.read_csv('data/X_w2v.csv')

In [117]:
X_w2v_train, X_w2v_test, y_train, y_test = X_w2v_final.iloc[train_indcs], X_w2v_final.iloc[test_indcs], y[train_indcs], y[test_indcs]

clf_w2v = LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr', max_iter=1000).fit(X_w2v_train, y_train)

In [119]:
y_proba = clf_w2v.predict_proba(X_w2v_train)[:, 1]
res = evaluate_performance(y_train, y_proba, threshold=0.1)
print(f'\nLogistic regression performence on TRAIN data:\n {res}')

y_proba = clf_w2v.predict_proba(X_w2v_test)[:, 1]
res = evaluate_performance(y_test, y_proba, threshold=0.1)
print(f'\nLogistic regression performence on TEST data: \n{res}')


Logistic regression performence on TRAIN data:
 {'detection_percentage': 0.6176, 'precision': 0.0908, 'accuracy': 0.6819, 'f1_score': 0.1583, 'auc_roc': 0.6894}

Logistic regression performence on TEST data: 
{'detection_percentage': 0.5376, 'precision': 0.0857, 'accuracy': 0.7002, 'f1_score': 0.1478, 'auc_roc': 0.6535}


In [30]:
X_train, X_test, y_train, y_test = X.iloc[train_indcs], X.iloc[test_indcs], y[train_indcs], y[test_indcs]

In [31]:
clf = LogisticRegression().fit(X_train, y_train)

In [41]:
y_proba = clf.predict_proba(X_test)[:, 1]
res = evaluate_performance(y_test, y_proba, threshold=0.)
res

{'detection_percentage': 1.0,
 'precision': 0.04865771812080537,
 'accuracy': 0.04865771812080537,
 'f1_score': 0.0928,
 'auc_roc': 0.6285349388797665}

# TfidVectorizer

In [139]:
from sklearn.feature_extraction.text import TfidfVectorizer
number_of_vars = 50
X_tfdif = pd.DataFrame()
for colname in text_colnames:
    vectorizer = TfidfVectorizer(max_features=number_of_vars, min_df=2).fit(text_data_str[colname][train_indcs])
    df_transformed = pd.DataFrame(vectorizer.transform(text_data_str[colname]).toarray(), 
    columns=['num_' + colname + '_' + str(nr) for nr in np.arange(number_of_vars)])
    print(f'{colname} data successfuly transformed!')
    X_tfdif = pd.concat((X_tfdif, df_transformed), axis=1)
X_tfdif.shape
X_tfdif_final = pd.concat((X, X_tfdif), axis=1)
X_tfdif_final.shape

company_profile data successfuly transformed!
description data successfuly transformed!
requirements data successfuly transformed!
benefits data successfuly transformed!


(17880, 271)

In [140]:
X_tf_train, X_tf_test, y_train, y_test = X_tfdif_final.iloc[train_indcs], X_tfdif_final.iloc[test_indcs], y[train_indcs], y[test_indcs]

clf_tf = LogisticRegression(solver='liblinear', random_state=42, multi_class='ovr', max_iter=1000).fit(X_tf_train, y_train)

In [141]:
threshold = 0.16
y_proba = clf_tf.predict_proba(X_tf_train)[:, 1]
res = evaluate_performance(y_train, y_proba, threshold=threshold)
print(f'\nLogistic regression performence on TRAIN data:\n {res}')

y_proba = clf_tf.predict_proba(X_tf_test)[:, 1]
res = evaluate_performance(y_test, y_proba, threshold=threshold)
print(f'\nLogistic regression performence on TEST data: \n{res}')


Logistic regression performence on TRAIN data:
 {'detection_percentage': 0.6205, 'precision': 0.1687, 'accuracy': 0.8335, 'f1_score': 0.2653, 'auc_roc': 0.7757}

Logistic regression performence on TEST data: 
{'detection_percentage': 0.5318, 'precision': 0.1479, 'accuracy': 0.8291, 'f1_score': 0.2314, 'auc_roc': 0.7088}


# Google Word2vec

In [143]:
model_google = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [161]:
class GoogleTextTransformer:
    def __init__(self, path_to_model='GoogleNews-vectors-negative300.bin') -> None:
        self.wv = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
        self.vector_size = self.wv.vector_size

    def get_agg_word2vec(self, text, agg_func=np.mean):
        # Get the word2vec representation of each word in the text
        word_vectors = np.array([self.wv[word] for word in text if word in self.wv.index_to_key])
        res = agg_func(word_vectors, axis=0)
        if np.isnan(res).any():
            res = np.zeros(self.vector_size)
        return res

    def transform_data(self, column_name, data=None, verbose=False):
        n = data.shape[0]
        if verbose:
            print(f'Transforming {column_name} data should take around {(n / 90 / 60):3f} minutes')
        X_train_vectors = data.apply(lambda x: self.get_agg_word2vec(x))
        X_train_vectors = np.array(X_train_vectors)
        X_train_vectors = np.vstack(X_train_vectors)
        df_train = pd.DataFrame(X_train_vectors,
         columns=['num_' + column_name + '_' + str(nr) for nr in np.arange(self.vector_size)])
        self.data_transformed = df_train
        return df_train        
    

In [162]:
X_ggl = pd.DataFrame()
GglTxtTrans = GoogleTextTransformer()

In [163]:
df_transformed = GglTxtTrans.transform_data(column_name='description', data=text_data_ls['description'])

  return _methods._mean(a, axis=axis, dtype=dtype,


In [None]:
X_ggl = pd.DataFrame()
GglTxtTrans = GoogleTextTransformer()
for colname in text_colnames:
    df_transformed = GglTxtTrans.transform_data(column_name=colname, data=text_data_ls[colname])
    print(f'{colname} data successfuly transformed!')
    X_ggl = pd.concat((X_ggl, df_transformed), axis=1)
X_ggl.shape
X_ggl_final = pd.concat((X, X_ggl), axis=1)
X_ggl_final.shape