In [1]:

import string
import numpy as np
import pandas as pd
from imblearn import under_sampling
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from collections import Counter

In [2]:
# Load the job-postings dataset; the first CSV column becomes the row index.
df = pd.read_csv('fake_jobs.csv',index_col=0)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,loc,salary_range,has_questions,employment_type,fraudulent,text
0,US,1,0,Other,0,marketing intern were food and weve created a ...
1,NZ,1,0,Full-time,0,customer service cloud video production second...
2,US,1,0,,0,commissioning machinery assistant cma valor se...
3,US,1,0,Full-time,0,account executive washington dc our passion fo...
4,US,1,1,Full-time,0,bill review manager spotsource solutions llc i...


In [4]:
# Replace every missing value in the frame with an empty string.
df = df.fillna(value='')

In [5]:
# Store the label column with an integer dtype.
df = df.assign(fraudulent=df['fraudulent'].astype(int))

In [6]:
# Show the dtype of the label column to confirm the integer cast.
df['fraudulent'].dtype

dtype('int32')

In [7]:
punctuations = string.punctuation


def clean_text(text):
    """Normalise free text before it is vectorised.

    The text is stripped and lower-cased, every ASCII punctuation character
    (``string.punctuation``) and every digit is deleted, and each run of
    whitespace is collapsed to a single space.

    Characters are deleted rather than replaced by a space, so words that are
    separated only by punctuation are fused
    (``"solutions,oil"`` becomes ``"solutionsoil"``).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives the filter.
    """
    lowered = text.strip().lower()
    kept = "".join(
        ch for ch in lowered
        if ch not in string.punctuation and not ch.isdigit()
    )
    # split() without arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator, so re-joining normalises spacing.
    return " ".join(kept.split())


In [8]:
# Hand-written job posting that is appended to the dataset so it goes through
# the same encoding / vectorisation steps as the rest of the rows.
title = 'Sales Executive'
company_profile = 'Aker solution delivers integrated solutions,oil and products and services to customers'
description = 'The job requires ability to work with clients ,knowledge about the product and provide best services to customers'
requirements = 'The sales job requires experience and ability to ensure service, responsible for business and knowledge about product and communication skills'
benefits = 'Aker solution oil and gas industry offers standard environment and help to develop skills , apply for work from home ,ensure knowledge'

# All free-text fields are merged into one string; only the cleaned version
# of it is stored in the frame.
text = ' '.join([title, company_profile, description, requirements, benefits])

has_questions = 0
employment_type = 'Full-time'
loc = 'US'
fraudulent = ''  # placeholder value: no label is supplied for this posting
salary_range = 1

# Build the single-row frame with exactly the columns that are kept, every
# value wrapped in a one-element list.
test_df = pd.DataFrame({
    'fraudulent': [fraudulent],
    'has_questions': [has_questions],
    'employment_type': [employment_type],
    'loc': [loc],
    'salary_range': [salary_range],
    'text': [clean_text(text)],
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# that works on every pandas version.
# NOTE: this rebinds `df`, so re-running the cell appends the row again.
df = pd.concat([df, test_df], ignore_index=True)

In [9]:
# Show the last rows; the row appended from test_df is the final one.
df.tail()

Unnamed: 0,loc,salary_range,has_questions,employment_type,fraudulent,text
17876,US,1,1,Full-time,0.0,payroll accountant weblinc is the ecommerce pl...
17877,US,1,0,Full-time,0.0,project cost control staff engineer cost contr...
17878,NG,1,1,Contract,0.0,graphic designer nemsia studios is looking for...
17879,NZ,1,1,Full-time,0.0,web application developers vend is looking for...
17880,US,1,0,Full-time,,sales executive aker solution delivers integra...


In [10]:
# Integer-encode the two categorical columns. The same encoder object is
# refitted for each column in turn, so afterwards `enc` only holds the
# classes of the last column processed ('loc').
enc = LabelEncoder()
categorical = df.loc[:, ['employment_type', 'loc']]
df.loc[:, ['employment_type', 'loc']] = categorical.apply(
    lambda column: enc.fit_transform(column)
)

In [11]:
# TF-IDF vectoriser whose vocabulary is capped at 2000 terms.
# NOTE(review): `tf` is also the conventional alias for TensorFlow; a more
# specific name would avoid confusion, but the following cell uses this one.
tf = TfidfVectorizer(max_features=2000)

In [12]:
# Vectorise the cleaned text and join the TF-IDF features with the remaining
# (already encoded) columns.
tfidf_matrix = tf.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old method on older releases.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# Give the feature frame df's index so the column-wise concat below pairs
# every row with its own features instead of aligning on a fresh 0..n-1 index.
df1 = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=df.index)

# The raw text is no longer needed once it has been vectorised.
df = df.drop(columns=['text'])
main_df = pd.concat([df1, df], axis=1)

In [13]:
# Show the last rows of the combined frame (TF-IDF columns first, then the
# original columns).
main_df.tail()

Unnamed: 0,abilities,ability,able,about,above,abroad,academic,accept,access,accordance,...,youre,yourself,zealand,και,να,loc,salary_range,has_questions,employment_type,fraudulent
17876,0.0,0.03585,0.020345,0.036791,0.033535,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,86,1,1,2,0.0
17877,0.0,0.0,0.029317,0.026508,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,86,1,0,2,0.0
17878,0.0,0.0,0.121573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,57,1,1,1,0.0
17879,0.0,0.0,0.0,0.017438,0.031789,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,61,1,1,2,0.0
17880,0.0,0.16577,0.0,0.170122,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,86,1,0,2,


In [14]:
# The rows to score were appended after the dataset rows, so they are the
# last len(test_df) rows of main_df. Deriving the boundary from the data
# keeps the split correct if the CSV ever has a different number of rows.
n_train = len(main_df) - len(test_df)

# The label is the last column of main_df; every other column is a feature.
y_train = main_df.iloc[:n_train, -1]

x_train = main_df.iloc[:n_train, :-1]
x_test = main_df.iloc[n_train:, :-1]

In [15]:
# Display the feature row(s) that will be passed to the model for prediction.
x_test

Unnamed: 0,abilities,ability,able,about,above,abroad,academic,accept,access,accordance,...,your,youre,yourself,zealand,και,να,loc,salary_range,has_questions,employment_type
17880,0.0,0.16577,0.0,0.170122,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,86,1,0,2


In [16]:
# Convert the pandas objects to plain NumPy arrays: float features and
# integer labels.
x_train, x_test = (
    np.asarray(features).astype('float') for features in (x_train, x_test)
)
y_train = np.asarray(y_train).astype('int')


In [17]:
# Randomly under-sample the training set with a fixed seed so the selection
# is repeatable.
sm = under_sampling.RandomUnderSampler(
    sampling_strategy='auto',
    random_state=42,
)
x_train_under_sample, y_train_under_sample = sm.fit_resample(x_train, y_train)

In [18]:
# Make sure both resampled outputs are NumPy arrays before they reach Keras
# (np.asarray returns an existing ndarray unchanged).
x_train_under_sample, y_train_under_sample = (
    np.asarray(x_train_under_sample),
    np.asarray(y_train_under_sample),
)

In [19]:
# Feed-forward binary classifier: two tanh hidden layers (150 and 100 units)
# followed by a single sigmoid output unit.
model = Sequential([
    Dense(units=150, activation='tanh'),
    Dense(units=100, activation='tanh'),
    Dense(units=1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# No validation data is passed, so the accuracy logged for each epoch is
# measured on the training data only.
model.fit(
    x=x_train_under_sample,
    y=y_train_under_sample,
    batch_size=20,
    epochs=35,
    shuffle=True,
    verbose=2,
)

Epoch 1/35
87/87 - 2s - loss: 0.5908 - accuracy: 0.6478
Epoch 2/35
87/87 - 0s - loss: 0.3063 - accuracy: 0.8782
Epoch 3/35
87/87 - 0s - loss: 0.1865 - accuracy: 0.9261
Epoch 4/35
87/87 - 0s - loss: 0.1092 - accuracy: 0.9607
Epoch 5/35
87/87 - 0s - loss: 0.0612 - accuracy: 0.9798
Epoch 6/35
87/87 - 0s - loss: 0.0411 - accuracy: 0.9850
Epoch 7/35
87/87 - 0s - loss: 0.0498 - accuracy: 0.9804
Epoch 8/35
87/87 - 0s - loss: 0.0115 - accuracy: 0.9988
Epoch 9/35
87/87 - 0s - loss: 0.0073 - accuracy: 0.9994
Epoch 10/35
87/87 - 0s - loss: 0.0037 - accuracy: 1.0000
Epoch 11/35
87/87 - 0s - loss: 0.0025 - accuracy: 1.0000
Epoch 12/35
87/87 - 0s - loss: 0.0019 - accuracy: 1.0000
Epoch 13/35
87/87 - 0s - loss: 0.0013 - accuracy: 1.0000
Epoch 14/35
87/87 - 0s - loss: 0.0012 - accuracy: 1.0000
Epoch 15/35
87/87 - 0s - loss: 9.9040e-04 - accuracy: 1.0000
Epoch 16/35
87/87 - 0s - loss: 8.6681e-04 - accuracy: 1.0000
Epoch 17/35
87/87 - 0s - loss: 6.7613e-04 - accuracy: 1.0000
Epoch 18/35
87/87 - 0s - los

<tensorflow.python.keras.callbacks.History at 0x29e29f6e580>

In [20]:
# model.predict returns one row per input row; each row is unpacked as a
# single value (the sigmoid output) and rounded to a 0/1 class label.
# Python's round() rounds halves to the even neighbour, so exactly 0.5 maps to 0.
probabilities = model.predict(x_test)
pred = [int(round(probability)) for (probability,) in probabilities]
print(pred)

[1]
