## Real Vs Fake Job Prediction

In [None]:
# imports
import numpy as np
import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split


In [None]:
!pip install -q -U "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 15.1 MB/s 
[K     |████████████████████████████████| 462 kB 58.9 MB/s 
[?25h

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import matplotlib.pyplot as plt
import time
# Set the TensorFlow logger's level to ERROR so lower-severity messages are not emitted.
tf.get_logger().setLevel('ERROR')

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset path used
# later in the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the raw postings CSV on the mounted Drive.
path = '/content/drive/MyDrive/AML Group Project'
foldername = 'data'
filename = 'fake_job_postings.csv'

# Build the full file path once, then load the CSV into the original frame.
csv_path = os.path.join(path, foldername, filename)
postings_orig = pd.read_csv(csv_path)

# Leave the frame as the last expression so the notebook renders it.
postings_orig


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,"CA, ON, Toronto",Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,17877,Payroll Accountant,"US, PA, Philadelphia",Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,17878,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0
17878,17879,Graphic Designer,"NG, LA, Lagos",,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [None]:
# salary_range and department are removed because too many of their values are missing;
# job_id is removed because it is irrelevant.
# drop() returns a new frame, so postings_orig itself is left unchanged.
dropped_columns = ['salary_range', 'department', 'job_id']
postings = postings_orig.drop(columns=dropped_columns)


In [None]:
# Dataset size and class balance: summing the label column counts the
# fraudulent rows, and the remainder are the legitimate ones.
num_rows = postings.shape[0]
num_fraud = postings['fraudulent'].sum()
print('Shape: ', postings.shape)
print('Number of fraudulent examples:', num_fraud)
print('Number of legitimate examples:', num_rows - num_fraud)


Shape:  (17880, 15)
Number of fraudulent examples: 866
Number of legitimate examples: 17014


In [None]:
# TF Hub handles for the BERT encoder and for the matching text-preprocessing model.
# NOTE: both handles are hard-coded in this cell; nothing here selects the
# preprocess model automatically, despite the wording of the second message.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

# Echo the chosen handles so they are recorded in the notebook output.
print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [None]:
# Wrap the two TF Hub handles as Keras layers: one turns raw text into the
# encoder's inputs, the other is the BERT encoder itself.
# (tensorflow_hub is already imported in an earlier cell; this import is redundant.)
import tensorflow_hub as hub
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Labels are the 'fraudulent' column; features are every other column.
postings_y = postings['fraudulent']
postings_X = postings.drop(columns=['fraudulent'])

# Random 80/20 development/test split with a fixed seed.
# Sampling is to be applied later.
X_dev_orig, X_test_orig, y_dev, y_test = train_test_split(
    postings_X,
    postings_y,
    test_size=0.2,
    random_state=42,
)


In [None]:
def preprocessing_string(text_test):
  """Run text through the BERT preprocessing layer and then the BERT encoder.

  Args:
    text_test: input passed unchanged to `bert_preprocess_model`
      (the notebook calls this with a one-element list of strings).

  Returns:
    The value returned by `bert_model` for the preprocessed input.
  """
  return bert_model(bert_preprocess_model(text_test))

In [None]:
# Concatenate the four free-text fields into one space-separated document per posting.
# Missing fields are replaced with '' first: in pandas, string + NaN is NaN, so a
# single missing field would otherwise turn the whole combined text into NaN
# (and NaN is not valid input for the BERT preprocessing layer).
text_columns = ['company_profile', 'description', 'requirements', 'benefits']
postings_X['combined_text'] = postings_X[text_columns].fillna('').agg(' '.join, axis=1)


In [None]:
# Embed every posting's combined text with BERT, one posting per call, and keep
# only the "pooled_output" entry of each result.
# Progress is reported every PROGRESS_EVERY rows (and at the end) instead of once
# per row, which previously flooded the cell output with one line per posting.
PROGRESS_EVERY = 500

descriptions_output = []
combined_texts = postings_X['combined_text']
total_texts = len(combined_texts)
loop_start = time.time()
for i, descriptions in enumerate(combined_texts, start=1):
  descriptions_result = preprocessing_string([descriptions])
  descriptions_output.append(descriptions_result["pooled_output"])
  if i % PROGRESS_EVERY == 0 or i == total_texts:
    print(i, "of", total_texts, "Time Taken: ", time.time() - loop_start)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1258 Time Taken:  0.15648245811462402
1259 Time Taken:  0.15032744407653809
1260 Time Taken:  0.1491410732269287
1261 Time Taken:  0.14704084396362305
1262 Time Taken:  0.14745187759399414
1263 Time Taken:  0.14945578575134277
1264 Time Taken:  0.15136504173278809
1265 Time Taken:  0.14621543884277344
1266 Time Taken:  0.15034985542297363
1267 Time Taken:  0.15456295013427734
1268 Time Taken:  0.14867830276489258
1269 Time Taken:  0.1445167064666748
1270 Time Taken:  0.14664769172668457
1271 Time Taken:  0.1499171257019043
1272 Time Taken:  0.14925026893615723
1273 Time Taken:  0.14838695526123047
1274 Time Taken:  0.1481952667236328
1275 Time Taken:  0.14855265617370605
1276 Time Taken:  0.14753103256225586
1277 Time Taken:  0.15729188919067383
1278 Time Taken:  0.15886831283569336
1279 Time Taken:  0.1597881317138672
1280 Time Taken:  0.155778169631958
1281 Time Taken:  0.14776849746704102
1282 Time Taken:  0.1482577323

In [None]:
# Stack the collected pooled outputs into a single 2-D array, one row per posting.
# The row count is taken from the number of collected outputs instead of a
# hard-coded 1000 (which raises unless exactly 1000 postings were embedded),
# and -1 lets NumPy infer the embedding width.
num_embeddings = len(descriptions_output)
descriptions_output = np.asarray(descriptions_output).reshape((num_embeddings, -1))
print(descriptions_output.shape)

# Save the embedding matrix as alltext.npy in the data folder.
np.save(os.path.join(path, foldername, 'alltext.npy'), descriptions_output)