In [1]:
# import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as tf_hub
from tensorflow.keras.models import load_model

# import preprocessing
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

2024-07-16 16:00:02.988096: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# TF Hub handle of the 128-dimensional English NNLM text-embedding module
url = 'https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1'

# wrap the module as a Keras layer: scalar string inputs, 128-wide outputs
hub_layer = tf_hub.KerasLayer(
    url,
    input_shape=[],
    dtype=tf.string,
    output_shape=[128],
)

In [3]:
# load the saved model
# custom_objects maps a serialized class name to the *class* (not an instance)
# that Keras should use to rebuild that layer from its stored config
model_nlp = load_model(
    'improve_model_lstm.keras',
    custom_objects={'KerasLayer': tf_hub.KerasLayer},
)

In [4]:
# create a function for text removal
def text_removal(text):
    '''
    Lowercase the text and blank out content that is not plain words.

    Every pattern listed below is replaced by a single space, in the order
    given; the result is then trimmed of leading and trailing whitespace.
    '''
    # applied top to bottom -- the order matters (e.g. '&amp;' must be handled
    # before punctuation is stripped)
    patterns = (
        r"#\w+",          # hashtags
        r"\n",            # newlines
        r"http\S+",       # text starting with 'http' up to the next whitespace
        r"www\S+",        # text starting with 'www' up to the next whitespace
        r"&amp;",         # HTML-escaped ampersand
        r"[^\w\s]",       # punctuation
        r"[^A-Za-z\s']",  # non-letters (digits, emoticons, symbols such as μ, $, 兀)
        r"\s+",           # runs of whitespace collapse into one space
    )

    cleaned = text.lower()
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)

    return cleaned.strip()

In [5]:
# create a function for stopwords removal
def stopwords_removal(text):
    '''
    Remove English stopwords and a few custom job-related words from the text.

    The text is tokenized with NLTK; any token whose lowercase form is in the
    stopword set is dropped, and the remaining tokens are joined with spaces.
    '''
    # NLTK's English stopwords extended with custom words
    blocked = set(stopwords.words('english')) | {
        'job', 'jobs', 'position', 'positions', 'career', 'careers',
    }

    kept = []
    for token in nltk.word_tokenize(text):
        if token.lower() in blocked:
            continue
        kept.append(token)

    return ' '.join(kept)

In [6]:
# create a function for lemmatization
def lemmatization(text):
    '''
    Lemmatize each token of the text with NLTK's WordNetLemmatizer.

    The text is tokenized with NLTK, every token is passed through the
    lemmatizer, and the results are joined back with single spaces.
    '''
    to_lemma = WordNetLemmatizer().lemmatize
    return ' '.join(map(to_lemma, nltk.word_tokenize(text)))

In [7]:
# combining tokens from preprocess
def text_preprocessing(text):
    '''
    Run the whole preprocessing pipeline on one text and return the result.

    Steps, in order: text removal, stopwords removal, lemmatization.
    '''
    for step in (text_removal, stopwords_removal, lemmatization):
        text = step(text)
    return text

In [8]:
# create new data: one job posting to run inference on
posting_text = '''
    ABC Corporation is a pioneering leader in the healthcare technology sector, dedicated to revolutionizing patient care through innovative solutions. Established in 2005, our company has consistently delivered cutting-edge medical devices and software that elevate healthcare standards worldwide.
    ABC Corporation is seeking a dynamic and experienced Business Partnership Manager to expand our strategic alliances and foster growth opportunities. As a Business Partnership Manager, you will be responsible for developing and managing partnerships with key stakeholders, including healthcare providers, technology partners, and strategic alliances.
    Bachelor's degree in Business Administration, Marketing, or a related field. MBA preferred. Proven track record of success in business development or partnership management, preferably in the healthcare or technology sector. Strong negotiation and interpersonal skills with the ability to build and maintain relationships at all levels. Strategic thinker with a clear understanding of market dynamics and business trends. Excellent communication and presentation skills.
    '''

# single-row frame with the raw posting in the 'text' column
df_inf = pd.DataFrame({'text': [posting_text]})
df_inf

Unnamed: 0,text
0,\n ABC Corporation is a pioneering leader i...


In [9]:
# run the preprocessing pipeline on every raw text and store it in a new column
df_inf['text_processed'] = df_inf['text'].map(text_preprocessing)
df_inf

Unnamed: 0,text,text_processed
0,\n ABC Corporation is a pioneering leader i...,abc corporation pioneering leader healthcare t...


In [10]:
# predict for binary classification
# outputs at or above the threshold are labelled 1, everything else 0
THRESHOLD = 0.5

# keep the raw model outputs under their own name so thresholding does not overwrite them
predict_proba = model_nlp.predict(df_inf['text_processed'])
predict = np.where(predict_proba >= THRESHOLD, 1, 0)

# flatten so each iteration compares a scalar label rather than a 1-element array
for label in predict.ravel():
    if label == 0:
        print('The job posting is real.')
    else:
        print('The job posting is fake.')

The job posting is real.
