<a href="https://colab.research.google.com/github/dqminhv/fraudulent-job-posting-detection-with-NLP/blob/main/notebook/data-preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import required packages

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from wordcloud import WordCloud
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [15]:
# Load the US job-posting dataset directly from the project's GitHub repository.
data_url = 'https://raw.githubusercontent.com/dqminhv/fraudulent-job-posting-detection-with-NLP/main/Data/job-posting-us.csv'
job_posting_us_df = pd.read_csv(data_url)

In [4]:
# Fetch the NLTK resources this notebook relies on, in the same order as before.
nltk_resources = ('stopwords', 'punkt', 'wordnet')
download_results = [nltk.download(resource) for resource in nltk_resources]
# Last expression of the cell: shows the status returned by the final download.
download_results[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Create a text cleaning function

In [5]:
#Create a function to clean the text feature
stop=set(stopwords.words("english"))

# The patterns and the lemmatizer are built once here instead of on every
# call, because clean() is applied to every row of the dataset.
# Each pattern is replaced by a single space, in this order.
_CLEAN_PATTERNS = (
    re.compile(r"<.*?>"),                     #html tags
    re.compile(r"https://\S+|http://\S+"),    #urls
    re.compile(r"[^\w\s]"),                   #punctuation
    re.compile(r"\d{1,}"),                    #digits
    re.compile(r"_+"),                        #underscores
)
_lemmatizer = WordNetLemmatizer()

def clean(text):
    """Normalize a raw text string into a space-separated string of lemmas.

    Steps: lowercase the text; replace HTML tags, URLs, punctuation, digits
    and underscores with spaces; split on whitespace; drop single-character
    tokens and English stopwords; lemmatize what is left with
    WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example NaN coming from a missing
        cell) is treated as empty text instead of raising on ``.lower()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    for pattern in _CLEAN_PATTERNS:
        text = pattern.sub(" ", text)
    # Single-character tokens are filtered here rather than with the regex
    # r"\s\w\s": that pattern missed one-letter tokens at the very start or
    # end of the text and every second one in a run such as " a b c ",
    # because adjacent matches would have to share a space.
    # str.split() also takes care of repeated whitespace.
    tokens = [
        _lemmatizer.lemmatize(word)
        for word in text.split()
        if len(word) > 1 and word not in stop
    ]
    return " ".join(tokens)

# Convert integer flag variables to text

In [17]:
# Replace each 0/1 indicator column with a descriptive word so that it can be
# merged into the combined text feature.
# Every label is a single token without punctuation: clean() turns "-" into a
# space and then drops stopwords such as "no" and "has", so hyphenated labels
# like 'has-logo' / 'no-logo' would both collapse to the same token ("logo")
# and the flag would carry no information after cleaning.
flag_labels = {
    'telecommuting': {1: 'telecommuting', 0: 'notelecommuting'},
    'has_company_logo': {1: 'haslogo', 0: 'nologo'},
    'has_questions': {1: 'hasquestions', 0: 'noquestions'},
}

# Series.replace builds a new column instead of writing strings into an
# integer column through .loc, which pandas has deprecated (setting a value
# of an incompatible dtype). Values other than 0/1 are left untouched, so
# re-running this cell is harmless.
for column, labels in flag_labels.items():
    job_posting_us_df[column] = job_posting_us_df[column].replace(labels)

# Create a text feature by combining all the columns

In [18]:
#Combine all features into one column
# Exclude the target and the 'text' column itself: with 'text' in the list the
# last iteration appended the whole combined text to itself, doubling it
# (and the column may already exist if this cell is re-run).
feature_columns = [
    col for col in job_posting_us_df.columns if col not in ('fraudulent', 'text')
]

combined_text = pd.Series("", index=job_posting_us_df.index)
for col in feature_columns:
    values = job_posting_us_df[col]
    # Cast to str so non-string columns can be concatenated, and use "" for
    # missing cells so a single NaN does not turn the whole row's text into NaN.
    combined_text = combined_text + " " + values.astype(str).where(values.notna(), "")

job_posting_us_df["text"] = combined_text

In [20]:
# Store the cleaned text back in the column: previously the result of
# apply(clean) was only displayed and then discarded, so the features taken
# from job_posting_us_df['text'] afterwards were still the raw, uncleaned text.
job_posting_us_df['text'] = job_posting_us_df['text'].apply(clean)
job_posting_us_df['text']

0       marketing intern u ny new york marketing speci...
1       account executive washington dc u dc washingto...
2       bill review manager u fl fort worth specified ...
3       customer service associate part time u az phoe...
4       asp net developer job opportunity united state...
                              ...                        
6747    post production editor u ia west de moines spe...
6748    senior financial analyst retail u ca gardena s...
6749    product manager u ca san francisco product dev...
6750    javascript developer u specified specified sr ...
6751    payroll accountant u pa philadelphia accountin...
Name: text, Length: 6752, dtype: object


# Splitting Data into Training and Testing Sets

In [23]:
# Features: the combined text column; target: the fraud label.
X, y = job_posting_us_df['text'], job_posting_us_df['fraudulent']

In [26]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)