## Download Packages

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

## Import Libraries

In [1]:
## import libraries
import pandas as pd
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
## read the data
df_raw = pd.read_csv('../Data/Raw/Job_Frauds.csv', encoding='latin-1')

## Data Inspection

In [3]:
# inspect the data
df = df_raw
print(df.info())   # Check data types & missing values
print(df.head())   # View first few rows
print(df['Fraudulent'].value_counts())  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Title           17880 non-null  object
 1   Job Location        17534 non-null  object
 2   Department          6333 non-null   object
 3   Range_of_Salary     2868 non-null   object
 4   Profile             14572 non-null  object
 5   Job_Description     17879 non-null  object
 6   Requirements        15185 non-null  object
 7   Job_Benefits        10670 non-null  object
 8   Telecomunication    17880 non-null  int64 
 9   Comnpany_Logo       17880 non-null  int64 
 10  Type_of_Employment  14409 non-null  object
 11  Experience          10830 non-null  object
 12  Qualification       9775 non-null   object
 13  Type_of_Industry    12977 non-null  object
 14  Operations          11425 non-null  object
 15  Fraudulent          17880 non-null  int64 
dtypes: int64(3), object(13

In [4]:
# check for missing values
print(df.isnull().sum())

Job Title                 0
Job Location            346
Department            11547
Range_of_Salary       15012
Profile                3308
Job_Description           1
Requirements           2695
Job_Benefits           7210
Telecomunication          0
Comnpany_Logo             0
Type_of_Employment     3471
Experience             7050
Qualification          8105
Type_of_Industry       4903
Operations             6455
Fraudulent                0
dtype: int64


In [5]:
# check if missing values correlates with whether job is fraudulent
print(df[df['Department'].isnull()]['Fraudulent'].value_counts())
print(df[df['Range_of_Salary'].isnull()]['Fraudulent'].value_counts())
print(df[df['Profile'].isnull()]['Fraudulent'].value_counts())
print(df[df['Requirements'].isnull()]['Fraudulent'].value_counts())
print(df[df['Job_Benefits'].isnull()]['Fraudulent'].value_counts())
print(df[df['Type_of_Employment'].isnull()]['Fraudulent'].value_counts())
print(df[df['Experience'].isnull()]['Fraudulent'].value_counts())
print(df[df['Qualification'].isnull()]['Fraudulent'].value_counts())
print(df[df['Type_of_Industry'].isnull()]['Fraudulent'].value_counts())
print(df[df['Operations'].isnull()]['Fraudulent'].value_counts())


0    11016
1      531
Name: Fraudulent, dtype: int64
0    14369
1      643
Name: Fraudulent, dtype: int64
0    2721
1     587
Name: Fraudulent, dtype: int64
0    2541
1     154
Name: Fraudulent, dtype: int64
0    6846
1     364
Name: Fraudulent, dtype: int64
0    3230
1     241
Name: Fraudulent, dtype: int64
0    6615
1     435
Name: Fraudulent, dtype: int64
0    7654
1     451
Name: Fraudulent, dtype: int64
0    4628
1     275
Name: Fraudulent, dtype: int64
0    6118
1     337
Name: Fraudulent, dtype: int64


## Preprocessing

In [None]:
#Split 'Job Location' into 'Country', 'State', 'City'
split_location = df['Job Location'].str.split(', ', expand=True)
df['Country'] = split_location[0]  
df['State'] = split_location[1] 
df['City'] = split_location[2] 
df = df.drop(columns=['Job Location'])

In [8]:
text_cols = ['Job Title','Profile', 'Job_Description', 'Requirements', 'Job_Benefits','Type_of_Employment','Experience','Qualification',
        'Department', 'Type_of_Industry', 'Operations','Country','State','City']
df[text_cols] = df[text_cols].fillna('unknown')

In [9]:
df.head()

Unnamed: 0,Job Title,Department,Range_of_Salary,Profile,Job_Description,Requirements,Job_Benefits,Telecomunication,Comnpany_Logo,Type_of_Employment,Experience,Qualification,Type_of_Industry,Operations,Fraudulent,Country,State,City
0,Marketing Intern,Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,unknown,0,1,Other,Internship,unknown,unknown,Marketing,0,US,NY,New York
1,Customer Service - Cloud Video Production,Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,Full-time,Not Applicable,unknown,Marketing and Advertising,Customer Service,0,NZ,,Auckland
2,Commissioning Machinery Assistant (CMA),unknown,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,unknown,0,1,unknown,unknown,unknown,unknown,unknown,0,US,IA,Wever
3,Account Executive - Washington DC,Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI â Environmental Systems Re...,"EDUCATION:Â Bachelorâs or Masterâs in GIS,...",Our culture is anything but corporateâwe hav...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,DC,Washington
4,Bill Review Manager,unknown,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL,Fort Worth


In [10]:
## convert strings to lower case
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [11]:
## remove punctuations and symbols
df = df.replace(to_replace=r'[^\w\s]', value='', regex=True)

In [12]:
## function to replace accents (eg. áéíóú -> aeiou)
def strip_accents(text):
    return ''.join(char for char in
                   unicodedata.normalize('NFKD', text)
                   if unicodedata.category(char) != 'Mn')

## replace accents
for col in text_cols:
    # replace accent
    df[col] = df[col].apply(lambda x: [strip_accents(x)])
    df[col] = df[col].apply(lambda x: " ".join(x))

In [13]:
df.head()

Unnamed: 0,Job Title,Department,Range_of_Salary,Profile,Job_Description,Requirements,Job_Benefits,Telecomunication,Comnpany_Logo,Type_of_Employment,Experience,Qualification,Type_of_Industry,Operations,Fraudulent,Country,State,City
0,marketing intern,marketing,,were food52 and weve created a groundbreaking ...,food52 a fastgrowing james beard awardwinning ...,experience with content management systems a m...,unknown,0,1,other,internship,unknown,unknown,marketing,0,us,ny,new york
1,customer service cloud video production,success,,90 seconds the worlds cloud video production s...,organised focused vibrant awesomedo you hav...,what we expect from youyour key responsibility...,what you will get from usthrough being part of...,0,1,fulltime,not applicable,unknown,marketing and advertising,customer service,0,nz,,auckland
2,commissioning machinery assistant cma,unknown,,valor services provides workforce solutions th...,our client located in houston is actively seek...,implement precommissioning and commissioning p...,unknown,0,1,unknown,unknown,unknown,unknown,unknown,0,us,ia,wever
3,account executive washington dc,sales,,our passion for improving quality of life thro...,the company esri a environmental systems resea...,educationa bacheloras or masteras in gis busin...,our culture is anything but corporateawe have ...,0,1,fulltime,midsenior level,bachelors degree,computer software,sales,0,us,dc,washington
4,bill review manager,unknown,,spotsource solutions llc is a global human cap...,job title itemization review managerlocation f...,qualificationsrn license in the state of texas...,full benefits offered,0,1,fulltime,midsenior level,bachelors degree,hospital health care,health care provider,0,us,fl,fort worth


In [14]:
## tokenize, remove stop words, and lemmatize for columns 'Profile', 'Job_Description', 'Requirements', 'Job_Benefits','Job Title', 'Department', 'Type_of_Industry', 'Operations'
# initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()    
cols = ['Profile', 'Job_Description', 'Requirements', 'Job_Benefits','Job Title', 'Department', 'Type_of_Industry', 'Operations']

In [15]:
for col in cols:
    # apply tokenization 
    df[col] = df[col].apply(lambda x: word_tokenize(x) if isinstance(x, str) and x.strip() != '' else [])
    # remove stop words
    df[col] = df[col].apply(lambda x: [word for word in x if word not in stop_words])
    # apply lemmatization
    df[col] = df[col].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    
print(df.head())

                                       Job Title   Department Range_of_Salary  \
0                            [marketing, intern]  [marketing]             NaN   
1  [customer, service, cloud, video, production]    [success]             NaN   
2     [commissioning, machinery, assistant, cma]    [unknown]             NaN   
3           [account, executive, washington, dc]       [sale]             NaN   
4                        [bill, review, manager]    [unknown]             NaN   

                                             Profile  \
0  [food52, weve, created, groundbreaking, awardw...   
1  [90, second, world, cloud, video, production, ...   
2  [valor, service, provides, workforce, solution...   
3  [passion, improving, quality, life, geography,...   
4  [spotsource, solution, llc, global, human, cap...   

                                     Job_Description  \
0  [food52, fastgrowing, james, beard, awardwinni...   
1  [organised, focused, vibrant, awesomedo, passi...   
2  [clie

In [16]:
for col in cols:
    df[col] = df[col].apply(lambda x: " ".join(x))
print(df.head())

                                 Job Title Department Range_of_Salary  \
0                         marketing intern  marketing             NaN   
1  customer service cloud video production    success             NaN   
2    commissioning machinery assistant cma    unknown             NaN   
3          account executive washington dc       sale             NaN   
4                      bill review manager    unknown             NaN   

                                             Profile  \
0  food52 weve created groundbreaking awardwinnin...   
1  90 second world cloud video production service...   
2  valor service provides workforce solution meet...   
3  passion improving quality life geography heart...   
4  spotsource solution llc global human capital m...   

                                     Job_Description  \
0  food52 fastgrowing james beard awardwinning on...   
1  organised focused vibrant awesomedo passion cu...   
2  client located houston actively seeking experi...   


In [17]:
# check if data is imbalanced
print(df['Fraudulent'].value_counts())

0    17014
1      866
Name: Fraudulent, dtype: int64


In [133]:
## save as csv
df.to_csv('../Data/Processed/processed_df.csv', index=False)