In [19]:
## download packages
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')


/bin/bash: pip: command not found


In [2]:
## import libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.under_sampling import RandomUnderSampler

In [3]:
## read the data
# Load the raw CSV into `df_raw`; the file is decoded as latin-1.
df_raw = pd.read_csv(
    'Data/Raw/Job_Frauds.csv',
    encoding='latin-1',
)

In [4]:
# inspect the data
# Work on an independent copy: with a plain `df = df_raw` both names point at
# the same object, so any later in-place edit of `df` (such as adding a column)
# would silently change `df_raw` as well.
df = df_raw.copy()
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()          # Check data types & missing values
print(df.head())   # View first few rows
print(df['Fraudulent'].value_counts())  # Row count per value of the 'Fraudulent' column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job Title           17880 non-null  object
 1   Job Location        17534 non-null  object
 2   Department          6333 non-null   object
 3   Range_of_Salary     2868 non-null   object
 4   Profile             14572 non-null  object
 5   Job_Description     17879 non-null  object
 6   Requirements        15184 non-null  object
 7   Job_Benefits        10668 non-null  object
 8   Telecomunication    17880 non-null  int64 
 9   Comnpany_Logo       17880 non-null  int64 
 10  Type_of_Employment  14409 non-null  object
 11  Experience          10830 non-null  object
 12  Qualification       9775 non-null   object
 13  Type_of_Industry    12977 non-null  object
 14  Operations          11425 non-null  object
 15  Fraudulent          17880 non-null  int64 
dtypes: int64(3), object(13

In [5]:
# check for missing values
# Per-column count of null entries (isna is the same check as isnull).
print(df.isna().sum())

Job Title                 0
Job Location            346
Department            11547
Range_of_Salary       15012
Profile                3308
Job_Description           1
Requirements           2696
Job_Benefits           7212
Telecomunication          0
Comnpany_Logo             0
Type_of_Employment     3471
Experience             7050
Qualification          8105
Type_of_Industry       4903
Operations             6455
Fraudulent                0
dtype: int64


In [10]:
# check if missing values correlates with whether job is fraudulent
# For each listed column, print the 'Fraudulent' value counts restricted to the
# rows where that column is null. The order matches the printed output blocks.
for column_name in (
    'Department',
    'Range_of_Salary',
    'Profile',
    'Requirements',
    'Job_Benefits',
    'Type_of_Employment',
    'Experience',
    'Qualification',
    'Type_of_Industry',
    'Operations',
):
    rows_missing = df[column_name].isnull()
    print(df.loc[rows_missing, 'Fraudulent'].value_counts())


Fraudulent
0    11016
1      531
Name: count, dtype: int64
Fraudulent
0    14369
1      643
Name: count, dtype: int64
Fraudulent
0    2721
1     587
Name: count, dtype: int64
Fraudulent
0    2542
1     154
Name: count, dtype: int64
Fraudulent
0    6848
1     364
Name: count, dtype: int64
Fraudulent
0    3230
1     241
Name: count, dtype: int64
Fraudulent
0    6615
1     435
Name: count, dtype: int64
Fraudulent
0    7654
1     451
Name: count, dtype: int64
Fraudulent
0    4628
1     275
Name: count, dtype: int64
Fraudulent
0    6118
1     337
Name: count, dtype: int64


In [None]:
# We can't simply drop rows with missing values, as they account for a significant portion of the fraudulent cases.
# The presence of missing values may correlate with whether a job posting is fraudulent
# Have to check correlation when doing EDA/feature engineering

In [3]:
## split 'Job Location' into 'Country', 'State', 'City'
# n=2 caps the split at three parts, so any text after a third ", " stays in
# 'City' instead of landing in extra columns that are never read.
# reindex guarantees columns 0, 1 and 2 exist (filled with NaN) even when no
# row contains three parts, which would otherwise raise a KeyError below.
split_location = (
    df['Job Location']
    .str.split(', ', n=2, expand=True)
    .reindex(columns=range(3))
)
# Build a new frame rather than assigning columns in place, so whatever object
# `df` referred to before this cell is left untouched.
df = df.drop(columns=['Job Location']).assign(
    Country=split_location[0],
    State=split_location[1],
    City=split_location[2],
)

In [4]:
## convert strings to lower case
# DataFrame.applymap is deprecated in favour of the element-wise DataFrame.map
# (added in pandas 2.1). Use `map` when this pandas provides it and fall back
# to `applymap` on older versions so the cell runs on both.
_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
# Only str values are lower-cased; every other value (numbers, NaN) is returned as is.
df = _elementwise(lambda value: value.lower() if isinstance(value, str) else value)

  df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [5]:
## remove punctuations and symbols
# Delete every character that is neither a word character (\w) nor whitespace
# (\s). The regex replacement is applied across the whole frame.
# NOTE(review): this is not limited to free-text columns. If 'Range_of_Salary'
# holds "min-max" style strings, the separator is removed and both bounds fuse
# into one number - confirm this is intended.
df = df.replace(
    regex=True,
    to_replace=r'[^\w\s]',
    value='',
)

In [6]:
## tokenize, remove stop words, and lemmatize for columns 'Profile', 'Job_Description', 'Requirements', 'Job_Benefits'
# Shared resources for the text-cleaning step.

# columns whose text will be processed
cols = [
    'Profile',
    'Job_Description',
    'Requirements',
    'Job_Benefits',
]

# WordNet-based lemmatizer instance reused for every token
lemmatizer = WordNetLemmatizer()

# English stop words held in a set for fast membership tests
# NOTE(review): punctuation is stripped from the frame before these are used,
# so stop words that contain an apostrophe presumably no longer match their
# punctuation-free form in the text - verify.
stop_words = {*stopwords.words('english')}

In [8]:
def tokenize_and_lemmatize(text):
    """Turn one cell of raw text into a list of cleaned tokens.

    Parameters
    ----------
    text : object
        The cell value. Expected to be a str on the first run of this cell,
        a list if the cell has already been processed, or a missing value.

    Returns
    -------
    list
        For a non-blank str: its ``word_tokenize`` tokens with ``stop_words``
        removed, each passed through ``lemmatizer.lemmatize``.
        For a list: the same list, unchanged.
        For anything else, or a blank str: an empty list.
    """
    # A list means this value was already tokenized by a previous run of the
    # cell. Returning it as is keeps the cell idempotent; without this guard a
    # re-run would replace every processed value with [].
    if isinstance(text, list):
        return text
    if not isinstance(text, str) or text.strip() == '':
        return []
    # Same order as before: tokenize, drop stop words, then lemmatize.
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]


for col in cols:
    # a single pass per column instead of three separate apply() sweeps
    df[col] = df[col].apply(tokenize_and_lemmatize)

print(df)

                                               Job Title   Department  \
0                                       marketing intern    marketing   
1               customer service  cloud video production      success   
2                  commissioning machinery assistant cma          NaN   
3                       account executive  washington dc        sales   
4                                    bill review manager          NaN   
...                                                  ...          ...   
17875                    account director  distribution         sales   
17876                                 payroll accountant   accounting   
17877  project cost control staff engineer  cost cont...          NaN   
17878                                   graphic designer          NaN   
17879                         web application developers  engineering   

      Range_of_Salary Profile  \
0                 NaN      []   
1                 NaN      []   
2                 NaN   

In [11]:
# check if data is imbalanced
# Number of rows for each value of the 'Fraudulent' column.
fraud_counts = df['Fraudulent'].value_counts()
print(fraud_counts)

Fraudulent
0    17014
1      866
Name: count, dtype: int64


In [23]:
# conduct undersampling since data is imbalanced
# Separate the 'Fraudulent' column from the remaining columns.
features = df.drop(columns=['Fraudulent'])
target = df['Fraudulent']

# random_state=0 fixes the sampler's seed so repeated runs give the same sample
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(features, target)

# Put the resampled columns and the 'Fraudulent' column back side by side.
df_resampled = pd.concat([X_resampled, y_resampled], axis='columns')
print(df_resampled)

                                     Job Title           Department  \
13092     sr ms dynamics consultant  ax or gp                   NaN   
13436               software engineer  android                  NaN   
17502               twic or rapidgate laborers                  NaN   
10820              agentinbound sales position                  NaN   
2067               internship media production                  NaN   
...                                        ...                  ...   
17827  student positions parttime and fulltime                  NaN   
17828                          sales associate  sales and marketing   
17829                        android developer                  NaN   
17830                           payroll clerk                   NaN   
17831                          furniture mover                  NaN   

      Range_of_Salary Profile  \
13092             NaN      []   
13436             NaN      []   
17502             NaN      []   
10820        11

In [9]:
## save as csv
#df.to_csv('Data/Processed/processed_df.csv', index=False)