In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import time
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pathlib import Path
import sys

In [8]:
# get add the Base directory
BASE_DIR = str(Path().cwd().parent.resolve())
sys.path.insert(0, BASE_DIR)
TEXT_COLS = [1,5,6,7,8,17]

In [9]:
df = pd.read_csv(f'{BASE_DIR}/data/fake_job_postings.csv', engine = 'python')

In [10]:
only_fake = df[df['fraudulent'] == 1]
only_real = df[df['fraudulent'] == 0]

In [13]:
only_fake_text.columns

Index(['title', 'company_profile', 'description', 'requirements', 'benefits',
       'fraudulent'],
      dtype='object')

In [20]:
for col in only_fake_text.columns[:-1]:
    for x in only_fake_text[col]:
        if not isinstance(x, str):
            print(x)

In [38]:
import re

# only fake's text
only_fake_text = only_fake.iloc[:,TEXT_COLS].copy(deep=True)

def remove_html_tags(df):
    for col in df.columns:
        if col == 'fraudulent':
            continue

        # replace &amp; to &
        df[col] = df[col].str.replace(r'&amp;', '&')
        
        # replace &nbsp; to space
        df[col] = df[col].str.replace(r'&nbsp;', ' ')
        
        # replace &lt; to <
        df[col] = df[col].str.replace(r'&lt;', '<')
        
        # replace &gt; to >
        df[col] = df[col].str.replace(r'&gt;', '>')
        
        # replace &quot; to "
        df[col] = df[col].str.replace(r'&quot;', '"')
        
        # replace \u00a0 to space
        df[col] = df[col].str.replace(r'\u00a0', ' ')
    

    df = df.fillna('None')

    return df.reset_index(drop=True)
    


In [50]:
# extract unique keywords from text using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

def get_unique_keywords(df, col, top_k=20):
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 1),
        )
    tfidf.fit(df[col])
    terms = tfidf.get_feature_names_out().tolist()
    score = tfidf.transform(df[col]).toarray().flatten().tolist()
    
    data = [(t, s) for t, s in zip(terms, score)]
    
    return list(sorted(data, key=lambda x: x[1], reverse=True))[:top_k]

[('equipment', 0.3635001490391424),
 ('performs', 0.26694893989639124),
 ('maintenance', 0.24150701890508272),
 ('members', 0.1818264341911659),
 ('required', 0.15328420094611045),
 ('installs', 0.14726775075245796),
 ('troubleshoots', 0.14726775075245796),
 ('identifies', 0.13232382316134655),
 ('motor', 0.1230859729232742),
 ('control', 0.1228673208936796),
 ('instrumentation', 0.12159130794336889),
 ('follows', 0.11758449775282394),
 ('tasks', 0.11612477212743229),
 ('plant', 0.10842381321626243),
 ('power', 0.10842381321626243),
 ('electrical', 0.10599683016804141),
 ('environmental', 0.10523817272483196),
 ('team', 0.10468917783902862),
 ('record', 0.10378749423564171),
 ('controls', 0.10175996149785409)]

In [51]:
for c in only_fake_text.columns[:-1]:
    print(f'====={c}=====')
    print(get_unique_keywords(only_fake_text, c, 20))
    print('\n\n')

=====title=====
[('ic', 0.8178481560722106), ('technician', 0.5754340914555595), ('000', 0.0), ('100', 0.0), ('130', 0.0), ('15', 0.0), ('150', 0.0), ('175', 0.0), ('1781', 0.0), ('1843', 0.0), ('19', 0.0), ('1970', 0.0), ('20', 0.0), ('200', 0.0), ('2020', 0.0), ('2022', 0.0), ('2048', 0.0), ('2053', 0.0), ('2131', 0.0), ('2141', 0.0)]



=====company_profile=====
[('candidates', 0.3369808461928552), ('significant', 0.21743359220444844), ('refined', 0.1908000882926579), ('resources', 0.1908000882926579), ('bonus', 0.17314629004139406), ('referral', 0.17314629004139406), ('signing', 0.17314629004139406), ('anyperk', 0.10871679610222422), ('automatically', 0.10871679610222422), ('behalf', 0.10871679610222422), ('cleaning', 0.10871679610222422), ('click', 0.10871679610222422), ('corporate', 0.10871679610222422), ('directly', 0.10871679610222422), ('discounts', 0.10871679610222422), ('enlarge', 0.10871679610222422), ('expenditures', 0.10871679610222422), ('formthank', 0.10871679610222422)

In [42]:
div_by = 100
number_of_parts = len(only_fake_text)//div_by + 1

for i in range(number_of_parts):
    only_fake_text.iloc[:i*div_by, :].to_json(f'{BASE_DIR}/data/only_fake_text_{i}.json', indent=4, orient='table')