In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import time
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pathlib import Path
import sys

In [2]:
# get add the Base directory
BASE_DIR = str(Path().cwd().parent.resolve())
sys.path.insert(0, BASE_DIR)
TEXT_COLS = [1,5,6,7,8,17]

In [3]:
df = pd.read_csv(f'{BASE_DIR}/data/fake_job_postings.csv', engine = 'python')

In [4]:
only_fake = df[df['fraudulent'] == 1]
only_real = df[df['fraudulent'] == 0]

In [6]:
only_fake_text = only_fake.iloc[:,TEXT_COLS].copy(deep=True)
only_real_text = only_real.iloc[:,TEXT_COLS].copy(deep=True)

In [11]:
def refine(df):
    for col in df.columns:
        if col == 'fraudulent':
            continue

        # replace &amp; to &
        df[col] = df[col].str.replace(r'&amp;', '&')
        
        # replace &nbsp; to space
        df[col] = df[col].str.replace(r'&nbsp;', ' ')
        
        # replace &lt; to <
        df[col] = df[col].str.replace(r'&lt;', '<')
        
        # replace &gt; to >
        df[col] = df[col].str.replace(r'&gt;', '>')
        
        # replace &quot; to "
        df[col] = df[col].str.replace(r'&quot;', '"')
        
        # replace \u00a0 to space
        df[col] = df[col].str.replace(r'\u00a0', ' ')
    

    df = df.fillna('None')

    return df.reset_index(drop=True)
    


In [12]:
refined_only_fake_text = refine(only_fake_text)
refined_only_real_text = refine(only_real_text)

In [22]:
# extract unique keywords from text using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import os


def get_unique_keywords(df, col, top_k=20, d_type='real', ngram=(1,1)):
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=ngram,
        )
    tfidf.fit(df[col])
    terms = tfidf.get_feature_names_out().tolist()
    score = tfidf.transform(df[col]).toarray().flatten().tolist()
    
    data = [(t, s) for t, s in zip(terms, score)]
    sorted_iter = sorted(data, key=lambda x: x[1], reverse=True)
    sorted_result = {}
    
    for k, v in sorted_iter:
        sorted_result[k] = v
    
    path_dir = f'{BASE_DIR}/data/tf_idf'
    
    if not os.path.isdir(path_dir):
        os.mkdir(path_dir)
    
    with open(f'{path_dir}/{d_type}_{col}_{ngram}.json', 'w') as f:
        json.dump(sorted_result, f, indent=4)
        
    return list(sorted_result)[:top_k]
    
    

In [23]:
for c in refined_only_fake_text.columns[:-1]:
    print(f'====={c}=====')
    print(get_unique_keywords(refined_only_fake_text, c, 20, d_type='fake', ngram=(1,1)))
    print('\n\n')

=====title=====
['ic', 'technician', '000', '100', '130', '15', '150', '175', '1781', '1843', '19', '1970', '20', '200', '2020', '2022', '2048', '2053', '2131', '2141']



=====company_profile=====
['candidates', 'significant', 'refined', 'resources', 'bonus', 'referral', 'signing', 'anyperk', 'automatically', 'behalf', 'cleaning', 'click', 'corporate', 'directly', 'discounts', 'enlarge', 'expenditures', 'formthank', 'forward', 'granted']



=====description=====
['equipment', 'performs', 'maintenance', 'members', 'required', 'installs', 'troubleshoots', 'identifies', 'motor', 'control', 'instrumentation', 'follows', 'tasks', 'plant', 'power', 'electrical', 'environmental', 'team', 'record', 'controls']



=====requirements=====
['demonstrated', 'equipment', 'plant', 'systems', 'including', 'electrical', 'power', 'ability', 'control', 'environment', 'qualificationsknowledge', 'basics', 'calibrate', 'cem', 'controllers', 'emissions', 'generators', 'programmable', 'transformers', 'analyt

In [15]:
for c in refined_only_real_text.columns[:-1]:
    print(f'====={c}=====')
    print(get_unique_keywords(refined_only_real_text, c, 20))
    print('\n\n')

=====title=====
[('intern', 0.7775118106326152), ('marketing', 0.6288683362412136), ('00', 0.0), ('000', 0.0), ('02', 0.0), ('0dq', 0.0), ('0rg', 0.0), ('0tz', 0.0), ('10', 0.0), ('100', 0.0), ('100k', 0.0), ('1099', 0.0), ('10k', 0.0), ('10x', 0.0), ('11', 0.0), ('12', 0.0), ('12hr', 0.0), ('13', 0.0), ('13th', 0.0), ('14', 0.0)]



=====company_profile=====
[('food', 0.2840718248529412), ('food52', 0.259000944091012), ('cooks', 0.2533007527643997), ('cooking', 0.252553696071854), ('connect', 0.18780744507929095), ('york', 0.15428606971489003), ('new', 0.15254877778917614), ('home', 0.15058764845383665), ('batali', 0.129500472045506), ('beard', 0.129500472045506), ('contributors', 0.129500472045506), ('danny', 0.129500472045506), ('gwyneth', 0.129500472045506), ('iacp', 0.129500472045506), ('mario', 0.129500472045506), ('meyer', 0.129500472045506), ('npr', 0.129500472045506), ('paltrow', 0.129500472045506), ('pando', 0.129500472045506), ('random', 0.129500472045506)]



=====descripti

KeyboardInterrupt: 

In [42]:
div_by = 100
number_of_parts = len(only_fake_text)//div_by + 1

for i in range(number_of_parts):
    only_fake_text.iloc[:i*div_by, :].to_json(f'{BASE_DIR}/data/only_fake_text_{i}.json', indent=4, orient='table')