In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import time
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pathlib import Path
import sys

In [2]:
# get add the Base directory
BASE_DIR = str(Path().cwd().parent.resolve())
sys.path.insert(0, BASE_DIR)
TEXT_COLS = [1,5,6,7,8,17]

In [3]:
df = pd.read_csv(f'{BASE_DIR}/data/fake_job_postings.csv', engine = 'python')

In [4]:
only_fake = df[df['fraudulent'] == 1]
only_real = df[df['fraudulent'] == 0]

In [6]:
only_fake_text = only_fake.iloc[:,TEXT_COLS].copy(deep=True)
only_real_text = only_real.iloc[:,TEXT_COLS].copy(deep=True)

In [37]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def refine(df):
    for col in df.columns:
        if col == 'fraudulent':
            continue

        # replace &amp; to &
        df[col] = df[col].str.replace(r'&amp;', '&')
        
        # replace &nbsp; to space
        df[col] = df[col].str.replace(r'&nbsp;', ' ')
        
        # replace &lt; to <
        df[col] = df[col].str.replace(r'&lt;', '<')
        
        # replace &gt; to >
        df[col] = df[col].str.replace(r'&gt;', '>')
        
        # replace &quot; to "
        df[col] = df[col].str.replace(r'&quot;', '"')
        
        # replace \u00a0 to space
        df[col] = df[col].str.replace(r'\u00a0', ' ')
        
        df[col] = df[col].fillna('None')
        
        # stop word remove
        stop_words = set(stopwords.words('english'))
        df[col] = df[col].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
        
        # lemmatize
        lemmatizer = WordNetLemmatizer()
        df[col] = df[col].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    return df.reset_index(drop=True)
    


In [38]:
refined_only_fake_text = refine(only_fake_text)
refined_only_real_text = refine(only_real_text)

In [39]:
# extract unique keywords from text using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import os


def get_unique_keywords(df, col, top_k=20, d_type='real', ngram=(1,1)):
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=ngram,
        )
    tfidf.fit(df[col])
    terms = tfidf.get_feature_names_out().tolist()
    score = tfidf.transform(df[col]).toarray().flatten().tolist()
    
    data = [(t, s) for t, s in zip(terms, score)]
    sorted_iter = sorted(data, key=lambda x: x[1], reverse=True)
    sorted_result = {}
    
    for k, v in sorted_iter:
        sorted_result[k] = v
    
    path_dir = f'{BASE_DIR}/data/tf_idf'
    
    if not os.path.isdir(path_dir):
        os.mkdir(path_dir)
    
    with open(f'{path_dir}/{d_type}_{col}_{ngram}.json', 'w') as f:
        json.dump(sorted_result, f, indent=4)
        
    return list(sorted_result)[:top_k]
    
    

In [40]:
for c in refined_only_fake_text.columns[:-1]:
    print(f'====={c}=====')
    print(get_unique_keywords(refined_only_fake_text, c, 20, d_type='fake', ngram=(1,1)))
    print('\n\n')

=====title=====
['ic', 'technician', '000', '100', '130', '15', '150', '175', '1781', '1843', '19', '1970', '20', '200', '2020', '2022', '2048', '2053', '2131', '2141']



=====company_profile=====
['bonus', 'referral', 'candidates', 'significant', 'refined', 'resources', 'signing', 'candidate', 'anyperk', 'automatically', 'behalf', 'cleaning', 'click', 'corporate', 'directly', 'discount', 'enlarge', 'expenditure', 'formthank', 'forward']



=====description=====
['equipment', 'performs', 'maintenance', 'control', 'required', 'installs', 'troubleshoots', 'identifies', 'member', 'instrumentation', 'motor', 'follows', 'work', 'plant', 'electrical', 'environmental', 'power', 'team', 'coordinate', 'record']



=====requirements=====
['demonstrated', 'equipment', 'plant', 'control', 'including', 'electrical', 'power', 'ability', 'environment', 'systems', 'qualificationsknowledge', 'calibrate', 'cem', 'controllers', 'emission', 'generators', 'physic', 'programmable', 'transformers', 'analyti

In [41]:
for c in refined_only_real_text.columns[:-1]:
    print(f'====={c}=====')
    print(get_unique_keywords(refined_only_real_text, c, 20,  d_type='real', ngram=(1,1)))
    print('\n\n')

=====title=====
['intern', 'marketing', '00', '000', '02', '0dq', '0rg', '0tz', '10', '100', '100k', '1099', '10k', '10x', '11', '12', '12hr', '13', '13th', '14']



=====company_profile=====
['food', 'food52', 'cooking', 'connect', 'york', 'new', 'home', 'batali', 'beard', 'contributor', 'cooks', 'danny', 'gwyneth', 'iacp', 'mario', 'meyer', 'npr', 'paltrow', 'pando', 'random']



=====description=====
['affiliate', 'food52', 'inquiriessupporting', 'meetingsworking', 'neededhelping', 'programassisting', 'systemsresearching', 'repackaging', 'reproducing', 'sitesupporting', 'huffington', 'buzzfeed', 'beard', 'crowd', 'provisions', 'james', 'unpaid', 'content', 'editors', 'sourced']



=====requirements=====
['cooking', 'food52', 'big', 'aestheticloves', 'counts', 'pinterestloves', 'seasonsmeticulous', 'smallinterested', 'dishes', 'forwardthinks', 'themcheerful', 'delighted', 'gritty', 'maddened', 'nitty', 'appreciates', 'broken', 'juggler', 'pressureexcellent', 'typo']



=====benefits=

In [27]:
div_by = 100
number_of_parts = len(only_fake_text)//div_by + 1

for i in range(number_of_parts):
    only_fake_text.iloc[:i*div_by, :].to_json(f'{BASE_DIR}/data/only_fake_text_{i}.json', indent=4, orient='table')

In [64]:
# need to check the magnitue of string of numbers in the texts

def collect_numbers(df, col):
    numbers = []
    target = df[col]
    
    for i in range(len(df)):
        numbers.extend(re.findall(r'[0-9][0-9,.]+', target.iloc[i]))
        
    for i in range(len(numbers)):
        
        if not '.' in numbers[i]:
            numbers[i] = int(numbers[i].replace(',', ''))
            
        else:
            numbers[i] = ''.join(numbers[i].split('.')[:-1])
            numbers[i] = float(numbers[i].replace(',', ''))
        
    return numbers


In [67]:
for c in refined_only_fake_text.columns[:-1]:
    print(f'====={c}=====')
    ns = collect_numbers(refined_only_fake_text, c)
    
    print("mean: ", np.mean(ns))
    print("std: ", np.std(ns))
    print("max: ", np.max(ns))
    print("min: ", np.min(ns))
    print("median: ", np.median(ns))

    print('\n\n')

=====title=====
mean:  24756.87479674797
std:  56034.21728161342
max:  175000.0
min:  15.0
median:  200.0



=====company_profile=====
mean:  17283305.589181285
std:  114872796.26757514
max:  916884407.0
min:  3.0
median:  500.0



=====description=====


ValueError: could not convert string to float: '6.5.'

In [None]:
ns = collect_numbers(only, 'benefits')
print("mean: ", np.mean(ns))
print("std: ", np.std(ns))
print("max: ", np.max(ns))
print("min: ", np.min(ns))
print("median: ", np.median(ns))