In [50]:
import pandas as pd
import seaborn as sns
import json

In [51]:
# Root folder under which each experiment keeps its results_<metric>.json file.
base_result_dir = "../benchmark_results"
# Top-level result keys that are dropped whenever a result file is loaded.
TO_EXCLUDE_MODEL = ["lr"]

In [52]:
def get_result_df(result_dir, target_metric, exclude=None):
    """Load the cross-validation averages stored for one experiment.

    Reads ``{result_dir}/results_{target_metric}.json`` and keeps, for every
    top-level key that is not excluded, the value stored under its
    ``'cv_avg_scores'`` entry.

    Parameters
    ----------
    result_dir : str
        Directory containing the results file. It is printed before loading.
    target_metric : str
        Metric name used to build the file name
        ``results_{target_metric}.json``.
    exclude : collection of str, optional
        Top-level keys to leave out. When omitted, the notebook-level
        ``TO_EXCLUDE_MODEL`` is used, which is the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per retained top-level key; columns come from the keys of
        the ``'cv_avg_scores'`` entries.

    Raises
    ------
    FileNotFoundError
        If the results file does not exist.
    KeyError
        If a retained entry has no ``'cv_avg_scores'`` key.
    """
    if exclude is None:
        # Fall back to the notebook-wide default so existing two-argument calls are unchanged.
        exclude = TO_EXCLUDE_MODEL

    print(result_dir)
    # Be explicit about the encoding instead of relying on the platform's locale default.
    with open(f'{result_dir}/results_{target_metric}.json', 'r', encoding='utf-8') as f:
        result = json.load(f)

    refined_result = {
        name: entry['cv_avg_scores']
        for name, entry in result.items()
        if name not in exclude
    }
    return pd.DataFrame.from_dict(refined_result).T

In [53]:
from itertools import product

# Not referenced by the loop below; kept in case other cells read it.
_token = 'words'
result_lst = []

# Axes of the experiment grid; every combination yields one result-directory name.
_sampling_lst = ['random_over', 'random_under', 'smote']
_chained_lst = [None, 'chained', 'unchained']
_rule_lst = [False, True]
_feature_select_lst = ['mutual_info_classif', 'chi2']

for _sampling, _chained, _rule, _feature_select in product(_sampling_lst, _chained_lst, _rule_lst, _feature_select_lst):
    _rule_txt = '_rule_added' if _rule else ''

    if _chained is None:
        # The no-imputation name ignores the rule flag, so it is appended once
        # per _rule value (i.e. twice). The repeats are kept on purpose:
        # other cells pick entries from result_lst by position.
        logging_nm = f'{_sampling}/no_imputation_{_feature_select}'
    else:
        logging_nm = f'{_sampling}/imputation_{_chained}_{_feature_select}{_rule_txt}'

    result_lst.append(logging_nm)

In [54]:
baseline_no_imputation_mutual = f'{base_result_dir}/{result_lst[0]}'
target_metric = 'AUC'
# Maps "<sampling>-<experiment>-<best model>" to that model's test_* scores.
result_all = {}

# result_lst may contain repeated names; dict.fromkeys drops the repeats while
# keeping first-seen order, so each results file is read (and printed) once.
# The loop variable is deliberately not called `dir`, which would shadow the
# builtin for the rest of the kernel session.
for result_subdir in dict.fromkeys(result_lst):
    result_dir = f'{base_result_dir}/{result_subdir}'
    result_df = get_result_df(result_dir, target_metric)
    # Top row after sorting by test_<metric>; reset_index exposes the row label as column 'index'.
    best_ = result_df.sort_values(f"test_{target_metric}", ascending=False).reset_index().iloc[0, :]
    key_str = result_subdir.replace('/', '-') + "-" + best_['index']
    # Keep only the entries whose label contains 'test'.
    result_all[key_str] = {k: best_[k] for k in best_.keys() if 'test' in k}

../benchmark_results/random_over/no_imputation_mutual_info_classif
../benchmark_results/random_over/no_imputation_chi2
../benchmark_results/random_over/no_imputation_mutual_info_classif
../benchmark_results/random_over/no_imputation_chi2
../benchmark_results/random_over/imputation_chained_mutual_info_classif
../benchmark_results/random_over/imputation_chained_chi2
../benchmark_results/random_over/imputation_chained_mutual_info_classif_rule_added
../benchmark_results/random_over/imputation_chained_chi2_rule_added
../benchmark_results/random_over/imputation_unchained_mutual_info_classif
../benchmark_results/random_over/imputation_unchained_chi2
../benchmark_results/random_over/imputation_unchained_mutual_info_classif_rule_added
../benchmark_results/random_over/imputation_unchained_chi2_rule_added
../benchmark_results/random_under/no_imputation_mutual_info_classif
../benchmark_results/random_under/no_imputation_chi2
../benchmark_results/random_under/no_imputation_mutual_info_classif
../be

In [58]:
# One row per result_all key, ordered from highest to lowest test_AUC.
(
    pd.DataFrame.from_dict(result_all)
    .T
    .sort_values(by='test_AUC', ascending=False)
)

Unnamed: 0,test_precision,test_recall,test_f1,test_AUC
random_over-imputation_chained_chi2_rule_added-xgb,0.992134,1.0,0.99605,0.999669
random_over-imputation_chained_mutual_info_classif_rule_added-xgb,0.991672,1.0,0.995817,0.999631
random_over-imputation_unchained_chi2_rule_added-xgb,0.991613,1.0,0.995788,0.99957
smote-imputation_unchained_chi2_rule_added-xgb,0.989655,0.989009,0.98933,0.998899
smote-imputation_chained_chi2_rule_added-xgb,0.992167,0.989538,0.990849,0.998806
smote-imputation_chained_mutual_info_classif_rule_added-rf,0.98024,0.978606,0.979413,0.997634
random_over-imputation_unchained_mutual_info_classif_rule_added-rf,0.966522,0.989009,0.977634,0.997485
smote-imputation_unchained_mutual_info_classif_rule_added-rf,0.979221,0.980075,0.979642,0.997425
random_over-imputation_chained_mutual_info_classif-xgb,0.96829,0.970202,0.969236,0.995282
smote-imputation_chained_chi2-xgb,0.968419,0.94669,0.957412,0.992918


In [41]:
# Show the full score table for the entry at position 1 of result_lst.
# NOTE(review): the variable name is reused from an earlier cell and does not
# describe which entry is selected here.
target_metric = 'AUC'
baseline_no_imputation_mutual = base_result_dir + '/' + result_lst[1]

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/no_imputation_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,30.589674,1.001763,0.976905,0.335031,0.4965,0.9553
svm,15.984631,0.992805,0.864162,0.538372,0.66102,0.93952
knn,0.008799,1.282814,0.866666,0.249425,0.377389,0.922547
ada,174.733237,0.903558,0.72845,0.185966,0.292814,0.906279
et,6.076001,0.159152,0.979707,0.382451,0.547363,0.956023


In [42]:
# Show the full score table for the entry at position 2 of result_lst.
target_metric = 'AUC'
baseline_no_imputation_mutual = base_result_dir + '/' + result_lst[2]

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/no_imputation_mutual_info_classif


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,16.495235,0.530955,0.983426,0.259877,0.407879,0.952491
svm,15.705963,1.000486,0.919137,0.472507,0.62157,0.937262
knn,0.008409,1.273158,0.942148,0.112002,0.198794,0.921035
ada,176.617374,0.923785,0.812553,0.144387,0.242453,0.904216
et,6.190374,0.131533,0.975129,0.325809,0.485077,0.950507


In [43]:
# Show the full score table for the entry at position 3 of result_lst.
# NOTE(review): the variable name is reused from an earlier cell and does not
# describe which entry is selected here.
target_metric = 'AUC'
baseline_no_imputation_mutual = base_result_dir + '/' + result_lst[3]

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/no_imputation_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,30.589674,1.001763,0.976905,0.335031,0.4965,0.9553
svm,15.984631,0.992805,0.864162,0.538372,0.66102,0.93952
knn,0.008799,1.282814,0.866666,0.249425,0.377389,0.922547
ada,174.733237,0.903558,0.72845,0.185966,0.292814,0.906279
et,6.076001,0.159152,0.979707,0.382451,0.547363,0.956023


In [44]:
# Show the full score table for the entry at position 4 of result_lst.
# NOTE(review): the variable name is reused from an earlier cell and does not
# describe which entry is selected here.
target_metric = 'AUC'
baseline_no_imputation_mutual = base_result_dir + '/' + result_lst[4]

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/imputation_chained_mutual_info_classif


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,33.602512,0.946689,0.945827,0.315277,0.470358,0.95919
svm,16.196224,0.939184,0.796626,0.653582,0.715435,0.945733
knn,0.009065,1.281965,0.801082,0.297902,0.433452,0.937934
ada,17.481562,0.117749,0.83119,0.121264,0.210223,0.900647
et,6.138364,0.155147,0.934162,0.392596,0.551713,0.962763


In [45]:
# Show the full score table for the entry at position 5 of result_lst.
# NOTE(review): the variable name is reused from an earlier cell and does not
# describe which entry is selected here.
target_metric = 'AUC'
baseline_no_imputation_mutual = base_result_dir + '/' + result_lst[5]

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/imputation_chained_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,30.662671,0.938998,0.938021,0.271411,0.417177,0.958105
svm,16.388191,0.946424,0.776302,0.65131,0.706031,0.944468
knn,0.008775,1.293178,0.846312,0.315183,0.457723,0.940579
ada,172.205981,0.815168,0.835331,0.143157,0.242822,0.904059
et,62.382989,1.112461,0.938502,0.41803,0.577024,0.961841


In [46]:
# Show the full score table for the entry at position 9 of result_lst.
# NOTE(review): the variable name is reused from an earlier cell and does not
# describe which entry is selected here.
target_metric = 'AUC'
baseline_no_imputation_mutual = base_result_dir + '/' + result_lst[9]

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/imputation_unchained_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,40.045457,1.723395,0.951463,0.32101,0.475237,0.955286
svm,17.271997,1.196877,0.884241,0.534723,0.664842,0.941455
knn,0.009844,1.554575,0.754693,0.278401,0.402935,0.926979
ada,193.876039,1.156833,0.802648,0.166346,0.273329,0.906336
et,71.594271,1.193292,0.937041,0.42621,0.582788,0.957302


In [9]:
import pandas as pd
# Look at two specific rows (positions 11538 and 3723) of the CSV stored
# under imputed/words/chained.
df = pd.read_csv("../data/imputed/words/chained/fake_job_postings.csv")
df.iloc[[11538, 3723]]

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
11538,Research Engineer Data Science,US,Aptitude Staffing Solutions has redesigned the...,"Research Engineer | Data Science | San Mateo, ...",Position requirements: Quickly learning new ar...,The BenefitsGenerous paid time off to help you...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Marketing and Advertising,Engineering,1
3723,Senior Geologist,US,,Job DescriptionCompany Overview:INTECSEA is a ...,Job RequirementsTechnical Requirements:Working...,,0,0,0,Full-time,Mid-Senior level,Master's Degree,Oil & Energy,Engineering,1


In [8]:
# Same two row positions in the original CSV, with the job_id and
# salary_range columns left out of the display.
df = pd.read_csv("../data/fake_job_postings.csv")
df.iloc[[11538, 3723]].drop(columns=['job_id', 'salary_range'])

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
11538,Research Engineer Data Science,"US, CA, San Mateo",Engineering,Aptitude Staffing Solutions has redesigned the...,"Research Engineer | Data Science | San Mateo, ...",Position requirements: Quickly learning new ar...,The BenefitsGenerous paid time off to help you...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Marketing and Advertising,Engineering,1
3723,Senior Geologist,"US, TX, Houston",,,Job DescriptionCompany Overview:INTECSEA is a ...,Job RequirementsTechnical Requirements:Working...,,0,0,0,Full-time,,Master's Degree,Oil & Energy,Engineering,1


In [18]:
# Let pandas render up to 30 columns so the wide frame is not elided.
pd.set_option("display.max_columns", 30)

df = pd.read_csv("../data/imputed/words/chained/fake_job_postings_rule_added.csv")
# Show the same two row positions, leaving out the long free-text columns and
# the "Unnamed: 0" column (presumably a saved index — TODO confirm).
df.iloc[[11538, 3723]].drop(
    columns=['company_profile', 'description', 'requirements', 'benefits', "Unnamed: 0"]
)

Unnamed: 0,title,location,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,url_count_company_profile,url_count_description,url_count_benefits,keyword_count_title,keyword_count_company_profile,keyword_count_description,keyword_count_requirements,keyword_count_benefits
11538,Research Engineer Data Science,US,0,1,1,Full-time,Mid-Senior level,Master's Degree,Marketing and Advertising,Engineering,1,0,0,0,0,0,0,0,0
3723,Senior Geologist,US,0,0,0,Full-time,Mid-Senior level,Master's Degree,Oil & Energy,Engineering,1,0,0,0,1,0,0,0,0
