In [1]:
import pandas as pd
import seaborn as sns
import json

In [2]:
base_result_dir = '../benchmark_results'
TO_EXCLUDE_MODEL = ['lr']

In [3]:
def get_result_df(result_dir, target_metric):
    # print(result_dir)
    with open(f'{result_dir}/results_{target_metric}.json', 'r') as f:
        result = json.load(f)
        
    refined_result = {k: result[k]['cv_avg_scores'] for k in result.keys() if k not in TO_EXCLUDE_MODEL}
    return pd.DataFrame.from_dict(refined_result).T

In [4]:
from itertools import product

_token = 'words'
result_lst = []

_chained_lst = [None, 'chained', 'unchained']
_rule_lst = [False, True]
_feature_select_lst = [None, 'mutual_info_classif', 'chi2']
_sample_method_lst = [None, 'random_over', 'random_under', 'smote']

# get the combination of chained, rule, feature_select

for _sample_method, _chained, _rule, _feature_select in product(_sample_method_lst, _chained_lst, _rule_lst, _feature_select_lst):
    _rule_txt = '_rule_added' if _rule else ''
    
    if _chained is None:
        logging_nm = f'{_sample_method}/no_imputation_{_feature_select}'
        
    else:
        logging_nm = f'{_sample_method}/imputation_{_chained}_{_feature_select}{_rule_txt}'
        
    result_lst.append(logging_nm)
    print(logging_nm)

None/no_imputation_None
None/no_imputation_mutual_info_classif
None/no_imputation_chi2
None/no_imputation_None
None/no_imputation_mutual_info_classif
None/no_imputation_chi2
None/imputation_chained_None
None/imputation_chained_mutual_info_classif
None/imputation_chained_chi2
None/imputation_chained_None_rule_added
None/imputation_chained_mutual_info_classif_rule_added
None/imputation_chained_chi2_rule_added
None/imputation_unchained_None
None/imputation_unchained_mutual_info_classif
None/imputation_unchained_chi2
None/imputation_unchained_None_rule_added
None/imputation_unchained_mutual_info_classif_rule_added
None/imputation_unchained_chi2_rule_added
random_over/no_imputation_None
random_over/no_imputation_mutual_info_classif
random_over/no_imputation_chi2
random_over/no_imputation_None
random_over/no_imputation_mutual_info_classif
random_over/no_imputation_chi2
random_over/imputation_chained_None
random_over/imputation_chained_mutual_info_classif
random_over/imputation_chained_chi2
r

In [5]:
target_metric = 'AUC'
result_all = {}

for dir in result_lst:
    result_dir = f'{base_result_dir}/{dir}'
    result_df = get_result_df(result_dir, target_metric)
    best_ = result_df.sort_values(f"test_auc", ascending=False).reset_index().iloc[0, :]
    key_str = "-".join(dir.split('/')) + "-" + best_['index']
    # print("-".join(dir.split('/')))
    result_all[key_str] = {k: best_[k] for k in best_.keys() if 'test' in k}

In [15]:
result_df_original = pd.DataFrame.from_dict(result_all).T.sort_values('test_auc', ascending=False).reset_index()

In [42]:
# index name order: sample_method, imputation, feature_select, rule, model
# split the index name to get the information and create a new column for each information

index_nm_order = ['sample_method', 'imputation', 'feature_select', 'rule', 'model']
df = result_df_original.copy(deep=True)

# create new column for each information
for nm in index_nm_order:
    df[nm] = None

final_result_df_col = ["test_AUC", "test_precision", "test_recall", "test_f1", "sample_method", "imputation", "featuer_select", "rule", "model"]
final_result_df = {}

for idx_num, row in df.iterrows():
    temp_dict = {}
    
    index_nm = row['index']
    # print(index_nm.split('-'))
    
    lst = index_nm.split('-')
    
    sampling = lst[0]
    
    l2_split = []
    
    l2 = lst[1].split("_")
    # print(l2)
    imputation = "_".join(l2[:2])
    l2_split.append(imputation)
    
    l2 = l2[2:]
    
    if l2[-1] == 'added':
        rule_added = "rule_added"
        l2 = l2[:-2]
    
    l2_split.append(rule_added)    
    
    feat_sel = "_".join(l2)
    
    l2_split.append(feat_sel)   

    model = lst[2]
    
    for col in final_result_df_col:
        if col in row.keys():
            temp_dict[col] = row[col]
            
    temp_dict['sample_method'] = sampling
    temp_dict['imputation'] = imputation
    temp_dict['featuer_select'] = feat_sel
    temp_dict['rule'] = rule_added
    temp_dict['model'] = model
    
    final_result_df[idx_num] = temp_dict
        
final_result_df = pd.DataFrame.from_dict(final_result_df).T
final_result_df

Unnamed: 0,test_precision,test_recall,test_f1,sample_method,imputation,rule,model,featuer_select
0,0.801016,0.970022,0.970022,random_over,imputation_chained,rule_added,et,mutual_info_classif
1,0.801744,0.97047,0.97047,smote,imputation_chained,rule_added,et,mutual_info_classif
2,0.801598,0.970022,0.970022,random_under,imputation_chained,rule_added,et,chi2
3,0.801598,0.970022,0.970022,random_under,imputation_chained,rule_added,et,
4,0.801598,0.970022,0.970022,,imputation_chained,rule_added,et,
5,0.801598,0.970022,0.970022,,imputation_chained,rule_added,et,mutual_info_classif
6,0.801598,0.970022,0.970022,,imputation_chained,rule_added,et,chi2
7,0.801598,0.970022,0.970022,smote,imputation_chained,rule_added,et,chi2
8,0.788792,0.967114,0.967114,smote,imputation_chained,rule_added,rf,
9,0.787942,0.967114,0.967114,random_under,imputation_chained,rule_added,rf,mutual_info_classif


In [41]:
baseline_no_imputation_mutual = f'{base_result_dir}/{result_lst[1]}'
target_metric = 'AUC'

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/no_imputation_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,30.589674,1.001763,0.976905,0.335031,0.4965,0.9553
svm,15.984631,0.992805,0.864162,0.538372,0.66102,0.93952
knn,0.008799,1.282814,0.866666,0.249425,0.377389,0.922547
ada,174.733237,0.903558,0.72845,0.185966,0.292814,0.906279
et,6.076001,0.159152,0.979707,0.382451,0.547363,0.956023


In [42]:
baseline_no_imputation_mutual = f'{base_result_dir}/{result_lst[2]}'
target_metric = 'AUC'

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/no_imputation_mutual_info_classif


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,16.495235,0.530955,0.983426,0.259877,0.407879,0.952491
svm,15.705963,1.000486,0.919137,0.472507,0.62157,0.937262
knn,0.008409,1.273158,0.942148,0.112002,0.198794,0.921035
ada,176.617374,0.923785,0.812553,0.144387,0.242453,0.904216
et,6.190374,0.131533,0.975129,0.325809,0.485077,0.950507


In [43]:
baseline_no_imputation_mutual = f'{base_result_dir}/{result_lst[3]}'
target_metric = 'AUC'

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/no_imputation_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,30.589674,1.001763,0.976905,0.335031,0.4965,0.9553
svm,15.984631,0.992805,0.864162,0.538372,0.66102,0.93952
knn,0.008799,1.282814,0.866666,0.249425,0.377389,0.922547
ada,174.733237,0.903558,0.72845,0.185966,0.292814,0.906279
et,6.076001,0.159152,0.979707,0.382451,0.547363,0.956023


In [44]:
baseline_no_imputation_mutual = f'{base_result_dir}/{result_lst[4]}'
target_metric = 'AUC'

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/imputation_chained_mutual_info_classif


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,33.602512,0.946689,0.945827,0.315277,0.470358,0.95919
svm,16.196224,0.939184,0.796626,0.653582,0.715435,0.945733
knn,0.009065,1.281965,0.801082,0.297902,0.433452,0.937934
ada,17.481562,0.117749,0.83119,0.121264,0.210223,0.900647
et,6.138364,0.155147,0.934162,0.392596,0.551713,0.962763


In [45]:
baseline_no_imputation_mutual = f'{base_result_dir}/{result_lst[5]}'
target_metric = 'AUC'

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/imputation_chained_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,30.662671,0.938998,0.938021,0.271411,0.417177,0.958105
svm,16.388191,0.946424,0.776302,0.65131,0.706031,0.944468
knn,0.008775,1.293178,0.846312,0.315183,0.457723,0.940579
ada,172.205981,0.815168,0.835331,0.143157,0.242822,0.904059
et,62.382989,1.112461,0.938502,0.41803,0.577024,0.961841


In [46]:
baseline_no_imputation_mutual = f'{base_result_dir}/{result_lst[9]}'
target_metric = 'AUC'

get_result_df(baseline_no_imputation_mutual, target_metric)

../benchmark_results/imputation_unchained_chi2


Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_AUC
rf,40.045457,1.723395,0.951463,0.32101,0.475237,0.955286
svm,17.271997,1.196877,0.884241,0.534723,0.664842,0.941455
knn,0.009844,1.554575,0.754693,0.278401,0.402935,0.926979
ada,193.876039,1.156833,0.802648,0.166346,0.273329,0.906336
et,71.594271,1.193292,0.937041,0.42621,0.582788,0.957302


In [9]:
import pandas as pd
df = pd.read_csv("../data/imputed/words/chained/fake_job_postings.csv")
df.iloc[[11538, 3723], :]

Unnamed: 0,title,location,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
11538,Research Engineer Data Science,US,Aptitude Staffing Solutions has redesigned the...,"Research Engineer | Data Science | San Mateo, ...",Position requirements: Quickly learning new ar...,The BenefitsGenerous paid time off to help you...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Marketing and Advertising,Engineering,1
3723,Senior Geologist,US,,Job DescriptionCompany Overview:INTECSEA is a ...,Job RequirementsTechnical Requirements:Working...,,0,0,0,Full-time,Mid-Senior level,Master's Degree,Oil & Energy,Engineering,1


In [8]:
df = pd.read_csv("../data/fake_job_postings.csv")
df.iloc[[11538, 3723], :].drop(['job_id', 'salary_range'], axis=1)

Unnamed: 0,title,location,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
11538,Research Engineer Data Science,"US, CA, San Mateo",Engineering,Aptitude Staffing Solutions has redesigned the...,"Research Engineer | Data Science | San Mateo, ...",Position requirements: Quickly learning new ar...,The BenefitsGenerous paid time off to help you...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Marketing and Advertising,Engineering,1
3723,Senior Geologist,"US, TX, Houston",,,Job DescriptionCompany Overview:INTECSEA is a ...,Job RequirementsTechnical Requirements:Working...,,0,0,0,Full-time,,Master's Degree,Oil & Energy,Engineering,1


In [18]:
pd.set_option("display.max_columns", 30)
# display the dataframe head

df = pd.read_csv("../data/imputed/words/chained/fake_job_postings_rule_added.csv")
df.iloc[[11538, 3723], :].drop(['company_profile', 'description', 'requirements', 'benefits', "Unnamed: 0"], axis=1)

Unnamed: 0,title,location,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,url_count_company_profile,url_count_description,url_count_benefits,keyword_count_title,keyword_count_company_profile,keyword_count_description,keyword_count_requirements,keyword_count_benefits
11538,Research Engineer Data Science,US,0,1,1,Full-time,Mid-Senior level,Master's Degree,Marketing and Advertising,Engineering,1,0,0,0,0,0,0,0,0
3723,Senior Geologist,US,0,0,0,Full-time,Mid-Senior level,Master's Degree,Oil & Energy,Engineering,1,0,0,0,1,0,0,0,0
