## Evaluate data augmentation techniques on train set

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import re
PERC99=0.22449437871575342
MEDIAN=0.076746

In [2]:
def get_keyword_types(keywords):
    ## keywords
    keywords=keywords.loc[(~keywords.Keywords.isna())&(keywords['Concern w.r.t. precision'].isna()),['Keywords','python_checker (default = string_match, other_options=nltk.word_tokenize + match; lower_case+remove_non_alphabet+string_match']]
    keywords.columns=['keywords','method']
    keywords.keywords=keywords.keywords.str.lower()
    keywords=keywords.assign(extraction_method=np.where(keywords.method.isna(),'contains',
                                            np.where(keywords.method.str.lower().str.contains('exclude'),'start_special',
                                            np.where(keywords.method.str.lower().str.contains('not'),'not_in',
                                            np.where(keywords.method.str.lower().str.contains('starts'),'start','in')))))
    keywords=keywords.assign(keywords=keywords.keywords.replace("-"," ",regex=True))
    keywords=keywords.assign(keywords=keywords.keywords.apply(lambda x:re.sub('[^a-zA-Z0-9 ]+', '',x)))
    key_start=keywords.loc[keywords.extraction_method=='start']
    key_contains=keywords.loc[keywords.extraction_method=='contains']
    key_in=keywords.loc[keywords.extraction_method=='in']
    key_not_in=keywords.loc[keywords.extraction_method=='not_in']
    key_special=keywords.loc[keywords.extraction_method=='start_special']
    return (key_start,key_contains,key_in,key_not_in,key_special)


In [3]:
def keyword_search(df_merged,keywords):
    key_start,key_contains,key_in,key_not_in,key_special=get_keyword_types(keywords)
    ## title abstract
    df_merged=df_merged.assign(title_abstract_search=df_merged.title_abstract_clean.replace("-"," ",regex=True))
    df_merged=df_merged.assign(title_abstract_search=df_merged.title_abstract_search.apply(lambda x:re.sub('[^a-zA-Z0-9 ]+', '',x)))
    # title
    df_test_final.title=df_test_final.title.replace("{","",regex=True).replace("}","",regex=True)
    df_merged=df_merged.assign(title_search=df_merged.title.replace("-"," ",regex=True))
    df_merged=df_merged.assign(title_search=df_merged.title_search.apply(lambda x:re.sub('[^a-zA-Z0-9 ]+', '',x)))

    ## keywords
    df_merged=df_merged.assign(keyword_pred=np.where(
        (df_merged.title_abstract_search.apply(lambda x:any(word.startswith(tuple(key_special.keywords)) for word in x.lower().split()))) 
        ,1,0))

    df_merged=df_merged.assign(keyword_pred=np.where(
        (df_merged.title_abstract_search.apply(lambda x: any(word.startswith(tuple(key_not_in.keywords)) for word in x.lower().split())))
        ,0,df_merged.keyword_pred))

    df_merged=df_merged.assign(keyword_pred=np.where(
        (df_merged.title_abstract_search.str.lower().str.contains('|'.join(list(key_contains.keywords.values)))) |
        (df_merged.title_abstract_search.apply(lambda x:any(word.startswith(tuple(key_start.keywords)) for word in x.lower().split()))) |
        (df_merged.title_abstract_search.apply(lambda x:any(word in (tuple(key_in.keywords)) for word in x.lower().split()))) 
        ,1,df_merged.keyword_pred))

    ## keywords
    df_merged=df_merged.assign(keyword_title_pred=np.where(
        (df_merged.title.apply(lambda x:any(word.startswith(tuple(key_special.keywords)) for word in x.lower().split()))) 
        ,1,0))

    df_merged=df_merged.assign(keyword_title_pred=np.where(
        (df_merged.title.apply(lambda x: any(word.startswith(tuple(key_not_in.keywords)) for word in x.lower().split())))
        ,0,df_merged.keyword_title_pred))

    df_merged=df_merged.assign(keyword_title_pred=np.where(
        (df_merged.title.str.lower().str.contains('|'.join(list(key_contains.keywords.values)))) |
        (df_merged.title.apply(lambda x:any(word.startswith(tuple(key_start.keywords)) for word in x.lower().split()))) |
        (df_merged.title.apply(lambda x:any(word in (tuple(key_in.keywords)) for word in x.lower().split()))) 
        ,1,df_merged.keyword_title_pred))
    
    return df_merged

In [4]:
def combinations(df_merged):

    df_merged=df_merged.assign(pred_combined=np.where(df_merged.workshop_pred==1,1,
                                np.where(df_merged.keyword_pred==1,1,
                                np.where(df_merged.similarity_pos_pred==1,1,
                                np.where(df_merged.similarity_neg_pred,0,2)))))

    df_merged=df_merged.assign(pred_combined_title=np.where(df_merged.workshop_pred==1,1,
                                np.where(df_merged.keyword_title_pred==1,1,
                                np.where(df_merged.similarity_pos_pred==1,1,
                                np.where(df_merged.similarity_neg_pred,0,2)))))

    df_merged=df_merged.assign(pred_combined_title2=np.where(df_merged.keyword_title_pred==1,1,
                                np.where(df_merged.workshop_pred==1,1,
                                np.where(df_merged.similarity_pos_pred==1,1,
                                np.where(df_merged.similarity_neg_pred,0,2)))))

    df_merged=df_merged.assign(pred_similarities=np.where(df_merged.similarity_pos_pred==1,1,
                                np.where(df_merged.similarity_neg_pred,0,2)))

    df_merged=df_merged.assign(pred_keyword_t_workshop=np.where(df_merged.workshop_pred==1,1,
                                np.where(df_merged.keyword_title_pred==1,1,0)))

    df_merged=df_merged.assign(pred_keyword_workshop=np.where(df_merged.workshop_pred==1,1,
                                np.where(df_merged.keyword_pred==1,1,0)))

    df_merged=df_merged.assign(pred_combined_title_final=np.where(df_merged.keyword_title_pred==1,1,
                                np.where(df_merged.similarity_pos_pred==1,1,
                                np.where(df_merged.similarity_neg_pred,0,2))))
    return df_merged

In [5]:
data_path="../../data/"
outputs_path="../../outputs/"
df_test_final=pd.read_csv(outputs_path+"general/train_set_final.csv")
workshops=pd.read_csv(data_path+"others/sg_workshops_v3.csv")
keywords=pd.read_csv(data_path+"others/sg_keywords_v6.csv")
match_unique=pd.read_csv(outputs_path+"general/papers_uniques.csv")

In [6]:
workshops=workshops.loc[(workshops.SG_or_not==1)].reset_index(drop=True)
match_unique=match_unique.loc[:,['ID','cosine_similarity']]
df_merged=df_test_final.merge(match_unique,how='left',on='ID')

# similarity
df_merged=df_merged.assign(similarity_neg_pred=np.where(df_merged.cosine_similarity<=MEDIAN,0,1))
df_merged=df_merged.assign(similarity_pos_pred=np.where(df_merged.cosine_similarity>=PERC99,1,0))

In [7]:
## workshop
df_merged=df_merged.assign(workshop_pred=np.where((df_merged.url.str.lower().str.contains('|'.join(list(workshops.event.values)))),1,0))

In [8]:
## keywords
df_merged=keyword_search(df_merged,keywords)

In [9]:
## combine techniques
df_merged=combinations(df_merged)

In [10]:
#workshops
print(classification_report(df_merged.label,df_merged.workshop_pred,digits=4))

              precision    recall  f1-score   support

           0     0.8876    0.9964    0.9388      2203
           1     0.7037    0.0640    0.1173       297

    accuracy                         0.8856      2500
   macro avg     0.7956    0.5302    0.5281      2500
weighted avg     0.8657    0.8856    0.8412      2500



In [11]:
#keyword title
print(classification_report(df_merged.label,df_merged.keyword_title_pred,digits=4))

              precision    recall  f1-score   support

           0     0.9427    0.9927    0.9671      2203
           1     0.9111    0.5522    0.6876       297

    accuracy                         0.9404      2500
   macro avg     0.9269    0.7725    0.8273      2500
weighted avg     0.9389    0.9404    0.9339      2500



In [12]:
#keyword
print(classification_report(df_merged.label,df_merged.keyword_pred,digits=4))

              precision    recall  f1-score   support

           0     0.9665    0.9029    0.9336      2203
           1     0.5158    0.7677    0.6171       297

    accuracy                         0.8868      2500
   macro avg     0.7412    0.8353    0.7753      2500
weighted avg     0.9129    0.8868    0.8960      2500



In [14]:
#cos_sim pos
print(classification_report(df_merged.label,df_merged.similarity_pos_pred,digits=4))

              precision    recall  f1-score   support

           0     0.8836    0.9959    0.9364      2203
           1     0.4706    0.0269    0.0510       297

    accuracy                         0.8808      2500
   macro avg     0.6771    0.5114    0.4937      2500
weighted avg     0.8345    0.8808    0.8312      2500



In [13]:
#cos_sim neg
print(classification_report(df_merged.label,df_merged.similarity_neg_pred,digits=4))

              precision    recall  f1-score   support

           0     0.9029    0.4898    0.6351      2203
           1     0.1387    0.6094    0.2260       297

    accuracy                         0.5040      2500
   macro avg     0.5208    0.5496    0.4305      2500
weighted avg     0.8121    0.5040    0.5865      2500



In [10]:
#similarities
print(classification_report(df_merged.loc[df_merged.pred_similarities.isin([0,1])].label,
                            df_merged.loc[df_merged.pred_similarities.isin([0,1])].pred_similarities,digits=4))

              precision    recall  f1-score   support

           0     0.8657    0.9920    0.9245      1124
           1     0.4706    0.0442    0.0808       181

    accuracy                         0.8605      1305
   macro avg     0.6681    0.5181    0.5027      1305
weighted avg     0.8109    0.8605    0.8075      1305



## combinations

In [16]:
#combined no workshops
print(classification_report(df_merged.loc[df_merged.pred_combined_title_final.isin([0,1])].label,
                            df_merged.loc[df_merged.pred_combined_title_final.isin([0,1])].pred_combined_title_final,digits=4))

              precision    recall  f1-score   support

           0     0.9278    0.9779    0.9522      1130
           1     0.8698    0.6601    0.7506       253

    accuracy                         0.9197      1383
   macro avg     0.8988    0.8190    0.8514      1383
weighted avg     0.9172    0.9197    0.9153      1383



In [17]:
#combined title
print(classification_report(df_merged.loc[df_merged.pred_combined_title.isin([0,1])].label,
                            df_merged.loc[df_merged.pred_combined_title.isin([0,1])].pred_combined_title,digits=4))

              precision    recall  f1-score   support

           0     0.9306    0.9708    0.9503      1132
           1     0.8398    0.6784    0.7505       255

    accuracy                         0.9171      1387
   macro avg     0.8852    0.8246    0.8504      1387
weighted avg     0.9139    0.9171    0.9136      1387



In [19]:
#combined keyword abstract
print(classification_report(df_merged.loc[df_merged.pred_combined.isin([0,1])].label,
                            df_merged.loc[df_merged.pred_combined.isin([0,1])].pred_combined,digits=4))

              precision    recall  f1-score   support

           0     0.9538    0.8148    0.8788      1215
           1     0.5044    0.8267    0.6265       277

    accuracy                         0.8170      1492
   macro avg     0.7291    0.8208    0.7527      1492
weighted avg     0.8703    0.8170    0.8320      1492

