In [1]:
import numpy as np
import pandas as pd

In [2]:
def label_data(train_set,test_set,keywords,workshops,acl_labeled,website_positive):
    """Label the dataset (social good or not) with a set of rules.

    A paper is treated as positive when at least one of these holds:
    its url matches one of the workshop patterns, its title+abstract matches
    one of the keyword patterns, its cosine similarity is at or above the
    99th percentile of the original train set, or it is a manually labeled
    positive from ``acl_labeled``. Rows from ``website_positive`` are always
    added to the positive pool. Negative examples are the remaining papers
    whose cosine similarity is below the median of the remaining papers.

    Parameters:
    train_set (df): Papers with at least the columns ID, title, abstract,
        year, url and cosine_similarity
    test_set (df): Papers held out for testing; only ``title`` is read
    keywords (df): Column ``Keywords`` with patterns searched (as regular
        expressions) in the lowercased title+abstract
    workshops (df): Column ``Event`` with patterns searched (as regular
        expressions) in the lowercased url
    acl_labeled (df): Columns ``paper_name`` and ``social good domain``;
        rows with a non-null domain are positive examples
    website_positive (df): Positive examples with the columns paperId,
        title, abstract, year and url
    Returns:
    tuple of three dataframes:
    labeled dataframe for training (negatives plus sampled positives),
    positive observations not included in the training dataframe,
    unlabeled observations (neither used for training nor positive)
    """
    output_columns=['ID','title','abstract','title_abstract','label','year','url']

    ## 'asl' is padded with spaces so it only matches when surrounded by spaces
    ## (presumably to avoid hits inside longer words -- confirm)
    keywords=keywords.assign(Keywords=np.where(keywords.Keywords=='asl',' asl ',keywords.Keywords))
    ## threshold computed on the original train set, before extra rows are appended
    perc_99=train_set.cosine_similarity.quantile(0.99)

    ## take positive examples from acl set
    acl_labeled=acl_labeled.loc[~acl_labeled['social good domain'].isna()].reset_index(drop=True)
    ## make sure it doesnt contain observations in the test set
    acl_labeled=acl_labeled.loc[~acl_labeled.paper_name.isin(test_set.title)]
    acl_labeled=acl_labeled.loc[:,['paper_name']]
    ## positives that are missing from the train set are appended as new rows
    acl_labeled_add=acl_labeled.loc[~acl_labeled.paper_name.isin(train_set.title.values)].reset_index(drop=True)
    acl_labeled_add=acl_labeled_add.rename(columns={'paper_name':'title'})
    acl_labeled_add=acl_labeled_add.assign(
        title_abstract=acl_labeled_add.title,
        abstract="",
        year=2020,  # NOTE(review): hard-coded, presumably the ACL 2020 edition -- confirm
        ID=acl_labeled_add.title,
        positive=1,
    )

    ## concat positive examples and unlabeled ones
    train_set=train_set.assign(positive=np.where(train_set.title.isin(acl_labeled.paper_name.values),1,0))
    train_set=pd.concat([train_set,acl_labeled_add])
    train_set=train_set.assign(abstract=train_set.abstract.fillna(''))
    train_set=train_set.assign(title_abstract=train_set.title+". "+train_set.abstract)
    ## strip curly braces from the text
    train_set=train_set.assign(title_abstract=train_set.title_abstract.str.replace(r"[{}]","",regex=True))

    ## website labeled positive examples
    ## a missing abstract must not turn the whole title_abstract into NaN
    website_positive=website_positive.assign(abstract=website_positive.abstract.fillna(''))
    website_positive=website_positive.assign(title_abstract=website_positive.title+". "+website_positive.abstract)
    website_positive=website_positive.rename(columns={'paperId':'ID'})
    website_positive=website_positive.assign(label=1)
    website_positive=website_positive.loc[:,output_columns]

    ## rule based identification of positive and negative examples
    workshop_pattern='|'.join(list(workshops.Event.values))
    keyword_pattern='|'.join(list(keywords.Keywords.values))
    ## na=False: rows without url/text (e.g. the appended acl rows) simply do not match.
    ## The manually labeled acl positives are always positive, so they can never
    ## end up labeled 0 or in the unlabeled set.
    is_positive=(train_set.url.str.lower().str.contains(workshop_pattern,na=False) |
                 train_set.title_abstract.str.lower().str.contains(keyword_pattern,na=False) |
                 (train_set.cosine_similarity>=perc_99) |
                 (train_set.positive==1))

    train_set_positive=train_set.loc[is_positive,:]
    train_set_negative=train_set.loc[~is_positive,:]

    ## take negative examples with the lowest cosine similarity with social needs
    median_cos_sim=round(train_set_negative.cosine_similarity.median(),6)
    train_set_worst=train_set_negative.loc[train_set_negative.cosine_similarity<median_cos_sim]

    ## label those examples
    train_set_positive=train_set_positive.assign(label=1).loc[:,output_columns]
    train_set_worst=train_set_worst.assign(label=0).loc[:,output_columns]

    train_set_positive=pd.concat([train_set_positive,website_positive])

    ## positives are sampled to about 10 percent of the number of negatives,
    ## capped at the number of positives available (sample() raises otherwise)
    positive_obs=min(round(train_set_worst.shape[0]*0.10),train_set_positive.shape[0])

    train_set_positive_sample=train_set_positive.sample(n=positive_obs,random_state=42)

    train_set_final=pd.concat([train_set_worst,train_set_positive_sample])

    ## NOTE(review): drop_duplicates keeps the first occurrence, so a title present
    ## both in the negative rows and in the sampled positives keeps the negative row
    train_set_final=train_set_final.drop_duplicates(subset=['title'])

    unused_positive=train_set_positive.loc[~train_set_positive.title.isin(train_set_final.title.unique())].drop_duplicates(subset=['title'])

    unlabeled=train_set_negative.loc[(~train_set_negative.title.isin(train_set_final.title.unique())) &
                                     (~train_set_negative.title.isin(unused_positive.title.unique())),
                                     ['ID', 'title', 'abstract', 'title_abstract', 'year', 'url']].drop_duplicates(subset=['title'])

    return (train_set_final,unused_positive,unlabeled)

In [3]:
def main():
    """Read the input files, label the train set and write the three result files.

    Inputs are read from ``../../data/`` and ``../../outputs/general/``; the
    labeled train set, the unlabeled set and the unused positive examples are
    written as CSV files to ``../../outputs/sg_classifier/``.
    """
    data_path="../../data/"
    outputs_path="../../outputs/"
    train_set=pd.read_csv(outputs_path+"general/train_set.csv")
    test_set=pd.read_csv(outputs_path+"general/test_set_SG_annotate.csv")
    ## help for filtering positive examples
    workshops=pd.read_csv(data_path+"others/sg_workshops.csv")
    keywords=pd.read_csv(data_path+"others/sg_keywords.csv")
    ## labeled positive examples
    acl_path=data_path+"papers/acl20_long.csv"
    try:
        ## malformed lines are skipped with a warning; `error_bad_lines` was
        ## removed in pandas 2.0 in favour of `on_bad_lines`
        acl_labeled=pd.read_csv(acl_path,on_bad_lines="warn")
    except TypeError:
        ## pandas < 1.3 does not know `on_bad_lines`
        acl_labeled=pd.read_csv(acl_path,error_bad_lines=False)
    website_positive=pd.read_json(data_path+"papers/papers.json")

    train_set_final,unused_positive,unlabeled=label_data(train_set,test_set,keywords,workshops,acl_labeled,website_positive)

    train_set_final.to_csv(outputs_path+"sg_classifier/train_set_labeled.csv",index=False)
    unlabeled.to_csv(outputs_path+"sg_classifier/unlabeled_set.csv",index=False)
    unused_positive.to_csv(outputs_path+"sg_classifier/unused_positive.csv",index=False)

## run the labeling pipeline only when executed as a script, not when imported
if __name__ == '__main__':
    main()



