In [1]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from datasets import load_metric
from datasets import Dataset
import torch
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from sklearn.metrics import classification_report
import os
import sys
import pandas as pd
import numpy as np
import argparse
import re
from collections import Counter
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

In [2]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = accuracy_score(y_true=labels, y_pred=predictions)
    recall = recall_score(y_true=labels, y_pred=predictions)
    precision = precision_score(y_true=labels, y_pred=predictions)
    recall_w = recall_score(y_true=labels, y_pred=predictions,average='weighted')
    precision_w = precision_score(y_true=labels, y_pred=predictions,average='weighted')
    f1 = f1_score(y_true=labels, y_pred=predictions)
    f1_pos = f1_score(y_true=labels, y_pred=predictions,average='binary',pos_label=1)
    f1_micro = f1_score(y_true=labels, y_pred=predictions,average='micro')
    f1_weighted = f1_score(y_true=labels, y_pred=predictions,average='weighted')
    return {"accuracy": accuracy, "precision": precision, "recall": recall,
             "precision_w": precision_w, "recall_w": recall_w,
             "f1": f1,"f1_pos": f1_pos,
            "f1_micro": f1_micro,"f1_weighted": f1_weighted} 
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True,max_length=512)

In [3]:
def get_report(df_results):
    """Get classification report

    Parameters:
    df_results: should have column label and prediction
    """
    cr=classification_report(df_results.label,df_results.prediction,digits=4,output_dict=True)

    cr=pd.DataFrame(cr).reset_index().rename(columns={'index':'metric'})

    cr_df=pd.melt(cr,id_vars=['metric'],value_vars=['0','1','accuracy','macro avg','weighted avg'])

    cr_df=cr_df.loc[cr_df.metric!="support"]
    cr_df=cr_df.loc[~((cr_df.variable=="accuracy") & (cr_df.metric.isin(['precision','recall'])))]

    cr_df=cr_df.assign(variable=np.where(cr_df.variable=='0','negative',
                                        np.where(cr_df.variable=='1','positive',cr_df.variable)))

    cr_df=cr_df.assign(value=cr_df.value.apply(lambda x:round(x,4)*100))
    return cr_df

In [4]:
def get_all_results(df_predictions,df_labeled):
    df_labeled=df_labeled.assign(prediction=df_labeled.label)
    df_predictions=pd.concat([df_predictions,df_labeled])
    total_positives=df_predictions.loc[df_predictions.prediction==1]
    total_negatives=df_predictions.loc[df_predictions.prediction==0]
    total_negatives=total_negatives.loc[:,['ID','title','abstract','url','year','title_abstract']]
    total_negatives=total_negatives.assign(label=0)
    total_positives=total_positives.loc[:,['ID','title','abstract','url','year','title_abstract']]
    total_positives=total_positives.assign(label=1)
    return total_positives,total_negatives

In [5]:
def evaluate(df_test_final,trainer,threshold=0.5):
    """Get predictions

    Parameters:
    df_test_final (df): dataframe with text for predictions
    trainer: Trainer with all the configurations
    Returns:
    dataset_test_final_pd
    """
    dataset_test_final = Dataset.from_pandas(df_test_final)
    tokenized_datasets_test_final = dataset_test_final.map(tokenize_function, batched=True)

    test_results_final = trainer.predict(tokenized_datasets_test_final)
    preds_final=[]
    for e in test_results_final.predictions:
        preds_final.append(np.array(torch.softmax(torch.Tensor(e), dim=0)))

    preds_final=np.vstack(preds_final)
    dataset_test_final_pd=tokenized_datasets_test_final.data.to_pandas()

    dataset_test_final_pd=dataset_test_final_pd.assign(proba0=preds_final[:,0])
    dataset_test_final_pd=dataset_test_final_pd.assign(proba1=preds_final[:,1])
    dataset_test_final_pd=dataset_test_final_pd.assign(prediction=np.where(dataset_test_final_pd.proba1>threshold,1,0))
    return dataset_test_final_pd

In [6]:
data_path="../../data/"
outputs_path="../../outputs/"
## READ DATA
workshops=pd.read_csv(data_path+"others/sg_workshops_v3.csv")
keywords=pd.read_csv(data_path+"others/sg_keywords_v6.csv")
## text info of the dataset (it is more complete since it was extracted directly from the pdfs)

## annotated test dataset
df_test_final=pd.read_csv(outputs_path+"general/test_set_final.csv")
df_dev_final=pd.read_csv(outputs_path+"general/dev_set_final.csv")

df_unused=pd.read_csv(outputs_path+"sg_classifier/weakly_labeled_unused_bronze_title_15pct_f.csv")
df_unlabeled=pd.read_csv(outputs_path+"sg_classifier/unlabeled_set_bronze_title_15pct_f.csv")
df_unused=df_unused.assign(text=df_unused.title_abstract)
df_unlabeled=df_unlabeled.assign(text=df_unlabeled.title_abstract)

df_labeled=pd.read_csv(outputs_path+"sg_classifier/train_set_labeled_bronze_title_15pct_f.csv")

model = AutoModelForSequenceClassification.from_pretrained("./model_scibert/", num_labels=2)



In [7]:
## Predict test dataset

training_args = TrainingArguments(output_dir="./model_scibert/", evaluation_strategy="epoch",
                                 per_device_train_batch_size=16,per_device_eval_batch_size=16,
                                 seed=42,num_train_epochs=5,auto_find_batch_size=True,
                                     do_train = False,do_predict = True)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
)

In [8]:
df_general=pd.concat([df_unused,df_unlabeled]).reset_index(drop=True)

df_general.label=df_general.label.fillna(0)

df_general.label=df_general.label.apply(int)

In [9]:
df_general.shape

(37644, 8)

In [13]:
df_predictions=evaluate(df_general,trainer,0.5)

total_positives,total_negatives=get_all_results(df_predictions,df_labeled)
#total_positives.to_csv(outputs_path+"sg_classifier/all_positive_examples_final.csv",index=False)
#total_negatives.to_csv(outputs_path+"sg_classifier/all_negative_examples_final.csv",index=False)

df_predictions.shape

  0%|          | 0/38 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: abstract, ID, text, title_abstract, year, url, title. If abstract, ID, text, title_abstract, year, url, title are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 37644
  Batch size = 16


(37644, 14)

In [14]:
total_positives.shape

(8133, 7)

In [15]:
df_labeled.shape

(30000, 7)

In [11]:
total_positives.shape

(8449, 7)

In [8]:
df_labeled.shape

(2500, 6)

In [9]:
df_test_final=df_test_final.iloc[:2000,:]

In [10]:
df_test_final.shape

(2000, 17)

In [11]:
np.random.seed(seed=42)
#random_pred=np.random.uniform(0,1,df_test_final.shape[0])
random_pred=np.random.choice([0, 1], size=(df_test_final.shape[0],), p=[0.5, 0.5])
df_test_final['random_proba_1']=random_pred

df_random=df_test_final.assign(prediction=np.where(df_test_final.random_proba_1>0.5,1,0))

cr_random=get_report(df_random)

#df_minority=df_test_final.assign(prediction=1)

#cr_random=get_report(df_minority)

In [12]:
cr_random

Unnamed: 0,metric,variable,value
0,precision,negative,83.89
1,recall,negative,47.67
2,f1-score,negative,60.79
4,precision,positive,10.27
5,recall,positive,39.54
6,f1-score,positive,16.3
10,f1-score,accuracy,46.6
12,precision,macro avg,47.08
13,recall,macro avg,43.61
14,f1-score,macro avg,38.55


In [13]:
np.random.seed(seed=42)
random_pred=np.random.choice([0, 1], size=(df_test_final.shape[0],), p=[0.9, 0.1])
df_test_final['random_proba_1']=random_pred

df_random=df_test_final.assign(prediction=np.where(df_test_final.random_proba_1>0.5,1,0))

cr_random=get_report(df_random)


In [14]:
cr_random

Unnamed: 0,metric,variable,value
0,precision,negative,86.41
1,recall,negative,89.69
2,f1-score,negative,88.02
4,precision,positive,9.14
5,recall,positive,6.84
6,f1-score,positive,7.83
10,f1-score,accuracy,78.8
12,precision,macro avg,47.77
13,recall,macro avg,48.27
14,f1-score,macro avg,47.92


In [15]:
df_results_dev=evaluate(df_dev_final,trainer,0.5)

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: abstract_clean, method_annotation, title, text, org_annotation, goal1_raw, year, task_annotation, url, abstract, title_clean, goal3_raw, acknowledgments_clean, ID, title_abstract_clean, goal2_raw. If abstract_clean, method_annotation, title, text, org_annotation, goal1_raw, year, task_annotation, url, abstract, title_clean, goal3_raw, acknowledgments_clean, ID, title_abstract_clean, goal2_raw are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 500
  Batch size = 16


In [16]:
df_results_dev.shape

(500, 23)

In [22]:
best_threshold=0.5
best_f1=0
#for p in [0,0.01,0.02,0.03,0.04,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.96,0.97,0.98,0.99,0.999,0.9999,0.99999,0.9999999]:
for p in [0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95]:
    df_analyze=df_results_dev.assign(prediction2=np.where(df_results_dev.proba1>p,1,0))
    f1=classification_report(df_analyze.label,df_analyze.prediction2,digits=4,output_dict=True)['1']['f1-score']
    if f1>best_f1:
        best_f1=f1
        best_threshold=p
    print(p)
    print(classification_report(df_analyze.label,df_analyze.prediction2,digits=4))
    print("##########################")

0
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       437
           1     0.1260    1.0000    0.2238        63

    accuracy                         0.1260       500
   macro avg     0.0630    0.5000    0.1119       500
weighted avg     0.0159    0.1260    0.0282       500

##########################
0.05
              precision    recall  f1-score   support

           0     0.9438    0.9611    0.9524       437
           1     0.6909    0.6032    0.6441        63

    accuracy                         0.9160       500
   macro avg     0.8174    0.7821    0.7982       500
weighted avg     0.9120    0.9160    0.9135       500

##########################
0.1
              precision    recall  f1-score   support

           0     0.9420    0.9657    0.9537       437
           1     0.7115    0.5873    0.6435        63

    accuracy                         0.9180       500
   macro avg     0.8268    0.7765    0.7986       500
weighted 

In [23]:
best_f1

0.6666666666666666

In [24]:
best_threshold

0.15

In [25]:
df_results=evaluate(df_test_final,trainer,best_threshold)

cr_final=get_report(df_results)
cr_final=cr_final.rename(columns={'value':'Fine tuned BERT'})

  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: abstract_clean, method_annotation, title, text, org_annotation, goal1_raw, random_proba_1, year, task_annotation, url, abstract, title_clean, goal3_raw, acknowledgments_clean, ID, title_abstract_clean, goal2_raw. If abstract_clean, method_annotation, title, text, org_annotation, goal1_raw, random_proba_1, year, task_annotation, url, abstract, title_clean, goal3_raw, acknowledgments_clean, ID, title_abstract_clean, goal2_raw are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 16


In [26]:
#gold
print(best_threshold)
print(best_f1)
print(classification_report(df_results.label,df_results.prediction,digits=4))

0.15
0.6666666666666666
              precision    recall  f1-score   support

           0     0.9447    0.9741    0.9592      1737
           1     0.7847    0.6236    0.6949       263

    accuracy                         0.9280      2000
   macro avg     0.8647    0.7988    0.8270      2000
weighted avg     0.9237    0.9280    0.9244      2000



In [21]:
#gold
print(best_threshold)
print(best_f1)
print(classification_report(df_results.label,df_results.prediction,digits=4))

0.01
0.6721311475409837
              precision    recall  f1-score   support

           0     0.9513    0.9551    0.9532      1737
           1     0.6953    0.6768    0.6859       263

    accuracy                         0.9185      2000
   macro avg     0.8233    0.8160    0.8196      2000
weighted avg     0.9176    0.9185    0.9180      2000



In [21]:
#silver_keytitle_15pct_f
print(best_threshold)
print(best_f1)
print(classification_report(df_results.label,df_results.prediction,digits=4))

0.75
0.7924528301886793
              precision    recall  f1-score   support

           0     0.9461    0.9902    0.9677      1737
           1     0.9066    0.6274    0.7416       263

    accuracy                         0.9425      2000
   macro avg     0.9263    0.8088    0.8546      2000
weighted avg     0.9409    0.9425    0.9379      2000



In [33]:
#silver_keytitle_15pct_f
print(best_threshold)
print(best_f1)
print(classification_report(df_results.label,df_results.prediction,digits=4))

0.9
0.7924528301886793
              precision    recall  f1-score   support

           0     0.9441    0.9925    0.9677      1737
           1     0.9253    0.6122    0.7368       263

    accuracy                         0.9425      2000
   macro avg     0.9347    0.8023    0.8523      2000
weighted avg     0.9417    0.9425    0.9374      2000



In [21]:
#bronze
print(best_threshold)
print(best_f1)
print(classification_report(df_results.label,df_results.prediction,digits=4))

0.2
0.7826086956521738
              precision    recall  f1-score   support

           0     0.9525    0.9816    0.9668      1737
           1     0.8476    0.6768    0.7526       263

    accuracy                         0.9415      2000
   macro avg     0.9001    0.8292    0.8597      2000
weighted avg     0.9387    0.9415    0.9387      2000



In [32]:
#bronze
print(best_threshold)
print(best_f1)
print(classification_report(df_results.label,df_results.prediction,digits=4))

0.3
0.7826086956521738
              precision    recall  f1-score   support

           0     0.9525    0.9822    0.9671      1737
           1     0.8517    0.6768    0.7542       263

    accuracy                         0.9420      2000
   macro avg     0.9021    0.8295    0.8607      2000
weighted avg     0.9393    0.9420    0.9391      2000



## Results analysis

In [28]:
df_results=df_results.assign(goal1=np.where(df_results['goal1_raw'].str.lower().str.contains("education"),'Quality Education',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("poverty"),'No Poverty',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("hunger"),'Zero Hunger',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("clean_water"),'Clean Water and Sanitation',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("clean_energy"),'Affordable and Clean Energy',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("life_land"),'Life on Land',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("marine_life"),'Life Below Water',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("health"),'Good Health and Well-Being',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("climate"),'Climate Action',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("peace|privacy|disinformation_and_fake_news|deception|hate"),'Peace, Justice and Strong Institutions',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("social biases|race & identity"),'Reduced Inequalities',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("industry|innovation|research"),'Industry, Innovation and Infrastructure',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("sustainable cities|sustainable_cities"),'Sustainable Cities and Communities',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("gender"),'Gender Equality',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("decent work|decent_work_and_economy"),'Decent Work and Economic Growth',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("partnership"),'Partnership for the goals',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("responsible_consumption_and_production"),'Responsible Consumption and Production',
                    np.where(df_results['goal1_raw'].str.lower().str.contains("reduced|social_equality"),'Reduced Inequalities',''
                          )))))))))))))))))))

df_results=df_results.assign(goal2=np.where(df_results['goal2_raw'].str.lower().str.contains("education"),'Quality Education',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("poverty"),'No Poverty',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("hunger"),'Zero Hunger',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("clean_water"),'Clean Water and Sanitation',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("clean_energy"),'Affordable and Clean Energy',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("life_land"),'Life on Land',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("marine_life"),'Life Below Water',         
                    np.where(df_results['goal2_raw'].str.lower().str.contains("health"),'Good Health and Well-Being',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("climate"),'Climate Action',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("peace|privacy|disinformation_and_fake_news|deception|hate"),'Peace, Justice and Strong Institutions',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("social biases|race & identity"),'Reduced Inequalities',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("industry|innovation|research"),'Industry, Innovation and Infrastructure',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("sustainable cities|sustainable_cities"),'Sustainable Cities and Communities',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("gender"),'Gender Equality',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("decent work|decent_work_and_economy"),'Decent Work and Economic Growth',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("partnership"),'Partnership for the goals',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("responsible_consumption_and_production"),'Responsible Consumption and Production',
                    np.where(df_results['goal2_raw'].str.lower().str.contains("reduced|social_equality"),'Reduced Inequalities',''
                          )))))))))))))))))))

df_results=df_results.assign(goal3=np.where(df_results['goal3_raw'].str.lower().str.contains("education"),'Quality Education',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("poverty"),'No Poverty',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("hunger"),'Zero Hunger',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("clean_water"),'Clean Water and Sanitation',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("clean_energy"),'Affordable and Clean Energy',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("life_land"),'Life on Land',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("marine_life"),'Life Below Water',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("health"),'Good Health and Well-Being',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("climate"),'Climate Action',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("peace|privacy|disinformation_and_fake_news|deception|hate"),'Peace, Justice and Strong Institutions',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("social biases|race & identity"),'Reduced Inequalities',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("industry|innovation|research"),'Industry, Innovation and Infrastructure',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("sustainable cities|sustainable_cities"),'Sustainable Cities and Communities',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("gender"),'Gender Equality',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("decent work|decent_work_and_economy"),'Decent Work and Economic Growth',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("partnership"),'Partnership for the goals',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("responsible_consumption_and_production"),'Responsible Consumption and Production',
                    np.where(df_results['goal3_raw'].str.lower().str.contains("reduced|social_equality"),'Reduced Inequalities',''
                          )))))))))))))))))))

In [29]:
wrong=df_results.loc[(df_results.proba1<=0.5) & (df_results.label==1)].sort_values('proba1',ascending=False).copy()

wrong_fp=df_results.loc[(df_results.proba1>0.5) & (df_results.label==0)].sort_values('proba1',ascending=False)

In [37]:
wrong_fp=df_results.loc[(df_results.proba1>0.5) & (df_results.label==0)].sort_values('proba1',ascending=False)

In [34]:
df_results.loc[df_results.label==1].shape[0]

264

In [35]:
df_results['goal1'].value_counts()/df_results.loc[df_results.label==1].shape[0]

                                           6.511364
Peace, Justice and Strong Institutions     0.291667
Good Health and Well-Being                 0.265152
Quality Education                          0.208333
Industry, Innovation and Infrastructure    0.094697
Decent Work and Economic Growth            0.087121
Reduced Inequalities                       0.060606
Gender Equality                            0.034091
Partnership for the goals                  0.022727
Name: goal1, dtype: float64

In [36]:
wrong['goal1'].value_counts()/wrong.loc[df_results.label==1].shape[0]

Quality Education                          0.303371
Industry, Innovation and Infrastructure    0.202247
Peace, Justice and Strong Institutions     0.191011
Decent Work and Economic Growth            0.123596
Good Health and Well-Being                 0.101124
Partnership for the goals                  0.056180
Reduced Inequalities                       0.022472
Name: goal1, dtype: float64

In [30]:
df_results['goal1'].value_counts()

                                           1719
Peace, Justice and Strong Institutions       77
Good Health and Well-Being                   70
Quality Education                            55
Industry, Innovation and Infrastructure      25
Decent Work and Economic Growth              23
Reduced Inequalities                         16
Gender Equality                               9
Partnership for the goals                     6
Name: goal1, dtype: int64

In [31]:
wrong['goal1'].value_counts()

Quality Education                          27
Industry, Innovation and Infrastructure    18
Peace, Justice and Strong Institutions     17
Decent Work and Economic Growth            11
Good Health and Well-Being                  9
Partnership for the goals                   5
Reduced Inequalities                        2
Name: goal1, dtype: int64

In [41]:

most_common=Counter(" ".join(df_results["text"].str.lower()).split()).most_common(60)

common=set()
for e,c in most_common:
    common.add(e)


wrong_common_fp=Counter(" ".join(wrong_fp["text"]).split()).most_common(60)

wr_common_fp=set()
for e,c in wrong_common_fp:
    wr_common_fp.add(e)

In [45]:
wrong_common_fp

[('the', 168),
 ('and', 119),
 ('of', 117),
 ('a', 82),
 ('to', 60),
 ('in', 59),
 ('for', 46),
 ('that', 29),
 ('with', 27),
 ('is', 27),
 ('on', 25),
 ('we', 24),
 ('The', 23),
 ('We', 22),
 ('as', 22),
 ('by', 20),
 ('are', 20),
 ('this', 18),
 ('an', 17),
 ('our', 15),
 ('from', 14),
 ('translation', 12),
 ('In', 11),
 ('A', 9),
 ('can', 9),
 ('which', 9),
 ('different', 9),
 ('Language', 8),
 ('paper', 8),
 ('have', 8),
 ('MT', 8),
 ('KD', 8),
 ('performance', 8),
 ('method', 7),
 ('show', 7),
 ('given', 7),
 ('how', 7),
 ('task', 7),
 ('system', 7),
 ('data', 7),
 ('use', 7),
 ('into', 7),
 ('between', 7),
 ('at', 7),
 ('social', 7),
 ('these', 6),
 ('results', 6),
 ('provide', 6),
 ('research', 6),
 ('approach', 6),
 ('or', 6),
 ('This', 6),
 ('be', 6),
 ('methods', 6),
 ('work', 6),
 ('propose', 6),
 ('error', 6),
 ('using', 5),
 ('their', 5),
 ('they', 5)]

In [42]:
wr_common_fp-common

{'A',
 'In',
 'KD',
 'Language',
 'MT',
 'The',
 'This',
 'We',
 'error',
 'given',
 'how',
 'into',
 'method',
 'methods',
 'propose',
 'provide',
 'research',
 'social',
 'they',
 'work'}

In [50]:
tp=df_results.loc[(df_results.proba1>=0.5) & (df_results.label==1)].sort_values('proba1',ascending=False).copy()

In [51]:
tp.loc[tp.text.str.contains('social')].shape[0]/tp.shape[0]

0.18285714285714286

In [49]:
df_results.loc[df_results.text.str.contains('social')].shape[0]/df_results.shape[0]

0.0365

In [48]:
wrong_fp.loc[wrong_fp.text.str.contains('social')].shape[0]/wrong_fp.shape[0]

0.14285714285714285

In [46]:
wrong_fp.loc[wrong_fp.text.str.contains('social')].text.values

array(["LIE: Leadership, Influence and Expertise. This paper describes our research into methods for inferring social and instrumental roles and relationships from document and discourse corpora. The goal is to identify the roles of initial authors and participants in internet discussions with respect to leadership, influence and expertise. Web documents, forums and blogs provide data from which the relationships between these concepts are empirically derived and compared. Using techniques from Natural Language Processing (NLP), characterizations of authority and expertise are hypothesized and then tested to see if these pick out the same or different participants as may be chosen by techniques based on social network analysis (Huffaker 2010) see if they pick out the same discourse participants for any given level of these qualities (i.e. leadership, expertise and influence). Our methods could be applied, in principle, to any domain topic, but this paper will describe an initial invest

In [None]:

if args.option=='0':
    df_results=evaluate(df_test_final,trainer)
    df_results=df_results.merge(df_lan,how='left')
    df_results=df_results.loc[df_results.lan=='en',:]
    cr_final=get_report(df_results)
    cr_final=cr_final.rename(columns={'value':'Fine tuned BERT'})
    cr_final.to_csv(outputs_path+"sg_classifier/scores_bert.csv",index=False)
elif args.option=='1':
    df_test_final=df_test_final.merge(df_lan,how='left')
    df_test_final=df_test_final.loc[df_test_final.lan=='en',:]
    df_results=evaluate(df_test_final,trainer)
    df_rules=get_rules_classifier(df_test_final,match_unique,keywords,workshops)
    df_unlabeled=evaluate(df_unlabeled,trainer)

    total_positives,total_negatives=get_all_results(df_unused_positive,df_labeled,df_unlabeled)
    total_positives.to_csv(outputs_path+"sg_classifier/all_positive_examples.csv",index=False)
    total_negatives.to_csv(outputs_path+"sg_classifier/all_negative_examples.csv",index=False)

    cr_model=get_report(df_results)
    cr_df_rules=get_report(df_rules)
    cr_df_rules=cr_df_rules.rename(columns={'value':'Rules classifier'})
    cr_model=cr_model.rename(columns={'value':'Fine tuned BERT'})
    cr_final=cr_model.merge(cr_df_rules,on=['metric','variable'])
    cr_final.to_csv(outputs_path+"sg_classifier/scores.csv",index=False)

print(cr_final)

In [None]:
def main():
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('-o','--option', nargs='?', help='1 for full results, 0 evaluate only test set',default='1')
    args = parser.parse_args()


if __name__ == '__main__':
    main()