In [3]:
import yaml
import os
import langid
import pandas as pd


In [2]:
from src.idsp_kedro.pipelines.disease_ner.disease_ner_nodes import DiseaseTagger
from src.idsp_kedro.pipelines.relevancy_classifier.nodes import RelevanceTagger
from src.idsp_kedro.pipelines.translate.nodes import Translator
from src.idsp_kedro.pipelines.event_extraction.extraction_nodes import EventExtractor
from src.idsp_kedro.pipelines.location_date_ner.nodes import LocationDateTagger
from src.idsp_kedro.pipelines.event_extraction.filtering_nodes import EventFilterer
from src.idsp_kedro.pipelines.noevents_extraction.extraction_nodes import NoEventExtractorNLI

Need to compile C++ extensions to get sparse attention suport. Please run python setup.py build develop


/home/ubuntu/venv-prod/lib/python3.8/site-packages/xformers/_C.so: undefined symbol: _ZNR5torch7Library5_implEPKcONS_11CppFunctionE
Initializing vocab and bpe
Initializing model for translation


2024-02-22 16:19:50 | INFO | fairseq.tasks.translation | [SRC] dictionary: 35904 types
2024-02-22 16:19:50 | INFO | fairseq.tasks.translation | [TGT] dictionary: 32088 types


# Dataset

In [4]:
path = './750_annotated_articles.csv'
main_df = pd.read_csv(path, encoding='utf-8', lineterminator='\n')
print('len of main_df:', main_df.shape[0])

main_df['GT_Events'] = main_df['GT_Events'].apply(eval)
main_df = main_df[main_df['Article'].notnull()]
main_df.reset_index(drop=True, inplace=True)
print('Number of Articles after removing empty articles:', main_df.shape[0])

N_no_events = sum(main_df['GT_Events'].apply(lambda x: 0 if len(x) else 1))
N_events = main_df.shape[0] - N_no_events
print('Articles with atleast one Event:', N_events)
print('Number of no event Articles:', N_no_events)

main_df['complete_article'] = main_df['Article']
main_df['lang'] = 'en'


len of main_df: 750
Number of Articles after removing empty articles: 750
Articles with atleast one Event: 394
Number of no event Articles: 356


In [4]:
class Pipeline:
    def __init__(self):
        self.params = yaml.safe_load(open("/home/ubuntu/devesh/Prod_change_12th_jan/idsp_5th_feb/idsp-score/conf/local/parameters.yml"))
        self.params["common_params"]["lang_dict"] = "/home/ubuntu/devesh/Prod_change_12th_jan/idsp_5th_feb/idsp-score/data/05_model_input/lang_dict.json"
        self.device_params = self.params["device_params"]
        self.common_params = self.params["common_params"]
        
        
    def language_classifier(self, df):
        print('Language Classifier...')
        df['lang'] = df['Original_Article'].apply(lambda x: langid.classify(x)[0])
        return df
    
    def relevancy_classifier(self, df):
        
        print('Relevancy Classifier...')
        relevance_params = self.params["relevance_params"]
        relevance_tagger_obj = RelevanceTagger(
                 relevance_params, self.device_params, self.common_params
         )        
        # we don't have title and description separately available, we perform it on the whole article one by one
        def get_relevance_per_article(article, lang):
            relevancy,_ = relevance_tagger_obj.get_relevance_per_article(article=article, lang=lang)
            return relevancy

        relevancy_preds = []
        for itr, article in enumerate(df['complete_article']):
            relevancy = get_relevance_per_article(article, df['lang'][itr])
            relevancy_preds.append(relevancy)

        df['relevant'] = [1 if i == 1 else 0 for i in relevancy_preds]
  
        try:
            df = df[df["relevant"] == 1]
        except:
            df = df
        print('Number of Relevant Articles:', df.shape[0])
        return df
    
    def translate(self, df):
        translator_obj = Translator(common_params=self.common_params)
        translated_articles = translator_obj.run_translate_to_en(articles=df['Original_Article'])
        df['complete_article'] = translated_articles
        df['lang'] = 'en'

        return df
    
    def disease_ner(self, df):
        
        print('Disease NER...')
        disease_tagger_obj = DiseaseTagger(
        disease_ner_params=self.params["disease_ner_params"],
        device_params=self.params["device_params"],
        common_params=self.params["common_params"])
        df = disease_tagger_obj.perform_disease_ner_on_df(df)
        return df
      
    def location_ner(self, df):
        print('Location NER...')
        location_tagger_obj = LocationDateTagger(
            ner_params=self.params["location_date_ner_params"],
            common_params=self.params["common_params"])
        
        df = location_tagger_obj.get_location_date_for_df(df)
        
        return df
        
    def qa(self, df):
        print('QA...')
        event_extractor_obj = EventExtractor(
            event_ext_params=self.params["event_extraction_params"],
            device_params=self.device_params,
            common_params=self.common_params,
        )
        event_filter_obj = EventFilterer(common_params=self.common_params)
        total_events_list = event_extractor_obj.perform_event_extraction(df, preprocess=False)
        df = event_filter_obj.filter_events_df(df, total_events_list)
        return df

        
    def nli(self, df):
        print('Numberless Event Extraction...')
        noevent_extraction_params=self.params["noevent_extraction_params"]
        noevent_ext_obj = NoEventExtractorNLI(noevent_extraction_params,
                                              self.device_params,
                                              self.common_params)
        
        total_events_list = noevent_ext_obj.perform_noevent_extraction_and_filter(df, preprocess=False)
        df["noevents_events"] = total_events_list
        
        return df
        


In [5]:

pipeline = Pipeline()
# # Relevancy Classifier
df = pipeline.relevancy_classifier(main_df)
# Disease NER
df = pipeline.disease_ner(df)
# Location NER
df = pipeline.location_ner(df)
print(df['diseases'], df['location'])
df.rename(columns={'dates':'date'}, inplace=True)
df.reset_index(drop=True, inplace=True)
# QA
df = pipeline.qa(df)
df.loc[:,"old_data_cluster_idx"] = ''
# NLI
df = pipeline.nli(df)

Relevancy Classifier...




Number of Relevant Articles: 735
Disease NER...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["diseases"] = diseases_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["spans"] = spans_list


Location NER...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["location"] = locations_list
2024-02-22 16:25:56 | CRITICAL | pipeline_throughput | No. of articles that have atleast 1 location: 658
2024-02-22 16:25:56 | INFO | root | No. of articles that have atleast 1 location: 658
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["dates"] = dates_list
2024-02-22 16:25:56 | CRITICAL | pipeline_throughput | No. of articles that have atleast 1 date: 89
2024-02-22 16:25:56 | INFO | root | No. of articles that have atleast 1 date: 89
A value is trying to be set on a copy of 

0      covid-19,covid - 19,covid - 19 infected
1                             panic,burnt,fire
2                           lumpy skin disease
3                               food poisoning
4                                  heat stroke
                        ...                   
745                                           
746                            abuse,sex abuse
747                         lumpy skin disease
748                                       fire
749                         dengue,chikungunya
Name: diseases, Length: 735, dtype: object 0                                   [(gwalior, Gwalior)]
1      [(nagar, Nagar), (along, Along), (jodhpur, Jod...
2      [(uttarakhand, Uttarakhand), (uttar pradesh, U...
3                               [(jharkhand, Jharkhand)]
4             [(bihar, Bihar), (aurangabad, Aurangabad)]
                             ...                        
745                                     [(patna, Patna)]
746                                      

100%|██████████| 735/735 [40:56<00:00,  3.34s/it]    
2024-02-22 17:07:06 | CRITICAL | pipeline_throughput | Raw Events extracted 4195
2024-02-22 17:07:06 | INFO | root | Raw Events extracted 4195
2024-02-22 17:07:07 | CRITICAL | pipeline_throughput | Total Events after filtering 923
2024-02-22 17:07:07 | INFO | root | Total Events after filtering 923
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["events"] = filtered_events_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,"old_data_cluster_idx"] = ''


Numberless Event Extraction...


Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 52%|█████▏    | 379/735 [05:31<02:17,  2.60it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (656 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 735/735 [09:47<00:00,  1.25it/s]  
2024-02-22 17:17:03 | CRITICAL | pipeline_throughput | Total articles f

In [6]:
df.to_csv('final_output_end-to-end_new.csv', index=False)

# Evaluation

In [17]:
import json
class EventExtractionEvaluation:
    def __init__(self, df):
        self.df = df
        self.synonyms = json.load(open("/home/ubuntu/devesh/Prod_change_12th_jan/idsp_5th_feb/idsp-score/data/05_model_input/disease_synonyms.json"))

    def group_diseases(self, events):
        new_events = []
        for event in events:
            disease = event["disease"]
            for key in self.synonyms.keys():
                synonyms_for_disease = [x.lower() for x in self.synonyms[key]]
                if disease.lower().lstrip().rstrip() in synonyms_for_disease:
                    event["disease"] = key.lower()
                    event["original_disease"] = disease
                    break  
            new_events.append(event)
        return new_events

    def precision_recall_method_1(self, pred, gt):
        
        gt = set(tuple(sorted(d.items())) for d in gt)
        pred = set(tuple(sorted(d.items())) for d in pred)
        
        tp = len(pred.intersection(gt))
        fp = len(pred.difference(gt))
        fn = len(gt.difference(pred))

        precision = tp / (tp + fp) if tp+fp > 0 else 1.0
        recall = tp / (tp + fn) if (tp+fn) > 0 else 1.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 1.0
        exact_match = 1.0 if pred == gt else 0.0
        
        return precision, recall, f1, exact_match
        
    def precision_recall_method_2(self, pred, gt):
                
        # strip spaces from the values
        # for d in gt:
        #     for k, v in d.items():
        #         d[k] = v.strip()
        # for d in pred:
        #     for k, v in d.items():
        #         d[k] = v.strip()
                
        gt = set(tuple(sorted(d.items())) for d in gt)
        pred = set(tuple(sorted(d.items())) for d in pred)
        tp = len(pred.intersection(gt))
        fp = len(pred.difference(gt))
        fn = len(gt.difference(pred))
        
        if fp > 1:
            print(gt)
            print(pred)
            
        if tp == 0 and fp == 0 and fn == 0:
            precision = 1.0; recall = 1.0; f1 = 1.0
        elif tp == 0 and (fp > 0 or fn > 0):
            precision = 0.0; recall = 0.0; f1 = 0.0
        else:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall)
        exact_match = 1.0 if pred == gt else 0.0
        
        return precision, recall, f1, exact_match


    def jaccard_index(self, pred, gt):
        gt = set(tuple(sorted(d.items())) for d in gt)
        pred = set(tuple(sorted(d.items())) for d in pred)
        if len(pred) == 0 and len(gt) == 0: return 1
        intersection = len(pred.intersection(gt))
        union = len(pred.union(gt))
        return intersection / union
    

    def subset_accuracy(self, pred, gt):
        gt = set(tuple(sorted(d.items())) for d in gt)
        pred = set(tuple(sorted(d.items())) for d in pred)
        return float(pred.issubset(gt))

    def evaluate_event_extraction(self):
        # Group diseases using synonyms list
        self.df['GT_Events'] = self.df['GT_Events'].apply(lambda events: self.group_diseases(events))
        self.df['events'] = self.df['events'].apply(lambda events: self.group_diseases(events))
        
        # keep only keys disease, location, incident, incident_type, number for comparison

        def keys_to_keep(d):
            return {k: v for k, v in d.items() if k in ['disease', 'location', 'incident', 'incident_type', 'number']}
        self.df['GT_Events'] = self.df['GT_Events'].apply(lambda x: [keys_to_keep(i) for i in x])
        self.df['events'] = self.df['events'].apply(lambda x: [keys_to_keep(i) for i in x])
        
        
        # apply precision_recall_method_2 to each row
        self.df['precision'], self.df['recall'], self.df['f1'], self.df['exact_match'] = zip(*self.df.apply(lambda row: self.precision_recall_method_2(row['events'], row['GT_Events']), axis=1))
        self.df['jaccard'] = self.df.apply(lambda row: self.jaccard_index(row['events'], row['GT_Events']), axis=1)
        self.df['subset'] = self.df.apply(lambda row: self.subset_accuracy(row['events'], row['GT_Events']), axis=1)
        
        metrics = {
            "precision": self.df['precision'].mean(),
            "recall": self.df['recall'].mean(),
            "f1": self.df['f1'].mean(),
            "exact_match": self.df['exact_match'].mean(),
            "jaccard": self.df['jaccard'].mean(),
            "subset": self.df['subset'].mean()
        }
        
        return metrics

In [31]:

import pandas as pd
# read output_df and original_df having gt_events

orig_df = pd.read_csv('750_annotated_articles.csv')
output_df = pd.read_csv('final_output_end-to-end_new.csv')

# remove rows from orig_df where article is not present in output_df
orig_df = orig_df[orig_df['Article'].isin(output_df['Article'])]

df = orig_df.copy()

# Preprocess data
def rename_keys(d):
    return {'incident' if k == 'Incident (case or death)' else 'incident_type' if k == 'Incident Type (new or total)' else k: v for k, v in d.items()}

def lower_case_keys(d):
    return {k.lower(): v.lower() if isinstance(v, str) else v for k, v in d.items()}




df['GT_Events'] = df['GT_Events'].apply(eval)
output_df['GT_Events'] = output_df['GT_Events'].apply
# based on gt_events, get corresponding events and noevents_events from output_df, and add them to df
# if a particular gt_events is not found in output_df, add empty lists
df['events'] = df.apply(lambda x: output_df[output_df['Article'] == x['Article']]['events'].values[0] if len(output_df[output_df['Article'] == x['Article']]['events']) else "[]", axis=1) 
df['noevents_events'] = df.apply(lambda x: output_df[output_df['Article'] == x['Article']]['noevents_events'].values[0] if len(output_df[output_df['Article'] == x['Article']]['noevents_events']) else "[]", axis=1)

df['events'] = df['events'].apply(eval)
df['noevents_events'] = df['noevents_events'].apply(eval)


# remove articles with no events
# df = df[df['GT_Events'].apply(len) == 0]
# df = df.reset_index(drop=True)

# rename keys
df['GT_Events'] = df['GT_Events'].apply(lambda x: [rename_keys(i) for i in x])

# lower case keys
df['GT_Events'] = df['GT_Events'].apply(lambda x: [lower_case_keys(i) for i in x])
df['events'] = df['events'].apply(lambda x: [lower_case_keys(i) for i in x])
df['noevents_events'] = df['noevents_events'].apply(lambda x: [lower_case_keys(i) for i in x])



# for nuberless_events, add empty incident_type, number
df['GT_Events'] = df.apply(lambda x: [{k: '' if k == 'incident_type' and 'number' in i and i['number'] == '' else v for k, v in i.items()} for i in x['GT_Events']], axis=1)
df['noevents_events'] = df.apply(lambda x: [{**i, 'number': '', 'incident_type': ''} for i in x['noevents_events']], axis=1)


print('Number of Articles:', df.shape[0])

thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
qa_only_results = {}

for threshold in [0.2]:
    df_copy = df.copy()
    df_copy['events'] = df_copy.apply(lambda x: [i for i in x['events'] if i['score']>=threshold], axis=1)    
    event_eval = EventExtractionEvaluation(df_copy.copy())
    metrics = event_eval.evaluate_event_extraction()
    qa_only_results[threshold] = metrics
    
qa_nli_results = {}    
for threshold in [0.5]:
    df['events'] = df.apply(lambda x: [i for i in x['events'] if i['score']>=0.2], axis=1)
    df['noevents_events'] = df.apply(lambda x: [i for i in x['noevents_events'] if i['score']>=0.5], axis=1)
    # Add NLI predictions to events, where events is empty
    total_predicted_events = df['events'].apply(len).sum()
    print('Total GT events:', df['GT_Events'].apply(len).sum())
    print('total predicted events:', total_predicted_events)
    df['events'] = df.apply(lambda x: x['noevents_events'] if  len(x['events'])==0 and len(x['noevents_events'])!=0 else x['events'], axis=1)
    print('total noevents_events:', df['events'].apply(len).sum()- total_predicted_events)
    
    event_eval = EventExtractionEvaluation(df.copy())
    metrics = event_eval.evaluate_event_extraction()
    qa_nli_results[threshold] = metrics
    

Number of Articles: 735
{(('disease', 'lumpy skin disease'), ('incident', 'death'), ('incident_type', 'total'), ('location', 'uttarkhand'), ('number', '321')), (('disease', 'lumpy skin disease'), ('incident', 'case'), ('incident_type', 'new'), ('location', 'uttarakhand'), ('number', '19404'))}
{(('disease', 'lumpy skin disease'), ('incident', 'case'), ('incident_type', 'total'), ('location', 'uttar pradesh'), ('number', '19404')), (('disease', 'lumpy skin disease'), ('incident', 'death'), ('incident_type', 'total'), ('location', 'uttarakhand'), ('number', '321'))}
set()
{(('disease', 'heat stroke'), ('incident', 'death'), ('incident_type', 'total'), ('location', 'bihar'), ('number', '10')), (('disease', 'heat stroke'), ('incident', 'case'), ('incident_type', 'new'), ('location', 'aurangabad'), ('number', '50'))}
{(('disease', 'covid 19'), ('incident', 'case'), ('incident_type', 'total'), ('location', 'india'), ('number', '43938764')), (('disease', 'covid 19'), ('incident', 'death'), ('

In [32]:
# count total number of events
df['GT_Events'].apply(len).sum()


849

# Results with QA Only

In [33]:
# write results to csv
import csv
fieldnames = ['threshold', 'precision', 'recall', 'f1', 'exact_match', 'jaccard', 'subset']

# Write data to CSV file
with open('results_qa.csv', mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    
    # Write rows
    for threshold, scores in qa_only_results.items():
        row = {'threshold': threshold}
        for key, value in scores.items():
            row[key] = round(value, 3) 
        writer.writerow(row)
df = pd.read_csv('results_qa.csv')
df

Unnamed: 0,threshold,precision,recall,f1,exact_match,jaccard,subset
0,0.2,0.253,0.243,0.244,0.197,0.231,0.257


# Results with QA+NLI

In [34]:
# write results to csv
import csv
fieldnames = ['threshold', 'precision', 'recall', 'f1', 'exact_match', 'jaccard', 'subset']

# Write data to CSV file
with open('results_qa_nli.csv', mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    
    # Write rows
    for threshold, scores in qa_nli_results.items():
        row = {'threshold': threshold}
        for key, value in scores.items():
            row[key] = round(value, 3) 
        writer.writerow(row)
df = pd.read_csv('results_qa_nli.csv')
df

Unnamed: 0,threshold,precision,recall,f1,exact_match,jaccard,subset
0,0.5,0.223,0.213,0.214,0.167,0.201,0.214
