In [1]:
import os
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import argparse
sns.set(style="darkgrid")
from sklearn.metrics import classification_report

In [2]:
# Notebook-wide settings; `loc` is the root directory all data paths are joined onto.
args = argparse.Namespace(loc='../data/')

In [3]:
# Shell escape: list the contents of ../data/runs (the files read into the *_PREDS dicts).
!ls ../data/runs

FalsePostive_english_task_1_run_1.tsv FalsePostive_german_task_2_run_3.tsv
FalsePostive_english_task_2_run_1.tsv FalsePostive_hindi_task_1_run_2.tsv
FalsePostive_english_task_2_run_2.tsv FalsePostive_hindi_task_2_run_1.tsv
FalsePostive_english_task_3_run_1.tsv FalsePostive_hindi_task_3_run_1.tsv
FalsePostive_german_task_1_run_2.tsv


# Global Variables

## Gold Labels

In [7]:
# Gold-label tables, one tab-separated file per language under <loc>/gold.
# Files are read in the order English, Hindi, German.
ENGLISH_GOLD, HINDI_GOLD, GERMAN_GOLD = (
    pd.read_csv(os.path.join(args.loc, 'gold', gold_file), sep='\t')
    for gold_file in ('english_data.tsv', 'hindi_data.tsv', 'german_data.tsv')
)

## Model Predictions

In [8]:
# German run files under <loc>/runs, keyed by subtask name; each value is the
# tab-separated file loaded as a DataFrame.
GERMAN_PREDS = {
    task_name: pd.read_csv(os.path.join(args.loc, 'runs', run_file), sep='\t')
    for task_name, run_file in (
        ('task_1', 'FalsePostive_german_task_1_run_2.tsv'),
        ('task_2', 'FalsePostive_german_task_2_run_3.tsv'),
    )
}

In [9]:
# English run files under <loc>/runs, keyed by subtask name; each value is the
# tab-separated file loaded as a DataFrame.
ENGLISH_PREDS = {
    task_name: pd.read_csv(os.path.join(args.loc, 'runs', run_file), sep='\t')
    for task_name, run_file in (
        ('task_1', 'FalsePostive_english_task_1_run_1.tsv'),
        ('task_2', 'FalsePostive_english_task_2_run_2.tsv'),
        ('task_3', 'FalsePostive_english_task_3_run_1.tsv'),
    )
}


In [10]:
# Hindi run files under <loc>/runs, keyed by subtask name; each value is the
# tab-separated file loaded as a DataFrame.
HINDI_PREDS = {
    task_name: pd.read_csv(os.path.join(args.loc, 'runs', run_file), sep='\t')
    for task_name, run_file in (
        ('task_1', 'FalsePostive_hindi_task_1_run_2.tsv'),
        ('task_2', 'FalsePostive_hindi_task_2_run_1.tsv'),
        ('task_3', 'FalsePostive_hindi_task_3_run_1.tsv'),
    )
}


# Helper Functions

In [11]:
def get_analysis(
    gold_df:pd.DataFrame,
    pred_df:pd.DataFrame,
    subtask:str,
):
    """Join predictions with gold labels and build a confusion table.

    Neither input frame is modified.

    Args:
        gold_df: DataFrame with a 'text_id' column and a f'task_{subtask}'
            column holding the gold label.
        pred_df: DataFrame with a 'text_id' column and the predicted label
            in a 'result' column (a frame whose label column is already
            named 'pred' is accepted as well).
        subtask: 1, 2 or 3; only interpolated into f'task_{subtask}', so an
            int or a str both work.
    Returns:
        analysis_df: inner join of pred_df and the gold label on 'text_id'.
            It holds every column of pred_df (with 'result' renamed to
            'pred') plus the gold label as 'true'. No other gold column
            (e.g. the text) is carried over.
        cross_tab: pandas crosstab of 'true' (rows) against 'pred' (columns)
            with margins, i.e. a confusion matrix with totals.
    """
    label_col = f'task_{subtask}'

    # rename() without inplace returns a new frame, so the caller's
    # predictions (e.g. the module-level *_PREDS dicts) keep their columns.
    preds = pred_df.rename(columns={'result': 'pred'})
    gold = gold_df[['text_id', label_col]]

    analysis_df = pd.merge(
        preds,
        gold,
        on='text_id',
    ).rename(columns={label_col: 'true'})

    cross_tab = pd.crosstab(
        analysis_df['true'],  # y_true
        analysis_df['pred'],  # y_pred
        rownames=['True'], colnames=['Predicted'], margins=True
    )
    return analysis_df, cross_tab

In [12]:

def print_samples(
    analysis_df:pd.DataFrame,
    mistakes:bool=True,
    num_samples:int=5
)-> None:
    """Print a random sample of (mis)classified rows and text-length statistics.

    Args:
        analysis_df: DataFrame with at least the columns 'Text', 'true'
            and 'pred'.
        mistakes: if True, sample rows where 'true' != 'pred'; otherwise
            sample rows where the two labels agree.
        num_samples: maximum number of rows to print; capped at the number
            of rows that match the filter.
    Raises:
        KeyError: if one of 'Text', 'true' or 'pred' is missing.
    """
    if mistakes:  # misclassifications
        selected = analysis_df['true'] != analysis_df['pred']
    else:  # correct classifications
        selected = analysis_df['true'] == analysis_df['pred']

    # Apply the row filter and the column projection in one step so that
    # only the three columns of interest are printed.
    df = analysis_df.loc[selected, ['Text', 'true', 'pred']]

    # None disables truncation of long cell values; the former -1 spelling
    # is deprecated since pandas 1.0 in favour of None.
    with pd.option_context('display.max_colwidth', None):
        # sample() raises when asked for more rows than exist, so cap it.
        print(df.sample(min(num_samples, len(df))))

        print('\n',df['Text'].map(len).describe())

# English Error Analysis

## Task 1

In [13]:
# Bare expression: the notebook renders the English task-1 predictions frame as the cell output.
ENGLISH_PREDS[f'task_{1}']

Unnamed: 0.1,Unnamed: 0,text_id,result
0,0,hasoc_en_207,HOF
1,1,hasoc_en_568,HOF
2,2,hasoc_en_137,HOF
3,3,hasoc_en_214,HOF
4,4,hasoc_en_869,HOF
...,...,...,...
1148,1148,hasoc_en1_7212,NOT
1149,1149,hasoc_en1_3958,NOT
1150,1150,hasoc_en1_4648,NOT
1151,1151,hasoc_en1_4832,NOT


In [14]:
# Join the English task-1 predictions with the gold labels and build the confusion table.
task = 1
eng_task_1_analysis_df, eng_task_1_crosstab = get_analysis(
    ENGLISH_GOLD,
    ENGLISH_PREDS[f'task_{task}'],
    task,
)

In [15]:
# Per-class precision / recall / F1 for English task 1, printed to four decimals.
print(
    classification_report(
        eng_task_1_analysis_df['true'],
        eng_task_1_analysis_df['pred'],
        digits=4,
    )
)

              precision    recall  f1-score   support

         HOF     0.4079    0.4688    0.4362       288
         NOT     0.8139    0.7734    0.7931       865

    accuracy                         0.6973      1153
   macro avg     0.6109    0.6211    0.6147      1153
weighted avg     0.7125    0.6973    0.7040      1153



## Task 2

In [18]:
# Join the English task-2 predictions with the gold labels and build the confusion table.
task = 2
eng_task_2_analysis_df, eng_task_2_crosstab = get_analysis(
    ENGLISH_GOLD,
    ENGLISH_PREDS[f'task_{task}'],
    task,
)

In [19]:
# Per-class precision / recall / F1 for English task 2, printed to four decimals.
print(
    classification_report(
        eng_task_2_analysis_df['true'],
        eng_task_2_analysis_df['pred'],
        digits=4,
    )
)

              precision    recall  f1-score   support

        HATE     0.1688    0.3145    0.2197       124
        NONE     0.8139    0.7734    0.7931       865
        OFFN     0.2308    0.0423    0.0714        71
        PRFN     0.2989    0.2796    0.2889        93

    accuracy                         0.6392      1153
   macro avg     0.3781    0.3524    0.3433      1153
weighted avg     0.6671    0.6392    0.6463      1153



## Task 3

In [21]:
# Join the English task-3 predictions with the gold labels and build the confusion table.
task = 3
eng_task_3_analysis_df, eng_task_3_crosstab = get_analysis(
    ENGLISH_GOLD,
    ENGLISH_PREDS[f'task_{task}'],
    task,
)

In [22]:
# Per-class precision / recall / F1 for English task 3, printed to four decimals.
print(
    classification_report(
        eng_task_3_analysis_df['true'],
        eng_task_3_analysis_df['pred'],
        digits=4,
    )
)

              precision    recall  f1-score   support

        NONE     0.8139    0.7734    0.7931       865
         TIN     0.3652    0.4204    0.3909       245
         UNT     0.0816    0.0930    0.0870        43

    accuracy                         0.6730      1153
   macro avg     0.4202    0.4289    0.4237      1153
weighted avg     0.6912    0.6730    0.6813      1153



# Hindi Error Analysis