In [1]:
!pip install git+https://github.com/csebuetnlp/normalizer rouge-score -q

In [5]:
import pandas as pd
from normalizer import normalize
from sklearn.metrics import accuracy_score, classification_report
import re
from rouge_score import rouge_scorer
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
!ls /kaggle/working/*.json

/kaggle/working/M-CLIP-XLM-Roberta-Large-Vit-B-16Plus-ViT-B-16-plus-240_score_0.0190_test_preds.json
/kaggle/working/M-CLIP-XLM-Roberta-Large-Vit-B-16Plus-ViT-B-16-plus-240_score_0.0190_val_preds.json
/kaggle/working/M-CLIP-XLM-Roberta-Large-Vit-B-16Plus-ViT-B-16-plus-240_score_0.0713_en_en_test_preds.json
/kaggle/working/M-CLIP-XLM-Roberta-Large-Vit-B-16Plus-ViT-B-16-plus-240_score_0.0713_en_en_val_preds.json
/kaggle/working/banglabert_beit_with_caption_concat_based_test_preds.json
/kaggle/working/banglabert_beit_with_caption_concat_based_val_preds.json
/kaggle/working/banglabert_vit_with_caption_concat_based_test_preds.json
/kaggle/working/banglabert_vit_with_caption_concat_based_val_preds.json
/kaggle/working/banglabert_vit_with_caption_summed_based_test_preds.json
/kaggle/working/banglabert_vit_with_caption_summed_based_val_preds.json
/kaggle/working/beit_co_attention__imt+mlm+mcl+ucl_bn_test_preds.json
/kaggle/working/beit_co_attention__imt+mlm+mcl+ucl_bn_val_preds.json
/kaggle/wo

In [8]:
df = pd.read_json('/kaggle/working/banglabert_vit_with_caption_concat_based_val_preds.json')
df.head()

Unnamed: 0,identity,label,pred
0,chitron_7881.png,পাঁচ,পাঁচ
1,chitron_5952.png,দুই,দুই
2,chitron_1272.png,তিন,পাঁচ
3,chitron_3587.png,আট,পাঁচ
4,chitron_3106.png,দুই,দুই


In [9]:
def normalise_bn(text_bn):
    return normalize(
        text_bn,
        unicode_norm="NFKC",
        punct_replacement=None,
        url_replacement=None,
        emoji_replacement=None,
        apply_unicode_norm_last=True
    )

In [10]:
train_df = pd.read_csv("/kaggle/input/vqa-bangla-meta/train.csv")
val_df = pd.read_csv("/kaggle/input/vqa-bangla-meta/valid.csv")
test_df = pd.read_csv("/kaggle/input/vqa-bangla-meta/test.csv")

all_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

all_df.head()

Unnamed: 0,image_name,Captions,Question,Answer,Category,Question_en,Answer_en,Captions_en,Answer_fixed
0,bnature_663.jpg,খালের পানিতে তিনটি গাছের প্রতিচ্ছবি সাথে গৌধোল...,ছবিতে কতগুলো গাছের প্রতিচ্ছবি দেখা যাচ্ছে?,তিনটি,numeric,How many trees are reflected in the picture?,three,Goudholi's beauty with three trees reflected i...,তিন
1,chitron_5113.png,অনেকগুলো মানুষ বসে আছে। মঞ্চের উপর কয়েকজন মানু...,ছবিতে কতজন মানুষ মঞ্চের উপর দাঁড়িয়ে আছে?,পাঁচজন,numeric,How many people are on the stage?,five,"A lot of people were sitting, a few people wer...",পাঁচ
2,bnature_876.jpg,দুজন ছেলে ও দুজন মেয়ে রাস্তা দিয়ে পাশাপাশি হ...,ছবিতে কতজন ছেলে ও মেয়ে একসাথে হাটছে?,চারজন,numeric,How many boys and girls are walking together i...,four,Two boys and two girls walking side by side on...,চার
3,bnature_1007.jpg,"রাস্তা দিয়ে কয়েকজন ছাত্র ছাত্রী যাচ্ছে, যাদে...",ছবিতে কতজন ছাত্র ছাত্রী রাস্তা দিয়ে হাঁটছে?,৪ জন,numeric,How many students are walking on the street in...,four,"Several students walking on the street, carryi...",চার
4,chitron_7446.png,'১ ইট তালগাছ ১ টি খেজুর গাছ এবং রাস্তা দিয়ে ছা...,ছবিতে কতগুলো গাছ দেখা যাচ্ছে?,২ টি,numeric,How many trees are shown in the picture?,Two,1 brick palm tree 1 date tree and 4 school stu...,দুই


In [11]:
mapping_dict = pd.Series(all_df.Answer_fixed.values, index=all_df.Answer_en).to_dict()
df['label'] = df['label'].map(mapping_dict).fillna(df['label'])
df['pred'] = df['pred'].map(mapping_dict).fillna(df['pred'])
df

Unnamed: 0,identity,label,pred
0,chitron_7881.png,পাঁচ,পাঁচ
1,chitron_5952.png,দুই,দুই
2,chitron_1272.png,তিন,পাঁচ
3,chitron_3587.png,আট,পাঁচ
4,chitron_3106.png,দুই,দুই
...,...,...,...
1524,chitron_8276.png,হাঁসি হাঁসি,হাঁড়িপাতিল
1525,bornon_3774.jpg,পরিষ্কার,ধান ক্ষেত
1526,chitron_3659.png,ট্রেনের জানালায়,সাইকেল চালাচ্ছে
1527,chitron_3739.png,কয়াশাচ্ছন্ন,মেঘলা


In [12]:
all_labels = list(set(all_df['Answer'].unique().astype(str)))
all_labels.sort()
label_map = dict()
for idx, label in enumerate(all_labels):
    label_map[normalise_bn(str(label))] = idx

In [None]:
report = classification_report(df['label'], df['pred'], digits=4, zero_division=0)
print(report)

In [14]:
bangla_numbers = re.compile(r'[০১২৩৪৫৬৭৮৯]+')

numeric_to_bangla_text = {
        '০': 'শূন্য', '১': 'এক', '২': 'দুই', '৩': 'তিন', '৪': 'চার',
        '৫': 'পাঁচ', '৬': 'ছয়', '৭': 'সাত', '৮': 'আট', '৯': 'নয়',
        '১০': 'দশ', '১১': 'এগারো', '১২': 'বারো', '১৩': 'তেরো', '১৪': 'চৌদ্দ',
        '১৫': 'পনেরো', '১৬': 'ষোল', '১৭': 'সতেরো', '১৮': 'আঠারো', '১৯': 'ঊনিশ',
        '২০': 'বিশ', '২১': 'একুশ', '২২': 'বাইশ', '২৩': 'তেইশ', '২৪': 'চব্বিশ',
        '২৫': 'পঁচিশ', '২৬': 'ছাব্বিশ', '২৭': 'সাতাশ', '২৮': 'আটাশ', '২৯': 'ঊনত্রিশ',
        '৩০': 'ত্রিশ', '৩১': 'একত্রিশ', '৩২': 'বত্রিশ', '৩৩': 'তেত্রিশ', '৩৪': 'চৌত্রিশ',
        '৩৫': 'পঁইত্রিশ', '৩৬': 'ছত্রিশ', '৩৭': 'সাঁইত্রিশ', '৩৮': 'আটত্রিশ', '৩৯': 'ঊনচল্লিশ',
        '৪০': 'চল্লিশ', '৪১': 'একচল্লিশ', '৪২': 'বিয়াল্লিশ', '৪৩': 'তেতাল্লিশ', '৪৪': 'চুয়াল্লিশ',
        '৪৫': 'পঁয়তাল্লিশ', '৪৬': 'ছেচল্লিশ', '৪৭': 'সাতচল্লিশ', '৪৮': 'আটচল্লিশ', '৪৯': 'ঊনপঞ্চাশ',
        '৫০': 'পঞ্চাশ', '৫১': 'একান্ন', '৫২': 'বাহান্ন', '৫৩': 'তিপ্পান্ন', '৫৪': 'চুয়ান্ন',
        '৫৫': 'পঁচান্ন', '৫৬': 'ছাপ্পান্ন', '৫৭': 'সাতান্ন', '৫৮': 'আটান্ন', '৫৯': 'ঊনষাট',
        '৬০': 'ষাট', '৬১': 'একষট্টি', '৬২': 'বাষট্টি', '৬৩': 'তেষট্টি', '৬৪': 'চৌষট্টি',
        '৬৫': 'পঁয়ষট্টি', '৬৬': 'ছেষট্টি', '৬৭': 'সাতষট্টি', '৬৮': 'আটষট্টি', '৬৯': 'ঊনসত্তর',
        '৭০': 'সত্তর', '৭১': 'একাত্তর', '৭২': 'বাহাত্তর', '৭৩': 'তিয়াত্তর', '৭৪': 'চুয়াত্তর',
        '৭৫': 'পঁচাত্তর', '৭৬': 'ছিয়াত্তর', '৭৭': 'সাতাত্তর', '৭৮': 'আটাত্তর', '৭৯': 'ঊনআশি',
        '৮০': 'আশি', '৮১': 'একাশি', '৮২': 'বিরাশি', '৮৩': 'তিরাশি', '৮৪': 'চুরাশি',
        '৮৫': 'পঁচাশি', '৮৬': 'ছিয়াশি', '৮৭': 'সাতাশি', '৮৮': 'আটাশি', '৮৯': 'ঊননব্বই',
        '৯০': 'নব্বই', '৯১': 'একানব্বই', '৯২': 'বিরানব্বই', '৯৩': 'তিরানব্বই', '৯৪': 'চুরানব্বই',
        '৯৫': 'পঁচানব্বই', '৯৬': 'ছিয়ানব্বই', '৯৭': 'সাতানব্বই', '৯৮': 'আটানব্বই', '৯৯': 'নিরানব্বই',
        '১০০': 'একশ'
    }

misc_map = {
    'দুটি': 'দুই'
}

bangla_text_numbers = '|'.join(map(re.escape, sorted(numeric_to_bangla_text.values(), key=len, reverse=True)))
bangla_text_numbers = re.compile(f'({bangla_text_numbers})\s*(জন|টি|টা)')

def remove_anything_but_number(text):
    m = bangla_numbers.search(text)
    if m:
        return m.group(0)
    return text

def replace_numeric_with_bangla_text(text):    
    sorted_keys = sorted(numeric_to_bangla_text.keys(), key=len, reverse=True)
    pattern = '|'.join(map(re.escape, sorted_keys))
    
    def replacement(match):
        return numeric_to_bangla_text[match.group(0)]

    return re.sub(pattern, replacement, text)

def remove_extras(text):
    m = bangla_text_numbers.search(text)
    if m:
        return m.group(1)
    return text

def misc(text):
    sorted_keys = sorted(misc_map.keys(), key=len, reverse=True)
    pattern = '|'.join(map(re.escape, sorted_keys))
    
    def replacement(match):
        return misc_map[match.group(0)]

    return re.sub(pattern, replacement, text)
    

In [15]:
def clean_labels(text):
    x = remove_anything_but_number(text)
    x = replace_numeric_with_bangla_text(x)
    x = remove_extras(x)
    x = misc(x)
    return x.strip()

In [16]:
df['label_fixed'] = df['label'].apply(clean_labels)
df['pred_fixed'] = df['pred'].apply(clean_labels)
df

Unnamed: 0,identity,label,pred,label_fixed,pred_fixed
0,chitron_7881.png,পাঁচ,পাঁচ,পাঁচ,পাঁচ
1,chitron_5952.png,দুই,দুই,দুই,দুই
2,chitron_1272.png,তিন,পাঁচ,তিন,পাঁচ
3,chitron_3587.png,আট,পাঁচ,আট,পাঁচ
4,chitron_3106.png,দুই,দুই,দুই,দুই
...,...,...,...,...,...
1524,chitron_8276.png,হাঁসি হাঁসি,হাঁড়িপাতিল,হাঁসি হাঁসি,হাঁড়িপাতিল
1525,bornon_3774.jpg,পরিষ্কার,ধান ক্ষেত,পরিষ্কার,ধান ক্ষেত
1526,chitron_3659.png,ট্রেনের জানালায়,সাইকেল চালাচ্ছে,ট্রেনের জানালায়,সাইকেল চালাচ্ছে
1527,chitron_3739.png,কয়াশাচ্ছন্ন,মেঘলা,কয়াশাচ্ছন্ন,মেঘলা


In [17]:
report_fixed = classification_report(df['label_fixed'], df['pred_fixed'], digits=4, zero_division=0)
print(report_fixed)

                                   precision    recall  f1-score   support

                            DREAM     0.0000    0.0000    0.0000         1
                        LOVE LIFE     0.0000    0.0000    0.0000         1
         Our Identity Our Culture     0.0000    0.0000    0.0000         1
                            R DAY     0.0000    0.0000    0.0000         1
SOS Children's Village Chittagong     0.0000    0.0000    0.0000         1
              Thirty 3 Restaurant     0.0000    0.0000    0.0000         1
                          Thirty3     0.0000    0.0000    0.0000         1
              অগ্নিনির্বাপক কর্মী     0.0000    0.0000    0.0000         1
                         অজগর সাপ     0.0000    0.0000    0.0000         1
                         অটো বাইক     0.0000    0.0000    0.0000         1
                         অটোরিকশা     0.0000    0.0000    0.0000         1
                             অনেক     0.0000    0.0000    0.0000         1
                        

In [18]:
scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=False)

def calculate_rouge1(reference, candidate):
    score = scorer.score(reference, candidate)
    return score['rouge1'].fmeasure

df['rouge1_score'] = df.apply(lambda row: calculate_rouge1(row['label_fixed'], row['pred_fixed']), axis=1)
df['rouge1_score']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1524    0.0
1525    0.0
1526    0.0
1527    0.0
1528    0.0
Name: rouge1_score, Length: 1529, dtype: float64

In [23]:
bert_model = AutoModel.from_pretrained("csebuetnlp/banglabert").to(device)
bert_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert")

cos = nn.CosineSimilarity(dim=1, eps=1e-6)

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [24]:
def bert_scorer(df):
    sim_list = []
    
    for index, row in df.iterrows():
        original = bert_tokenizer.encode_plus(normalise_bn(str(row['label_fixed'])), return_tensors="pt").to(device)
        preds = bert_tokenizer.encode_plus(normalise_bn(str(row['pred_fixed'])), return_tensors="pt").to(device)
        
        with torch.no_grad():
            d1 = bert_model(original['input_ids'],attention_mask=original['attention_mask'])['last_hidden_state'][:, 0, :]
            d2 = bert_model(preds['input_ids'],attention_mask=preds['attention_mask'])['last_hidden_state'][:, 0, :]
        
        sim_list.append(cos(d1, d2).item())
        
    return sim_list

In [25]:
similarity_list = bert_scorer(df)
df['bert_sim_score'] = similarity_list
df.head()

Unnamed: 0,identity,label,pred,label_fixed,pred_fixed,rouge1_score,bert_sim_score
0,chitron_7881.png,পাঁচ,পাঁচ,পাঁচ,পাঁচ,0.0,1.0
1,chitron_5952.png,দুই,দুই,দুই,দুই,0.0,1.0
2,chitron_1272.png,তিন,পাঁচ,তিন,পাঁচ,0.0,0.983745
3,chitron_3587.png,আট,পাঁচ,আট,পাঁচ,0.0,0.989326
4,chitron_3106.png,দুই,দুই,দুই,দুই,0.0,1.0


In [26]:
df['bert_sim_score'].mean()

0.9060767903348993

In [2]:
!curl https://getcroc.schollz.com | bash

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 25460  100 25460    0     0  36187      0 --:--:-- --:--:-- --:--:-- 36164
              ____
             / ___|_ __ ___   ___
            | |   | '__/ _ \ / __|
            | |___| | | (_) | (__
             \____|_|  \___/ \___|

       ___           _        _ _
      |_ _|_ __  ___| |_ __ _| | | ___ _ __
       | || '_ \/ __| __/ _` | | |/ _ \ '__|
       | || | | \__ \ || (_| | | |  __/ |
      |___|_| |_|___/\__\__,_|_|_|\___|_| 
[0m== Install prefix set to /usr/local/bin[0m
[0m== Created temp dir at /tmp/croc.NM9JMt[0m
[0m== Architecture detected as x86_64[0m
[0m== OS detected as Linux[0m
https://github.com/schollz/croc/releases/download/v10.0.8/croc_v10.0.8_Linux-64bit.tar.gz /tmp/croc.NM9JMt croc_v10.0.8_Linux-64bit.tar.gz
[0m== Downloaded croc archive into /tmp/croc.NM9JMt[0m
[0m== Downloaded croc checksu

In [3]:
!croc <<< "1477-gondola-tokyo-cafe"

Accept 'reresults.zip' (160.5 kB)? (Y/n) 
Receiving (<-103.253.246.146:62008)
 reresults.zip 100% |████████████████████| (164/164 kB, 251 kB/s)             


In [4]:
!unzip reresults.zip

Archive:  reresults.zip
  inflating: banglabert_vit_with_caption_summed_based_test_preds.json  
  inflating: banglabert_vit_with_caption_concat_based_test_preds.json  
  inflating: banglabert_beit_with_caption_concat_based_test_preds.json  
  inflating: banglabert_vit_with_caption_concat_based_val_preds.json  
  inflating: banglabert_beit_with_caption_concat_based_val_preds.json  
  inflating: banglabert_vit_with_caption_summed_based_val_preds.json  
  inflating: merged_attention_mlm_val_preds.json  
  inflating: merged_attention_mlm_test_preds.json  
