In [None]:
import json
import os

import shutil
import base64
import requests
import pandas as pd 

from dotenv import load_dotenv
import google.generativeai as genai

In [None]:
# List every entry of the image folder; a slice of this list is later copied into dataset/validation.
# NOTE(review): this path is lower-case ("dataset/bornon") while source_folder is "dataset/Bornon".
# On a case-sensitive filesystem only one of the two can exist — confirm the real folder name.
list_validation_images = os.listdir("dataset/bornon") #chitron/bnature
len(list_validation_images)

In [None]:
# Take 100 file names as the validation sample.
# os.listdir() returns names in arbitrary order, so the list is sorted first to make
# the selection reproducible across runs and machines.
files_to_copy = sorted(list_validation_images)[:100]
len(files_to_copy)

In [None]:
# Folders used by the copy loop: images are read from source_folder and written to destination_folder.
source_folder = "dataset/Bornon"
destination_folder = "dataset/validation"

In [None]:
# Copy the sampled images from source_folder into destination_folder.
# The destination is created up front because shutil.copy2 raises FileNotFoundError
# when the target directory does not exist.
os.makedirs(destination_folder, exist_ok=True)

for filename in files_to_copy:
    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(destination_folder, filename)

    if os.path.isfile(source_path):
        # copy2 copies the file content and also tries to preserve its metadata.
        shutil.copy2(source_path, destination_path)
    else:
        print(f"File {filename} not found in source folder.")

In [None]:
# Load the train/test token files, splitting each line into image_name and caption on
# the literal "#0" followed by whitespace.
# - The separator is a raw string: "\s" is not a valid escape in a normal string literal.
# - A regex separator is only supported by the python parsing engine, so it is requested
#   explicitly instead of relying on pandas' fallback (which emits a ParserWarning).
df_train = pd.read_csv(
    "dataset/bornon_train_token.txt",
    delimiter=r"#0\s+",
    names=["image_name", "caption"],
    header=None,
    engine="python",
)
df_test = pd.read_csv(
    "dataset/bornon_test_token.txt",
    delimiter=r"#0\s+",
    names=["image_name", "caption"],
    header=None,
    engine="python",
)
# df.head()

In [None]:
def consolidate(gdf):
    """Return the values of the ``caption`` column of ``gdf`` as a plain list.

    Used as the per-group function of a ``groupby(...).apply`` so that every
    image ends up with one list holding all of its captions.
    """
    return gdf['caption'].tolist()

In [None]:
# Collapse the one-row-per-caption tables into one row per image: each group's captions
# are gathered into a list by consolidate(), and the unnamed result column (0) is
# renamed to 'captions'.
df1 = (
    df_train.groupby("image_name")
    .apply(consolidate, include_groups=False)
    .reset_index()
    .rename(columns={0: 'captions'})
)
df2 = (
    df_test.groupby("image_name")
    .apply(consolidate, include_groups=False)
    .reset_index()
    .rename(columns={0: 'captions'})
)

In [None]:
# Sort rows by the numeric part of the file name (so "2.jpg" comes before "10.jpg").
# str.removesuffix drops exactly the trailing ".jpg"; str.rstrip('.jpg') would instead
# strip any trailing run of the characters ".", "j", "p" and "g".
# Names whose stem is not numeric become NaN (errors='coerce') and are placed last.
df1.sort_values(
    by='image_name',
    key=lambda x: pd.to_numeric(x.str.removesuffix('.jpg'), errors='coerce'),
    inplace=True,
)
df2.sort_values(
    by='image_name',
    key=lambda x: pd.to_numeric(x.str.removesuffix('.jpg'), errors='coerce'),
    inplace=True,
)

In [None]:
# Preview the first rows of the grouped test captions.
df2.head()

In [None]:
# Stack the train and test caption tables row-wise. ignore_index is not set, so the
# original row labels of df1 and df2 are kept and may repeat.
df = pd.concat([df1, df2], axis=0)
df.tail()

In [None]:
# Reload the working table from disk. This rebinds `df`, so the frame concatenated from
# the token files is no longer reachable under that name.
# df = pd.read_csv("top_captioned.csv")
df = pd.read_csv("dataset/generated_bornon.csv")
# df.reset_index(drop=True, inplace=True)

df.head()

In [None]:
# mask = df['image_name'].isin(files_to_copy)
# validation = df[mask]
# Count missing values per column.
df.isna().sum()

In [None]:
# Load variables from a .env file into the environment, then read the two API keys.
# os.getenv returns None when a variable is unset; no check is made here, so a missing
# key only surfaces later when the corresponding API is called.
load_dotenv()

OPENAI_API = os.getenv("NEW_OPENAI_API")
GEMINI_API = os.getenv("GOOGLE_API")

### GPT-4 vision QnA and Captioning test for these images

In [None]:
def encode_image(image_path):
    """Read the file at ``image_path`` and return its content as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [None]:
def prompt(captions):
    """Build the instruction text for Bengali visual question-answer generation.

    Args:
        captions: Caption text (any object accepted by ``str.format``) that is
            inserted after the ``<CAPTION>`` marker.

    Returns:
        The complete prompt string, ending with the requested output format.
    """
    # Every literal backslash in the template is written as "\\". The format line
    # previously used "\#", which is not a valid escape sequence and triggers a
    # warning on current Python versions; "\\#" produces the same text.
    return """
    You are an expert in generating Bengali visual question answers. For a given image and the captions to the image, your task is to generate the question and the answer. You should always abide by the guidelines that are mentioned below:

    GUIDELINE 1:  The questions should be always image-aligned, caption-aligned, and informative \\
    GUIDELINE 2:  Try to generate the answer in one or two words. The answer must never contain more than three words \\
    GUIDELINE 3:  Generate the question-answer pair in the Bengali language \\
    Here is the caption: \\
    <CAPTION> {} \\
    Based on the captions above and the image, generate one question-answer pair 
    in Bengali. Generate the question-answer pair in the following format:\\

    Q\\# <GENERATED QUESTION>, A\\# <GENERATED ANSWER>
    """.format(captions)

In [None]:
def give_q_n_a(image_name, captions, timeout=120):
    """Request one Bengali question/answer pair for an image from the OpenAI chat API.

    Args:
        image_name: File name of the image inside ``dataset/bornon``.
        captions: Caption text forwarded to ``prompt`` to build the instruction.
        timeout: Seconds to wait for the HTTP request before giving up. Optional;
            existing two-argument calls keep working.

    Returns:
        The decoded JSON body of the response. Callers in this notebook read its
        ``usage`` and ``choices`` entries.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Path to the image on disk.
    image_path = f"dataset/bornon/{image_name}"

    # The image is sent inline as a base64 data URL.
    base64_image = encode_image(image_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API}"
    }
    payload = {
        "model": "gpt-4-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt(captions)},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }
        ],
        "max_tokens": 100
    }

    # requests waits indefinitely unless a timeout is given, which would hang the
    # generation loop on a stalled connection.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Fail here with the HTTP status instead of returning an error body that the
    # caller would trip over when it looks up 'usage' / 'choices'.
    response.raise_for_status()

    return response.json()


In [None]:
# df.loc[:,'Generated'] = None
# df = final_df
# df[400:415]

In [None]:
# validation.loc[:,'Generated'] = None
# df.loc[:,'Generated'] = None
# df = pd.read_csv("bornon-dataset.csv")

In [None]:
import time

# Work on the first row only; widen the slice to process more rows.
sliced_df = df[0:1]
# sliced_df.head(10)
# validation.loc[validation['image_name'] == rows['image_name'], 'Generated'] = res


In [None]:
# Running token counters for the generation loop, seeded with hard-coded totals.
# NOTE(review): the trailing comments hold other values, presumably from earlier runs — confirm.
total_tokens = 2397670 #1393072 # 325355
total_input_tokens = 2176137 #405867 # 293207
total_output_tokens = 482592 # 1537880 # 293207
#  36.23913 # 3.8965100000000006 $
# Counter of rows processed by the generation loop.
val = 0

In [None]:
# final_df = pd.read_csv("bornon-dataset.csv")


# Count missing values per column in the slice about to be processed.
sliced_df.isna().sum()

In [None]:

# For each row of the slice: request a Q/A pair, add the reported token usage to the
# running totals, and store the generated text in df['Generated'] for that image.
for idx, rows in sliced_df.iterrows():
    res = give_q_n_a(rows['image_name'], rows['top_captions'])
    # print(prompt(rows['top_captions']))
    total_input_tokens += res['usage']['prompt_tokens']
    total_output_tokens += res['usage']['completion_tokens']
    total_tokens += res['usage']['total_tokens']
    df.loc[df['image_name'] == str(rows['image_name']), 'Generated'] = res['choices'][0]['message']['content']
    print("done: ", val, "input_token: ", res['usage']['prompt_tokens'], "output_token: ", res['usage']['completion_tokens'])
    val += 1
    # Pause between requests.
    time.sleep(1)
    # Unconditional break: only the first row is ever processed.
    break

In [None]:
# Inspect the raw response of the last request.
res

In [None]:
# Preview the processed slice. The loop writes 'Generated' into df, not into sliced_df.
sliced_df.head(15)

In [None]:
# Report the running token totals and a cost estimate.
# The estimate multiplies input tokens by 1e-5 and output tokens by 3e-5.
# NOTE(review): presumably USD-per-token rates for the model in use — confirm against current pricing.
print("total input tokens: ", total_input_tokens)
print("total output tokens: ", total_output_tokens)
print("total token count: " , total_tokens)
print(f"total cost: {total_input_tokens * 1e-5 + total_output_tokens * 3e-5}")

total input tokens:  1393072
total output tokens:  405867
total token count:  1537880
total cost: 26.10673

In [None]:
# validation['Generated'] = validation['image_name'].apply(res)
# res
# Count missing values per column after generation.
df.isna().sum()

In [None]:
# sliced_df.tail()
# merged_df = pd.merge(df, sliced_df, on=["image_name", "captions", "top_captions"], how="left")

In [None]:
# merged_df.isna().sum()

In [None]:
# merged_df.head()

In [None]:
# merged_df.rename(columns={"Generated_y": "Generated"}, inplace= True)
# merged_df.drop('Generated_x',axis=1, inplace=True)

In [None]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(inplace=True)

In [None]:
# Show rows whose 'Generated' is still null (expected to be empty once the dropna cell has run).
df[df['Generated'].isnull()]

In [None]:
# df.to_csv("dummy.csv", index=False)

In [None]:
# Write the table back to the same CSV it was loaded from, overwriting it.
df.to_csv("dataset/generated_bornon.csv", index = False)

In [None]:
def give_q_n_a(image_name, captions):
    """Disabled stub: does nothing and returns None.

    NOTE(review): this is a second definition of ``give_q_n_a``. Once this cell
    runs it replaces the earlier, working implementation, so any later call
    (including re-running the generation loop) receives None. The commented-out
    body below is an older variant of the request with a different prompt and
    ``max_tokens`` of 300. Consider deleting this cell.
    """
    pass
    # Path to your image
    # image_path = f"dataset/Bornon/{image_name}"

    # # Encode the image
    # base64_image = encode_image(image_path)

    # headers = {
    #     "Content-Type": "application/json",
    #     "Authorization": f"Bearer {OPENAI_API}"
    # }
    # payload = {
    #     # "model": "gpt-4-vision-preview", "gpt-4-turbo"
    #     "model": "gpt-4-turbo",
    #     # "response_format": {"type": "json_object"},
    #     "messages": [
    #         {
    #             "role": "user",
    #             "content": [
    #                 # {"type": "text", "text": "Caption the image explaining the contents in it. Use Bengali language to caption the image."},
    #                 {"type": "text", "text": f"Generate a Question and answer pair in Bengali language based on the Captions: {captions} and the image given.\
    #                     give me the question and answer in the following format: \
    #                         Q#   'QUESTION_GENERATED', \
    #                         A#   'ANSWER_GENERATED',\
    #                     Please keep in mind that always generate the question keeping the context of the captions and the image. Also keep in mind that \
    #                         generate everything in Bengali. Generate only one question and answer pair. \
    #                     "},

    #                 {"type": "image_url", "image_url": {"url": 
    # f"data:image/jpeg;base64,{base64_image}"}}
    #             ]
    #         }
    #     ],
    #     "max_tokens": 300
    # }   


    # response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    
    # return response.json()#['choices'][0]['message']['content']


In [None]:
import textwrap
from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
    """Wrap ``text`` as a Markdown block quote for notebook display.

    Bullet characters are turned into Markdown list markers and every line,
    including blank ones, is prefixed with '> '.
    """
    bulleted = text.replace('•', '  *')
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _: True)
    return Markdown(quoted)

In [None]:

import PIL.Image

# Open one sample image and display it as the cell output.
img = PIL.Image.open('dataset/Bornon/1.jpg')
img

In [None]:
# Configure the Gemini client with the key read from the environment.
genai.configure(api_key=GEMINI_API)

In [None]:
# Model handle used by gemini_give_q_n_a.
model = genai.GenerativeModel('gemini-1.0-pro-vision-latest')

In [None]:
def gemini_give_q_n_a(image, captions):
    """Ask the Gemini model for a Bengali question/answer pair about an image.

    Args:
        image: Image object passed through to ``model.generate_content``.
        captions: Caption text interpolated into the instruction.

    Returns:
        The ``text`` attribute of the response after it has been resolved.
    """
    # The continuation lines below are part of the string literal, so their leading
    # whitespace is sent to the model as written.
    instruction = f"Generate a Question and answer pair in Bengali language based on the Captions: {captions} and the image given.\
                            give me the question and answer in the following format: \
                                Q: 'QUESTION_GENERATED',\
                                A: 'ANSWER_GENERATED',\
                            Please keep in mind that always generate the question keeping the context of the captions and the image. Also keep in mind that \
                                generate everything in Bengali \
                            "
    streamed = model.generate_content([instruction, image], stream=True)
    # The response is requested as a stream; resolve() is called before .text is read.
    streamed.resolve()

    return streamed.text

In [None]:
# to_markdown(response.text)
import time

In [None]:
# NOTE(review): in the visible notebook `response` is only ever a local variable inside
# functions, never a notebook-level name, so this cell raises NameError on a fresh
# kernel. It looks like leftover debugging.
type(response.text)

In [None]:
# Select rows 95-99 of the validation table for the Gemini run.
# NOTE(review): `validation` is only assigned in commented-out code in the visible notebook
# (validation = df[df['image_name'].isin(files_to_copy)]); define it before running this cell.
sliced_df = validation[95:100]

In [None]:
# For each selected row: open the copied validation image, ask Gemini for a Q/A pair,
# and store the answer text in validation['Gemini_generated'] for that image.
# NOTE(review): depends on a notebook-level `validation` frame with a 'captions' column.
for idx, rows in sliced_df.iterrows():
    image_path = f"dataset/validation/{rows['image_name']}"
    img = PIL.Image.open(image_path)
    print((rows['image_name']), rows['captions'])
    res = gemini_give_q_n_a(img , rows['captions'])
    validation.loc[validation['image_name'] == str(rows['image_name']), 'Gemini_generated'] = res
    # print(res)
    # Pause between requests.
    time.sleep(3)
    # break

In [None]:
# Preview the validation table after the Gemini run.
validation.head(10)

In [None]:
# Count missing values per column in the validation table.
validation.isna().sum()

In [None]:
# Folder scanned by the image-size check.
datapath = "dataset/Bornon"

In [None]:
# Names of all entries in the image folder.
images = os.listdir(datapath)

In [None]:
# Print the height and width of every image that is not 400x400.
for i in images:
    # The context manager closes the underlying file after each check, so scanning the
    # whole folder does not accumulate open file handles.
    with PIL.Image.open(os.path.join(datapath, i)) as img:
        # img.size is the public (width, height) tuple; reading img.__dict__['_size']
        # relied on a private attribute.
        if img.size != (400, 400):
            print(img.height, img.width)

### Bornon Dataset
Each image in the dataset is 400×400 pixels.

In [None]:
# Reload the generated Q/A table from disk for post-processing.
df = pd.read_csv("dataset/generated_bornon.csv")

In [None]:
# `dummy` is a second name for the same DataFrame object as `df` (no copy is made),
# so every in-place change to `dummy` also changes `df`.
dummy = df#[df['image_name'] == '237.jpg']
# strs = dummy.iloc[0]['Generated']
# strs = strs + "Q# ছবিতে সময়টা কি কোনো বিশেষ পর্যায়ে আছে?\nA# সূর্যাস্তের"

# strs.split('#')

In [None]:
def split_qa_string(qa_string):
    """Split a generated ``"Q# <question>, A# <answer>"`` string into its two parts.

    Args:
        qa_string: Generated text. Anything that is not a string (for example
            None or NaN) is treated as missing.

    Returns:
        A two-element list ``[question, answer]``, always of length 2 so it can be
        expanded into two DataFrame columns:

        * well-formed input: the cleaned question and answer;
        * several pairs in one string: only the first pair is returned;
        * no ``A#`` marker: ``[question_text, None]``;
        * non-string input: ``[None, None]``.
    """
    if not isinstance(qa_string, str):
        return [None, None]

    # partition never raises: when 'A#' is absent, marker and remainder are empty.
    question_part, marker, remainder = qa_string.partition('A#')
    question = question_part.replace('Q# ', '').strip().strip('",')

    if not marker:
        return [question, None]

    # If the model produced more than one pair, the text after the first answer
    # starts with the next 'Q#'; cut there so only the first answer is kept.
    answer = remainder.split('Q#', 1)[0].strip().strip('"')

    return [question, answer]


In [None]:
# Parse every generated string into a [question, answer] list.
dummy['Generated_list'] = dummy['Generated'].apply(split_qa_string)


In [None]:
# Display the table with the new 'Generated_list' column.
dummy

In [None]:
# Expand the [question, answer] lists into two columns, aligned on the original row index.
dummy[['Question', 'Answer']] = pd.DataFrame(dummy['Generated_list'].tolist(), index=df.index)

In [None]:
# Remove the intermediate list column now that it has been expanded.
dummy.drop(columns=['Generated_list'], inplace=True)

In [None]:
# Preview the table with the 'Question' and 'Answer' columns.
dummy.head()

In [None]:
# Keep the 'top_captions' value of the first row for inspection.
s = dummy.iloc[0]['top_captions']

In [None]:
# Remove the raw 'captions' column before saving.
# errors='ignore' keeps this cell re-runnable: the CSV is later overwritten without
# 'captions', so on the next run the column is already gone and a plain drop would
# raise KeyError.
dummy.drop(columns='captions', inplace=True, errors='ignore')

In [None]:
def splitting(strs:str):
    """Print the comma-separated pieces of ``strs`` as a list; returns None."""
    print(strs.split(','))

In [None]:
# Overwrite the CSV that was loaded above with the post-processed table
# ('Question' and 'Answer' added, 'captions' removed).
dummy.to_csv("dataset/generated_bornon.csv", index = False)

## Subset uploadable to Kaggle


In [None]:
# List the image files present on disk and show the first name.
images = os.listdir("dataset/bornon")
(images)[0]

In [None]:
# Load the post-processed Q/A table.
udf = pd.read_csv("dataset/generated_bornon.csv")

udf.head()

In [None]:
# Keep only rows whose image file exists in the listed folder. This rebinds `df`.
df = udf[udf['image_name'].isin(images)]

In [None]:
# Preview the filtered subset.
df.head()

In [None]:
# Save the filtered subset without the index column.
df.to_csv("dataset/bornon_subset.csv", index=False)

In [None]:
# Distinct values of the 'Answer' column, collected into a set.
uniques = set(df['Answer'].unique())

In [None]:
# Write one label per line.
# - encoding='utf-8': the answers are requested in Bengali, and the platform default
#   encoding (e.g. cp1252 on Windows) cannot represent that text.
# - sorted(...): iteration order of a set of strings can differ between interpreter
#   runs, which would reshuffle the file each time it is regenerated. Labels are
#   converted with str() first, matching what the f-string wrote before.
with open('dataset/unique_labels.txt', 'w', encoding='utf-8') as file:
    for label in sorted(str(label) for label in uniques):
        file.write(f"{label}\n")