In [119]:
import pandas as pd
import numpy as np
import contractions
import re
from collections import defaultdict
import joblib

In [120]:
# Batch exports to merge. The directory and the batch ids are the only things
# that vary, so they are kept as tunable constants and the paths are derived.
BATCH_DIR = "../batches"
BATCH_IDS = range(284001, 284005)  # 284001 .. 284004 inclusive

csv_paths = [
    "{}/Batch_{}_batch_results.csv".format(BATCH_DIR, batch_id)
    for batch_id in BATCH_IDS
]

In [121]:
def parse_df(df, mappings, max_questions=8):
    """Turn one raw batch-results frame into normalized text records.

    Rows whose ``Answer.Descriptive_description`` is missing are dropped.
    For each remaining row the description and every complete
    question/answer pair are lower-cased, passed through
    ``contractions.fix``, split into word/punctuation tokens and wrapped in
    ``<sos>`` / ``<eos>``. A trailing "Describe the scene" question, answered
    by the row's description, is appended to every kept row.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw batch results. Must provide ``Input.image_url``,
        ``Answer.Descriptive_description``,
        ``Answer.annotated_images.boundingBoxes`` and the
        ``Answer.Question k`` / ``Answer.Answer k`` columns for
        ``k = 1 .. max_questions``.
    mappings : dict
        Vocabulary holding a ``"w2i"`` (token -> index) and an ``"i2w"``
        (index -> token) dict. It is updated in place: each unseen token
        receives the next free index, ``len(mappings["w2i"])``.
    max_questions : int, optional
        Number of numbered question/answer column pairs to scan per row.
        Defaults to 8.

    Returns
    -------
    dict
        Keys ``image_name``, ``description``, ``question``, ``answer`` and
        ``bounding_box``, each a list with one entry per kept row.
        ``question`` and ``answer`` entries are themselves lists of strings
        of equal length.
    """
    parsed_data = {
        "image_name": [],
        "description": [],
        "question": [],
        "answer": [],
        "bounding_box": []
    }

    def add2mapping(tokens):
        # Register unseen tokens, keeping w2i and i2w in sync.
        for word in tokens:
            if word not in mappings["w2i"]:
                idx = len(mappings["w2i"])
                mappings["w2i"][word] = idx
                mappings["i2w"][idx] = word

    def fix_text(sentence):
        # Tokens are runs of word characters/apostrophes or one of . , ! ? ;
        tokens = re.findall(r"[\w']+|[.,!?;]", contractions.fix(sentence.lower()))
        add2mapping(tokens)
        return " ".join(["<sos>"] + tokens + ["<eos>"])

    qa_columns = [
        ("Answer.Question {}".format(qno), "Answer.Answer {}".format(qno))
        for qno in range(1, max_questions + 1)
    ]

    for _, row in df.iterrows():
        # Do not add the row if its description is empty.
        if pd.isna(row['Answer.Descriptive_description']):
            continue

        description = fix_text(row['Answer.Descriptive_description'])

        questions, answers = [], []
        for question_column, answer_column in qa_columns:
            # Keep a pair only when both halves are present.
            if pd.isna(row[question_column]) or pd.isna(row[answer_column]):
                continue
            questions.append(fix_text(row[question_column]))
            answers.append(fix_text(row[answer_column]))

        # Deliberately tokenized per row (not hoisted out of the loop): its
        # words must enter the vocabulary after the row's own text so that
        # token indices stay in first-seen order.
        questions.append(fix_text("Describe the scene"))
        answers.append(description)

        parsed_data["image_name"].append(row['Input.image_url'])
        parsed_data["description"].append(description)
        parsed_data["bounding_box"].append(row['Answer.annotated_images.boundingBoxes'])
        parsed_data["question"].append(questions)
        parsed_data["answer"].append(answers)
    return parsed_data

In [122]:
def merge_batches(paths):
    """Parse every batch CSV in ``paths`` and stack the results.

    A single vocabulary is shared across all batches, so token indices are
    consistent over the merged frame. The vocabulary always starts with the
    special tokens ``<pad>``=0, ``<unk>``=1, ``<sos>``=2, ``<eos>``=3.

    Parameters
    ----------
    paths : iterable of str
        CSV files to read with ``pd.read_csv`` and hand to ``parse_df``.
        May be empty.

    Returns
    -------
    tuple
        ``(merged_frame, mappings)``: the row-wise concatenation of all
        parsed batches with a fresh 0..n-1 index, and the vocabulary dict
        with its ``"i2w"`` / ``"w2i"`` sub-dicts. For an empty ``paths`` the
        frame has the parsed columns and zero rows.
    """
    special_tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]
    mappings = {
        "i2w": dict(enumerate(special_tokens)),
        "w2i": {token: idx for idx, token in enumerate(special_tokens)}
    }

    parsed_data_frames = [
        pd.DataFrame.from_dict(parse_df(pd.read_csv(csv_path), mappings))
        for csv_path in paths
    ]
    if not parsed_data_frames:
        # pd.concat raises ValueError on an empty list. Parsing an empty
        # frame yields the same columns with no rows, so callers still get
        # a well-formed result.
        parsed_data_frames.append(
            pd.DataFrame.from_dict(parse_df(pd.DataFrame(), mappings))
        )
    return pd.concat(parsed_data_frames, ignore_index=True), mappings

In [123]:
# Parse and merge every batch CSV; the second value is the vocabulary
# (w2i / i2w) built up while parsing.
final_df, mappings = merge_batches(csv_paths)

In [124]:
# Preview the first rows of the merged frame.
final_df.head()

Unnamed: 0,image_name,description,question,answer,bounding_box
0,map_2_inside_env_001806.png,<sos> there is a partial view of the room from...,"[<sos> what is present in the room ? <eos>, <s...","[<sos> there is nothing present in the room , ...","[{""height"":422,""label"":""Door"",""left"":251,""top""..."
1,map_2_inside_env_001063.png,<sos> room with furniture and a victim <eos>,"[<sos> is there a painting <eos>, <sos> does t...",[<sos> yes there is a painting on the wall <eo...,"[{""height"":63,""label"":""Painting"",""left"":396,""t..."
2,map_1_inside_env_004857.png,<sos> there is a corridor with a door and a fe...,"[<sos> is the door open ? <eos>, <sos> how man...","[<sos> yes , the door is open . <eos>, <sos> t...","[{""height"":315,""label"":""Door"",""left"":400,""top""..."
3,map_2_inside_env_002306.png,<sos> a door in a corridor . <eos>,"[<sos> is there fire in the corridor ? <eos>, ...","[<sos> no , there is no fire in the corridor ....","[{""height"":142,""label"":""Door"",""left"":213,""top""..."
4,map_1_inside_env_002893.png,<sos> room with a bookshelf and a yellow victi...,"[<sos> is there a door <eos>, <sos> can a leve...","[<sos> no door visible <eos>, <sos> no lever c...","[{""height"":274,""label"":""Bookshelf"",""left"":0,""t..."


In [125]:
# Export the merged frame as CSV. NOTE(review): with pandas defaults the row
# index is written as an unnamed first column, and the list-valued
# "question" / "answer" cells are stored as their string form, so they do not
# come back as lists from a plain read_csv — confirm downstream readers expect this.
final_df.to_csv("../batches/combined.csv")

In [126]:
# Persist the merged frame together with the vocabulary mappings in one joblib file.
joblib.dump({"df": final_df, "mappings": mappings}, "../batches/malmo.data")

['../batches/malmo.data']