In [None]:
import csv
import pandas as pd
from pprint import pprint 
from pathlib import Path

In [None]:
# Clean one annotation export, pair up repeated annotations of the same
# (q_id, intermediate_question), and write three CSVs next to the notebook:
#   <stem>_clean.csv  every row after column cleanup
#   <stem>_same.csv   one row per pair whose two labels agree
#   <stem>_diff.csv   both rows of every pair whose labels disagree
samples = []

# Set to True when `target_file` already has the cleaned column layout
# (a `labels` or `effectiveness` column instead of "Input."/"Answer" columns).
is_clean = False
target_file = "batch_results_4000.csv"

clean_csv_file = []
same_tag_list = []
diff_tag_list = []


def _sort_by_q_id(frame):
    """Return `frame` ordered by `q_id`, or unchanged when it has no such column."""
    if "q_id" not in frame.columns:
        # An empty result list yields a column-less frame; nothing to sort.
        return frame
    # A stable sort keeps the two rows of a disagreeing pair next to each other.
    return frame.sort_values("q_id", kind="stable")


# The file only needs to stay open while pandas parses it.
with open(target_file, 'r') as f:
    df = pd.read_csv(f)

if is_clean:
    clean_df = df
    if 'labels' not in clean_df.keys():
        clean_df = clean_df.rename(columns={"effectiveness" : "labels"})
else:
    clean_df = pd.DataFrame()
    for key in df.keys():
        if "Input." in key:
            # Strip the "Input." prefix from the exported column name.
            new_key = key.split('Input.')[-1]
            clean_df[new_key] = df[key]

        if "Answer" in key:
            clean_df["labels"] = df[key]

for idx in range(len(clean_df)):
    line = clean_df.iloc[idx]

    line_dict = line.to_dict()
    # Normalise the raw answer (or an already normalised 'O') to 'O'; anything else is 'X'.
    line_dict['labels'] = 'O' if line_dict['labels'] in ('[{"effectiveness":"yes"}]', 'O') else 'X'

    found = False
    for sample in samples:
        if sample['q_id'] == line_dict['q_id'] and sample['intermediate_question'] == line_dict['intermediate_question']:
            found = True
            if sample['labels'] == line_dict['labels']:
                same_tag_list.append(line_dict)
            else:
                diff_tag_list.append(sample)
                diff_tag_list.append(line_dict)
    if not found:
        samples.append(line_dict)

print(len(clean_df))
print(len(same_tag_list))
print(len(diff_tag_list) / 2)

# Holds only when every (q_id, intermediate_question) was annotated exactly twice.
assert len(same_tag_list) * 2 + len(diff_tag_list) == len(clean_df), \
    "expected every (q_id, intermediate_question) to appear exactly twice"

same_df = pd.DataFrame(same_tag_list)
diff_df = pd.DataFrame(diff_tag_list)

file_name = Path(target_file).stem

# `sort_values` returns a new frame, so the result must be assigned for the
# written files to actually be ordered by q_id.
clean_df = _sort_by_q_id(clean_df)
same_df = _sort_by_q_id(same_df)
diff_df = _sort_by_q_id(diff_df)

clean_df.to_csv(file_name+"_clean.csv",index=False)
same_df.to_csv(file_name+"_same.csv",index=False)
diff_df.to_csv(file_name+"_diff.csv",index=False)
                              

In [None]:
from collections import Counter
 
# Tally how many rows of `same_df` carry each label value.
label_counts = Counter(same_df['labels'])
print(label_counts)

In [None]:
from datasets import load_dataset
from glob import glob
import pandas as pd
from collections import Counter

# Build one held-out test file plus training files of doubling size from the
# "*_same.csv" exports.
# glob() returns files in an unspecified order; sort so every run loads the
# rows in the same order.
csv_files = sorted(glob("*_results_*_same.csv"))

split_unit = 200
test_nums = 100

dataset = load_dataset("csv", data_files={'train' : csv_files})
# shuffle() returns a new object instead of reordering in place, so the result
# has to be kept for the seed to have any effect.
dataset = dataset.shuffle(seed=42)
# Seed the split too; unseeded, every run would draw a different test set.
dataset = dataset['train'].train_test_split(test_size=test_nums, seed=42)

test_dict = dataset['test']
test_dataset = pd.DataFrame(test_dict)
test_dataset.to_csv(f"test_{test_nums}_same.csv", index=False)
print(Counter(dataset['train']['labels']))
print(Counter(test_dict['labels']))

# Descending order on the label string puts "X" rows ahead of "O" rows, so each
# prefix taken below starts with the "X" rows. The order is the same for every
# subset size, so sort once instead of once per iteration.
sorted_train = dataset['train'].sort('labels', reverse=True)

while True:
    train_dict = sorted_train[:split_unit]
    train_dataset = pd.DataFrame(train_dict)
    train_dataset.to_csv(f"train_{split_unit}_same_bal.csv", index=False)
    print(Counter(train_dict["labels"]))

    split_unit = split_unit * 2
    # NOTE(review): dataset['train'] already excludes the test rows, so
    # subtracting test_nums again may stop one doubling early — confirm intent.
    if split_unit > len(dataset['train']) - test_nums:
        break


In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict
from glob import glob
import pandas as pd
from collections import Counter

# Build a test file plus training files of doubling size from the "*_same.csv"
# rows combined with the "X"-labelled rows of the "*_diff.csv" exports.
# glob() returns files in an unspecified order; sort for a reproducible load order.
same_csv_files = sorted(glob("*_results_*_same.csv"))
diff_csv_files = sorted(glob("*_results_*_diff.csv"))

split_unit = 200
test_nums = 100

same_dataset = load_dataset("csv", data_files={'train' : same_csv_files})
diff_dataset = load_dataset("csv", data_files={'train' : diff_csv_files})

# From the disagreement files keep only the rows labelled "X".
diff_dataset = diff_dataset.filter(lambda example: example['labels'] == "X")

print(diff_dataset['train'])

dataset = DatasetDict()
dataset['train'] = concatenate_datasets([same_dataset['train'], diff_dataset['train']])
print(dataset)
# shuffle() returns a new object instead of reordering in place, so the result
# has to be kept for the seed to have any effect.
dataset = dataset.shuffle(seed=42)

# Seed the split too; unseeded, every run would draw a different test set.
dataset = dataset['train'].train_test_split(test_size=test_nums, seed=42)

test_dict = dataset['test']
test_dataset = pd.DataFrame(test_dict)
test_dataset.to_csv(f"test_{test_nums}_same_diff.csv", index=False)
print(Counter(dataset['train']['labels']))
print(Counter(test_dict['labels']))

while True:
    # Plain prefix of the shuffled train split (no label ordering here).
    train_dict = dataset['train'][:split_unit]
    train_dataset = pd.DataFrame(train_dict)
    train_dataset.to_csv(f"train_{split_unit}_same_diff.csv", index=False)
    print(Counter(train_dict["labels"]))

    split_unit = split_unit * 2
    # NOTE(review): dataset['train'] already excludes the test rows, so
    # subtracting test_nums again may stop one doubling early — confirm intent.
    if split_unit > len(dataset['train']) - test_nums:
        break


In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict
from glob import glob
import pandas as pd
from collections import Counter

def augment_dataset(samples, augment_nums):
    """Repeat every column of a batch ``augment_nums`` times.

    Intended for a batched ``map`` call: ``samples`` maps column names to lists
    of values, and the result maps the same names to longer lists.

    Args:
        samples: Mapping of column name to a list of values. Must contain a
            ``'labels'`` column, which is used for the size report.
        augment_nums: Number of copies of the batch to emit.

    Returns:
        A new dict whose lists are the input lists repeated ``augment_nums``
        times. An empty dict is returned when ``augment_nums`` is less than 1.
        The input mapping and its lists are not modified.
    """
    # Size before augmentation.
    print(len(samples['labels']))

    if augment_nums < 1:
        new_samples = {}
    else:
        # list(...) makes a fresh list, so the input batch is never extended.
        new_samples = {key: list(samples[key]) * augment_nums for key in samples.keys()}

    # Size after augmentation (report the output, not the input again).
    print(len(new_samples['labels']) if new_samples else 0)

    return new_samples
        
        
# Build a test file plus label-balanced training files of doubling size from
# the "*_same.csv" exports, repeating "X" rows when there are too few of them.
# glob() returns files in an unspecified order; sort for a reproducible load order.
csv_files = sorted(glob("*_results_*_same.csv"))

split_unit = 200
test_nums = 100

dataset = load_dataset("csv", data_files={'train' : csv_files})
# shuffle() returns a new object instead of reordering in place, so the result
# has to be kept for the seed to have any effect.
dataset = dataset.shuffle(seed=42)

dataset = dataset['train'].train_test_split(test_size=test_nums,seed=42)

test_dict = dataset['test']
test_dataset = pd.DataFrame(test_dict)
test_dataset.to_csv(f"test_{test_nums}_same.csv", index=False)
print(Counter(dataset['train']['labels']))
print(Counter(test_dict['labels']))

# The per-label pools do not change between iterations, so filter once.
all_true_examples = dataset['train'].filter(lambda example: example["labels"] == "O")
all_false_examples = dataset['train'].filter(lambda example: example["labels"] == "X")

while True:
    half = split_unit // 2

    false_examples = all_false_examples
    if len(false_examples) < half:
        # Round up: rounding down can leave fewer than `half` "X" rows and an
        # unbalanced file. The surplus is trimmed by the select() below.
        augment_nums = -(-half // len(false_examples))

        print(augment_nums)
        false_examples = false_examples.map(augment_dataset ,batched=True ,fn_kwargs={'augment_nums':augment_nums},load_from_cache_file=False,keep_in_memory=False)
        print(false_examples)

    true_examples = all_true_examples.select(range(half))
    if len(false_examples) >= half:
        false_examples = false_examples.select(range(half))

    train_dict = concatenate_datasets([true_examples, false_examples])

    train_dataset = pd.DataFrame(train_dict)
    train_dataset.to_csv(f"train_{split_unit}_same_balan.csv", index=False)
    print(Counter(train_dict["labels"]))

    split_unit = split_unit * 2
    # Stop once the next half would need more "O" rows than exist; comparing
    # against the whole train split would let select() index past the "O" pool.
    if split_unit // 2 > len(all_true_examples):
        break

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from glob import glob
import json

def make_negative_samples(samples, intermediate_questions):
    """Build "X"-labelled variants of each row in a batch.

    Args:
        samples: Mapping of column name to a list of values; must contain
            ``'image_id'`` and ``'entity_id'`` columns.
        intermediate_questions: Nested mapping indexed by ``str(image_id)`` and
            then ``str(entity_id)``, yielding a list of dicts with ``'label'``,
            ``'question'`` and ``'answer'`` entries.

    Returns:
        ``{"data": [...]}`` with one list per input row. Each inner list holds a
        copy of that row for every candidate whose label is ``"negative"``, with
        ``intermediate_question`` / ``intermediate_answer`` taken from the
        candidate and ``labels`` set to ``"X"``.
    """
    per_row_negatives = []
    for row, image_id in enumerate(samples['image_id']):
        candidates = intermediate_questions[str(image_id)][str(samples['entity_id'][row])]

        row_negatives = []
        for candidate in candidates:
            if candidate['label'] != "negative":
                continue
            # Start from the row's own values, then override the question fields.
            row_negatives.append({
                **{column: samples[column][row] for column in samples.keys()},
                'intermediate_question': candidate['question'],
                'intermediate_answer': candidate['answer'],
                'labels': "X",
            })
        per_row_negatives.append(row_negatives)

    return {"data" : per_row_negatives}


def make_negative_dataset(positive_dataset, intermediate_questions):
    """Turn a dataset of rows into a de-duplicated dataset of negative rows.

    Maps ``make_negative_samples`` over ``positive_dataset`` in batches,
    flattens the per-row lists it stores under ``'data'``, drops duplicate rows
    and returns the remainder as a new ``Dataset``.

    Args:
        positive_dataset: Dataset to derive negatives from.
        intermediate_questions: Lookup passed through to ``make_negative_samples``.

    Returns:
        A ``Dataset`` built from the unique negative rows.
    """
    print(positive_dataset)
    mapped = positive_dataset.map(
        make_negative_samples,
        batched=True,
        fn_kwargs={"intermediate_questions" : intermediate_questions},
    )

    # Flatten the list-of-lists stored in the 'data' column.
    flat_negatives = []
    for row_negatives in mapped['data']:
        flat_negatives.extend(row_negatives)

    negatives_df = pd.DataFrame(flat_negatives)
    print(len(negatives_df))

    return Dataset.from_pandas(negatives_df.drop_duplicates(), preserve_index=False)


# csv_files = glob("*_results_*_same.csv")

# dataset = load_dataset("csv", data_files=csv_files)

# print(dataset)

# positive_dataset = dataset.filter(lambda example: example["labels"] == "O",with_indices=False)

# # with open("./sceneGraphs/train_sceneGraphs.json") as f:
# #     sceneGraphs = json.load(f)
    
# with open("./intermediate_questions.json") as f:
#     intermediate_questions = json.load(f)
    
# negative_dataset = make_negative_dataset(positive_dataset['train'], intermediate_questions)

In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict, Dataset
from glob import glob
import pandas as pd
from collections import Counter


# Build a label-balanced test file plus training files of doubling size in
# which the "X" half is generated from the selected "O" rows.
import json  # used below; imported here so the cell does not rely on an earlier cell

# glob() returns files in an unspecified order; sort for a reproducible load order.
csv_files = sorted(glob("*_results_*_same.csv"))

dataset = load_dataset("csv", data_files={'train' : csv_files})
# shuffle() returns a new object instead of reordering in place, so the result
# has to be kept for the seed to have any effect.
dataset = dataset.shuffle(seed=42)


true_examples = dataset.filter(lambda example: example["labels"] == "O",with_indices=False)['train']
false_examples = dataset.filter(lambda example: example["labels"] == "X",with_indices=False)['train']

# Hold out as many "O" rows as there are "X" rows so the test set is balanced.
true_dataset = true_examples.train_test_split(test_size=len(false_examples),seed=42)
false_dataset = DatasetDict()
false_dataset['test'] = false_examples

split_unit = 200

# 'train' ends up with "O" rows only; 'test' gets the held-out "O" rows plus
# every "X" row.
dataset = DatasetDict()
dataset_list = [true_dataset, false_dataset]
for key in true_dataset.keys():
    dataset[key] = concatenate_datasets([target_dataset[key] for target_dataset in dataset_list if key in target_dataset.keys()])

print(dataset)
test_dict = dataset['test']
test_nums = len(test_dict)
test_dataset = pd.DataFrame(test_dict)
test_dataset.to_csv(f"test_{test_nums}_same.csv", index=False)
print(Counter(dataset['train']['labels']))
print(Counter(test_dict['labels']))

with open("./intermediate_questions.json") as f:
    intermediate_questions = json.load(f)

while True:
    half = split_unit // 2

    true_examples = dataset['train'].select(range(half))

    # Generate the "X" half from the selected "O" rows.
    negative_dataset = make_negative_dataset(true_examples, intermediate_questions)
    false_examples = negative_dataset

    # Trim only when enough negatives were generated; otherwise keep them all.
    if len(false_examples) >= half:
        false_examples = false_examples.select(range(half))

    train_dict = concatenate_datasets([true_examples, false_examples])

    train_dataset = pd.DataFrame(train_dict)
    train_dataset.to_csv(f"train_{split_unit}_same_aug.csv", index=False)
    print(Counter(train_dict["labels"]))

    split_unit = split_unit * 2
    if split_unit // 2 > len(dataset['train']):
        break