In [113]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import social_iqa_experiment_utils
from tqdm import tqdm
import os
import csv
import utils
from transformers import T5Tokenizer, T5ForConditionalGeneration

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [114]:
# Read the dataset file as JSON Lines (one JSON record per line, hence lines=True).
# NOTE(review): the file name is spelled "socialIWa" while its directory is
# spelled "socialIQa" — confirm against the file on disk before "correcting" it.
json_path = './socialIQa_v1.4_withDims/socialIWa_v1.4_dev_wDims.jsonl'
df = pd.read_json(json_path, lines=True)
# Answer labels: the result-writing cells use these as the CSV column names that
# follow 'template' and pass them to the utils.get_probs_* helpers.
answer_choices = ['A', 'B', 'C']

In [115]:
social_iqa_t5_results_csv_path = './results/social_iqa_t5_results.csv'
# The results CSV doubles as a cache: the model is only loaded and queried when
# the file does not exist yet. Delete the file to force a re-run.
if not os.path.exists(social_iqa_t5_results_csv_path):
    # Create the output directory up front so a missing folder is caught
    # before the model is loaded.
    os.makedirs(os.path.dirname(social_iqa_t5_results_csv_path), exist_ok=True)

    # Load model
    max_memory = {0: "20GIB", 1: "20GIB", 2: "20GIB", 3: "20GIB"}
    t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl", device_map="auto", max_memory=max_memory)
    t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")

    # Write to a scratch file and only move it into place once every row has
    # been written. Otherwise an exception mid-loop would leave a truncated CSV
    # that the os.path.exists() guard above would treat as a finished run.
    t5_partial_csv_path = social_iqa_t5_results_csv_path + '.partial'

    # newline='' is what the csv module requires of the file object, and the
    # explicit UTF-8 encoding matches pd.read_csv's default when reading back.
    with open(t5_partial_csv_path, 'w', newline='', encoding='utf-8') as social_iqa_llm_results_file:
        writer = csv.writer(social_iqa_llm_results_file)
        # One column for the prompt, then one column per answer choice.
        header = ['template'] + answer_choices
        writer.writerow(header)

        # iterrows() is a generator, so tqdm needs total= to show real progress.
        for row_idx, row in tqdm(df.iterrows(), total=len(df)):
            template = social_iqa_experiment_utils.create_template_t5(row)
            # get_probs_t5 is called with a single-element batch; [0] takes that
            # element's result.
            probs = utils.get_probs_t5([template], answer_choices, t5_model, t5_tokenizer)[0]
            writer.writerow([template] + probs)

    # Publish the completed file under the cached name (atomic on the same filesystem).
    os.replace(t5_partial_csv_path, social_iqa_t5_results_csv_path)

In [116]:
social_iqa_davinci_results_csv_path = './results/social_iqa_davinci_results.csv'
# The results CSV doubles as a cache: the davinci helper is only queried when
# the file does not exist yet. Delete the file to force a re-run.
if not os.path.exists(social_iqa_davinci_results_csv_path):
    # Create the output directory up front so a missing folder is caught
    # before any row is processed.
    os.makedirs(os.path.dirname(social_iqa_davinci_results_csv_path), exist_ok=True)

    # Write to a scratch file and only move it into place once every row has
    # been written. Otherwise an exception mid-loop would leave a truncated CSV
    # that the os.path.exists() guard above would treat as a finished run.
    davinci_partial_csv_path = social_iqa_davinci_results_csv_path + '.partial'

    # newline='' is what the csv module requires of the file object, and the
    # explicit UTF-8 encoding matches pd.read_csv's default when reading back.
    with open(davinci_partial_csv_path, 'w', newline='', encoding='utf-8') as social_iqa_llm_results_file:
        writer = csv.writer(social_iqa_llm_results_file)
        # One column for the prompt, then one column per answer choice.
        header = ['template'] + answer_choices
        writer.writerow(header)

        # iterrows() is a generator, so tqdm needs total= to show real progress.
        for row_idx, row in tqdm(df.iterrows(), total=len(df)):
            template = social_iqa_experiment_utils.create_template_davinci(row)
            probs = utils.get_probs_davinci(template, answer_choices)
            writer.writerow([template] + probs)

    # Publish the completed file under the cached name (atomic on the same filesystem).
    os.replace(davinci_partial_csv_path, social_iqa_davinci_results_csv_path)

In [117]:
# Report the davinci run: load its results CSV and pass it, together with the
# dataset frame `df`, to analyze_result.
print('Davinci Results: \n')
# The path is set in the cell that writes the davinci results CSV.
davinci_results_df = pd.read_csv(social_iqa_davinci_results_csv_path)
# NOTE(review): the invalid-completion rate and per-dimension accuracies shown
# in this cell's output presumably come from analyze_result; its implementation
# lives in social_iqa_experiment_utils and is not visible here.
social_iqa_experiment_utils.analyze_result(davinci_results_df, df)

Davinci Results: 

Average Invalid Completion Rate : 0.026519429022858088

Dimension Intent: Accuracy: 0.7352941176470589
Dimension Need: Accuracy: 0.7017543859649122
Dimension Attr: Accuracy: 0.7177700348432056
Dimension Effect: Accuracy: 0.6880733944954128
Dimension React: Accuracy: 0.7686746987951807
Dimension Want: Accuracy: 0.7059859154929577

Overall Accuracy: 0.7221084953940634


In [118]:
# Report the T5 run: load its results CSV and pass it, together with the
# dataset frame `df`, to analyze_result.
print('T5 Results: \n')
# The path is set in the cell that writes the T5 results CSV.
t5_results_df = pd.read_csv(social_iqa_t5_results_csv_path)
# NOTE(review): the invalid-completion rate and per-dimension accuracies shown
# in this cell's output presumably come from analyze_result; its implementation
# lives in social_iqa_experiment_utils and is not visible here.
social_iqa_experiment_utils.analyze_result(t5_results_df, df)

T5 Results: 

Average Invalid Completion Rate : 0.001164901296521692

Dimension Intent: Accuracy: 0.8529411764705882
Dimension Need: Accuracy: 0.8552631578947368
Dimension Attr: Accuracy: 0.7804878048780488
Dimension Effect: Accuracy: 0.7798165137614679
Dimension React: Accuracy: 0.8361445783132531
Dimension Want: Accuracy: 0.8116197183098591

Overall Accuracy: 0.8188331627430911
