In [64]:
# Shared sampling configuration for the cells below.
# - n: how many statements are kept per dataset.
# - sample_len: formatted statements must be shorter than this many characters.
n, sample_len = 50, 100
import random
import json
import pandas as pd

In [65]:
# CommonsenseQA: draw a seeded pool of rows from the parquet file and store
# it as JSON Lines (one record per line).
import random  # also imported in the first cell

file_path = 'data/commonsense_qa.parquet'
df = pd.read_parquet(file_path)

# The pool is a fixed 1000 rows (not n rows); random_state pins the draw so
# the same rows are selected on every run.
sample = df.sample(n=1000, random_state=42)
sample.to_json(
    'data/samples/commonsense_qa_sample.jsonl',
    orient='records',
    lines=True,
)


In [66]:
# rebuild sample, construct formatted sentences
#
# Relevant fields of each record:
# - id (str)
# - question (str)
# - choices (dict) holding two parallel lists:
#     - label (list; ['A', 'B', 'C', 'D', 'E'] according to the original note)
#     - text (list, answers corresponding to labels)
# - answerKey (str, e.g. 'A')
#
# Under qualitative framework, interested in structure of questions

# load the saved pool (one JSON record per line)
with open('data/samples/commonsense_qa_sample.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

# format questions, keeping only short statements
formatted_cs_statements = []
for q in data:
    c = q['choices']
    labels, choices = c['label'], c['text']
    # Look up the answer text that belongs to the answer key.
    correct_answer = dict(zip(labels, choices))[q['answerKey']]
    statement = f"{q['question']} Answer: {correct_answer}"
    # Filter on the full formatted statement. The previous filter added
    # len(q['answerKey']), which is the length of the key letter, so the
    # length of the answer text was never taken into account.
    if len(statement) < sample_len:
        formatted_cs_statements.append({'id': q['id'], 'statement': statement})

# keep the first n statements that passed the length filter
formatted_cs_statements = formatted_cs_statements[:n]
print(formatted_cs_statements)


[{'id': '8d0b2621524162b7c900ebba92bf7019', 'statement': 'Bill sits down on a whoopee cushion, what sound does he make when he sits? Answer: flatulence'}, {'id': '136c08fe01cb3e029b18b62997267810', 'statement': 'What is likely heard by those going to a party? Answer: laughter'}, {'id': '819890987657d108834837af52d7b42b', 'statement': 'A handsome prince is a stock character common to what? Answer: fairy tale'}, {'id': '033908492f55b9092e0fbdbcf49bab94', 'statement': 'What covers the largest percentage of the pacific northwest? Answer: united states'}, {'id': 'b67f97a3fd9f4025d34f927a7a616512', 'statement': 'When black and white balls are kicked with feet what is being played? Answer: soccer game'}, {'id': '0377b8205981a2da2da4c5b80b27057c', 'statement': 'If someone had been smoking, an easy way to tell would be if they were what? Answer: coughing'}, {'id': '411b40d253960936de500ef3a57d650b', 'statement': 'Billy was smart but inexperienced.  In many ways he was what? Answer: ignorant'}, 

In [67]:
# extracting random samples from hellaSWAG dataset
file_path = 'data/hellaswag_train.jsonl'

# load jsonl file (one JSON record per line)
with open(file_path, 'r') as f:
    data = [json.loads(line) for line in f]

# split activity net and wikihow by the text of source_id
act_net_split = [q for q in data if 'activitynet' in q['source_id']]
wikihow_split = [q for q in data if 'wikihow' in q['source_id']]

# filter list for shortest questions + answers: the formatted statement
# (context + ' Answer: ' + the ending selected by label) must be shorter
# than sample_len characters
act_net_split = [
    q for q in act_net_split
    if len(f"{q['ctx']} Answer: {q['endings'][q['label']]}") < sample_len
]
wikihow_split = [
    q for q in wikihow_split
    if len(f"{q['ctx']} Answer: {q['endings'][q['label']]}") < sample_len
]

# randomly sample n rows, save as json
# A dedicated seeded generator makes the draw repeatable across runs (seed 42,
# the same value used as random_state for the CommonsenseQA pool) without
# touching the global random state.
sample = random.Random(42).sample(act_net_split, n)
with open('data/samples/hellaswag_act_net_sample.jsonl', 'w') as f:
    f.writelines(json.dumps(line) + '\n' for line in sample)

# wikihow sampling is currently disabled; kept here for reference:
# sample = random.sample(wikihow_split, n)
# with open('data/samples/hellaswag_wikihow_sample.jsonl', 'w') as f:
#     for line in sample:
#         f.write(json.dumps(line) + '\n')

# print sample size
print(len(sample))

50


In [68]:
# HellaSwag (activitynet sample): reload the saved records and turn each one
# into an {'id', 'statement'} entry.
#
# Fields read from each record:
# - ind: used as the id
# - ctx: context text
# - endings: list of candidate endings
# - label: index into endings that selects the ending to append

with open('data/samples/hellaswag_act_net_sample.jsonl', 'r') as f:
    data = list(map(json.loads, f))

# statement = context, then ' Answer: ', then the ending picked by label
formatted_swag_statements = [
    {
        'id': q['ind'],
        'statement': f"{q['ctx']} Answer: {q['endings'][q['label']]}",
    }
    for q in data
]

print(formatted_swag_statements)


[{'id': 9117, 'statement': 'An older man sits on an orange riding lawnmower. the man Answer: mows the yard in circles.'}, {'id': 36033, 'statement': 'A reclining woman knits with her hands. the woman Answer: talks without facing the camera.'}, {'id': 26750, 'statement': "A boy wearing glasses is seated at a table. he Answer: is attempting to solve a rubik's cube."}, {'id': 8484, 'statement': 'Two people are sitting at a table. they Answer: are playing a game of rock paper scissors.'}, {'id': 32772, 'statement': 'A roof is repaired by pvc. the roof Answer: is cleaned and the seal is made again.'}, {'id': 20155, 'statement': 'A band plays music on stage. a man Answer: plays the drums intensely.'}, {'id': 38555, 'statement': 'Man is vacuuming the seats of a car. man Answer: is cleaning the car holding a hosepipe.'}, {'id': 31097, 'statement': 'A woman wearing a santa hat is talking. she Answer: puts christmas lights onto the tree.'}, {'id': 15596, 'statement': 'A woman is standing in her 

In [69]:
# extracting random samples from socialIQA dataset
file_path_data = 'data/siqa_dev.jsonl'
file_path_labels = 'data/siqa_dev_labels.lst'

# load data (one JSON record per line)
with open(file_path_data, 'r') as f:
    data = [json.loads(line) for line in f]

# load labels (one integer per line); blank lines are skipped so that an
# empty trailing line does not make int() raise
with open(file_path_labels, 'r') as f:
    labels = [int(label) for label in f if label.strip()]

# merge labels into data
# strict=True raises ValueError when the two files differ in length instead of
# silently pairing records with the wrong labels.
for q, label in zip(data, labels, strict=True):
    q['label'] = label


def _si_statement(q):
    """Return the formatted statement for one record.

    label is 1-based and selects answerA / answerB / answerC.
    """
    answer = q['answer' + ['A', 'B', 'C'][q['label'] - 1]]
    return f"{q['context']} {q['question']} Answer: {answer}"


# filter list for short questions + answers
data = [q for q in data if len(_si_statement(q)) < sample_len]

print(len(data))

# randomly sample n rows, save as json
# A dedicated seeded generator makes the draw repeatable across runs (seed 42,
# the same value used as random_state for the CommonsenseQA pool).
sample = random.Random(42).sample(data, n)
with open('data/samples/social_iqa_sample.jsonl', 'w') as f:
    f.writelines(json.dumps(line) + '\n' for line in sample)

# print sample
print(sample)


70
[{'context': 'Kai swung through the trees while she was outside.', 'question': 'How would you describe Kai?', 'answerA': 'quiet', 'answerB': 'athletic', 'answerC': 'kai who has swung the trees', 'label': 2}, {'context': 'Robin thanked Jordan for the dinner.', 'question': 'Why did Robin do this?', 'answerA': 'Get a goodnight kiss', 'answerB': 'Be taken home by Jordan', 'answerC': 'caring', 'label': 1}, {'context': 'Jan gave the entire class an F on the assignment.', 'question': 'Why did Jan do this?', 'answerA': 'make them fail', 'answerB': 'catch them cheating', 'answerC': 'hurt their grades', 'label': 3}, {'context': 'Robin held her breath underwater in the pool.', 'question': 'How would you describe Robin?', 'answerA': 'exhausted', 'answerB': 'fun', 'answerC': 'tired', 'label': 2}, {'context': 'Quinn climbed into bed after a full, busy day of work.', 'question': 'Why did Quinn do this?', 'answerA': 'felt tired', 'answerB': 'shut the lights off', 'answerC': 'felt energized', 'label

In [70]:
# SocialIQA: reload the saved sample and turn each record into an
# {'id', 'statement'} entry.
#
# Fields read from each record:
# - context (str)
# - question (str)
# - answerA, answerB, answerC (str)
# - label (int, 1-based: 1 -> answerA, 2 -> answerB, 3 -> answerC)

with open('data/samples/social_iqa_sample.jsonl', 'r') as f:
    data = list(map(json.loads, f))

# The id is written as an empty string for every entry; only the fields
# listed above are read from the records.
formatted_si_statements = [
    {
        'id': '',
        'statement': f"{q['context']} {q['question']} Answer: {q['answer' + 'ABC'[q['label'] - 1]]}",
    }
    for q in data
]

print(formatted_si_statements)

[{'id': '', 'statement': 'Kai swung through the trees while she was outside. How would you describe Kai? Answer: athletic'}, {'id': '', 'statement': 'Robin thanked Jordan for the dinner. Why did Robin do this? Answer: Get a goodnight kiss'}, {'id': '', 'statement': 'Jan gave the entire class an F on the assignment. Why did Jan do this? Answer: hurt their grades'}, {'id': '', 'statement': 'Robin held her breath underwater in the pool. How would you describe Robin? Answer: fun'}, {'id': '', 'statement': 'Quinn climbed into bed after a full, busy day of work. Why did Quinn do this? Answer: felt tired'}, {'id': '', 'statement': 'Sasha got very tired, so she took a nap. How would Sasha feel afterwards? Answer: energetic'}, {'id': '', 'statement': 'Carson turned music down to go to college instead. How would you describe Carson? Answer: practical'}, {'id': '', 'statement': 'Robin was on a diet. How would Robin feel afterwards? Answer: Someone with good self control'}, {'id': '', 'statement':

In [71]:
# extracting random samples from piqa dataset
# NOTE(review): this cell reads the ctx / endings / label fields, and the sample
# it printed contains 'activity_label' and 'activitynet~...' source_id values,
# which look like HellaSwag records rather than PIQA. Confirm that
# 'data/piqa.jsonl' really holds the intended dataset.
file_path = 'data/piqa.jsonl'

# load jsonl file (one JSON record per line)
with open(file_path, 'r') as f:
    data = [json.loads(line) for line in f]

# filter list for short questions + answers: the formatted statement must be
# shorter than sample_len characters
data = [
    q for q in data
    if len(f"{q['ctx']} Answer: {q['endings'][q['label']]}") < sample_len
]

print(len(data))

# randomly sample n rows, save as json
# A dedicated seeded generator makes the draw repeatable across runs (seed 42,
# the same value used as random_state for the CommonsenseQA pool).
sample = random.Random(42).sample(data, n)
with open('data/samples/piqa_sample.jsonl', 'w') as f:
    f.writelines(json.dumps(line) + '\n' for line in sample)

# print sample
print(sample)


1030
[{'ind': 43808, 'activity_label': 'Using parallel bars', 'ctx_a': 'A gymnast mounts a high beam in a gym.', 'ctx_b': 'he', 'ctx': 'A gymnast mounts a high beam in a gym. he', 'split': 'train', 'split_type': 'indomain', 'label': 2, 'endings': ['then dismounts, landing in a sand pit on the gym floor.', 'steps onto the beam and spins around many times before performing.', 'flips and does several springs.', 'does a forward somersault, flipping down the steps.'], 'source_id': 'activitynet~v_fg_R9Vrr1KI'}, {'ind': 18146, 'activity_label': 'Table soccer', 'ctx_a': 'A foosball table is on a tiled floor.', 'ctx_b': 'people', 'ctx': 'A foosball table is on a tiled floor. people', 'split': 'train', 'split_type': 'indomain', 'label': 2, 'endings': ['are sitting down playing the game.', 'sit on the foosball table and juggle.', 'are playing foosball indoors.', 'are playing dodgeball on a foosball table.'], 'source_id': 'activitynet~v_DP9hfhq8sro'}, {'ind': 35367, 'activity_label': 'Tumbling', '

In [72]:
# "piqa" sample: reload the saved records and turn each one into an
# {'id', 'statement'} entry.
#
# Fields read from each record:
# - ind: used as the id (the printed ids are integers)
# - ctx (str): context text
# - endings (list of str): candidate endings
# - label (int): index of the correct ending

with open('data/samples/piqa_sample.jsonl', 'r') as f:
    data = list(map(json.loads, f))

# statement = context, then ' Answer: ', then the ending picked by label
formatted_pi_statements = [
    {
        'id': q['ind'],
        'statement': f"{q['ctx']} Answer: {q['endings'][q['label']]}",
    }
    for q in data
]

print(formatted_pi_statements)

[{'id': 43808, 'statement': 'A gymnast mounts a high beam in a gym. he Answer: flips and does several springs.'}, {'id': 18146, 'statement': 'A foosball table is on a tiled floor. people Answer: are playing foosball indoors.'}, {'id': 35367, 'statement': 'We see a girl jumping on a trampoline in doors. we Answer: see the girl spin in the air.'}, {'id': 48695, 'statement': 'A boy walks backwards on the lawn. the boy Answer: picks up the croquet stick and makes his shot.'}, {'id': 1598, 'statement': 'Letters are shown on a screen. a blue bucket Answer: is put into a sink.'}, {'id': 30304, 'statement': 'A person holds a ring in a small box. then Answer: , the man wraps the box with gift paper.'}, {'id': 17324, 'statement': 'A woman is dancing up on a stage. she Answer: is leading a large group in zumba class.'}, {'id': 46581, 'statement': 'A person is holding a cats paw and clipping their nails. they Answer: pet the can afterwards.'}, {'id': 5350, 'statement': 'Little kid is playing saxop

In [73]:
# extracting samples from winogrande dataset
file_path = 'data/winogrande_dev.jsonl'

# load jsonl file (one JSON record per line)
with open(file_path, 'r') as f:
    data = [json.loads(line) for line in f]

# filter list for short questions + answers: the formatted statement
# (sentence + ' Answer: ' + the option named by 'answer') must be shorter
# than sample_len characters
data = [
    q for q in data
    if len(f"{q['sentence']} Answer: {q['option' + str(q['answer'])]}") < sample_len
]

print(len(data))

# randomly sample n rows, save as json
# A dedicated seeded generator makes the draw repeatable across runs (seed 42,
# the same value used as random_state for the CommonsenseQA pool).
sample = random.Random(42).sample(data, n)
with open('data/samples/winogrande_sample.jsonl', 'w') as f:
    f.writelines(json.dumps(line) + '\n' for line in sample)

# print sample
print(sample)


226
[{'qID': '3SCKNODZ0XENKCVS5ZYT8YEDYG97NE-1', 'sentence': 'Adam loved dogs but Jason was afraid of them, so only _ petted the poodle.', 'option1': 'Adam', 'option2': 'Jason', 'answer': '1'}, {'qID': '3CRWSLD91K2ON02RAK0V65B8OXWOMD-1', 'sentence': 'I did not see the sign on the billboard on the highway because the _ was too tiny.', 'option1': 'sign', 'option2': 'billboard', 'answer': '1'}, {'qID': '3NQUW096N661AVG08JGY921UTSY9LU-2', 'sentence': 'the scanning tool could not get through the cervix because the _ was too big.', 'option1': 'cervix', 'option2': 'tool', 'answer': '2'}, {'qID': '3G3AJKPCXLQKDOHNA39XM2SABXEY40-1', 'sentence': "Brian was jealous of Brett's new car because _ couldn't afford to buy a new car.", 'option1': 'Brian', 'option2': 'Brett', 'answer': '1'}, {'qID': '3V7ICJJAZCU3U5NJK97D6PIHJB24BJ-1', 'sentence': 'Donald was able to catch the Frisbee thrown by Eric, then _ threw the Frisbee back.', 'option1': 'Donald', 'option2': 'Eric', 'answer': '1'}, {'qID': '3SA4EMRV

In [74]:
# WinoGrande: reload the saved sample and turn each record into an
# {'id', 'statement'} entry.
#
# Fields read from each record:
# - qID (str): used as the id
# - sentence (str)
# - option1, option2 (str)
# - answer: appended to 'option' to pick the correct option field

with open('data/samples/winogrande_sample.jsonl', 'r') as f:
    data = list(map(json.loads, f))

# statement = sentence, then ' Answer: ', then the text of the chosen option
formatted_wg_statements = [
    {
        'id': q['qID'],
        'statement': f"{q['sentence']} Answer: {q['option' + format(q['answer'])]}",
    }
    for q in data
]

print(formatted_wg_statements)

[{'id': '3SCKNODZ0XENKCVS5ZYT8YEDYG97NE-1', 'statement': 'Adam loved dogs but Jason was afraid of them, so only _ petted the poodle. Answer: Adam'}, {'id': '3CRWSLD91K2ON02RAK0V65B8OXWOMD-1', 'statement': 'I did not see the sign on the billboard on the highway because the _ was too tiny. Answer: sign'}, {'id': '3NQUW096N661AVG08JGY921UTSY9LU-2', 'statement': 'the scanning tool could not get through the cervix because the _ was too big. Answer: tool'}, {'id': '3G3AJKPCXLQKDOHNA39XM2SABXEY40-1', 'statement': "Brian was jealous of Brett's new car because _ couldn't afford to buy a new car. Answer: Brian"}, {'id': '3V7ICJJAZCU3U5NJK97D6PIHJB24BJ-1', 'statement': 'Donald was able to catch the Frisbee thrown by Eric, then _ threw the Frisbee back. Answer: Donald'}, {'id': '3SA4EMRVJXGOZB6SKD2CP6XTJPQ0PH-1', 'statement': 'Betty had more burns on their hands than Amy because _ worked as a welder. Answer: Betty'}, {'id': '3AA88CN98P15RQ6QVVWDTJH6D14YKV-1', 'statement': 'They were wanting  to mo

In [75]:
# Combine the per-dataset statement lists into one table and save it as CSV.
# Order: CommonsenseQA, HellaSwag, SocialIQA, piqa sample, WinoGrande.
formatted_statements = [
    *formatted_cs_statements,
    *formatted_swag_statements,
    *formatted_si_statements,
    *formatted_pi_statements,
    *formatted_wg_statements,
]

# One row per statement; the DataFrame index is left out of the CSV.
formatted_statements_df = pd.DataFrame(formatted_statements)
formatted_statements_df.to_csv(
    'data/samples/formatted_statements.csv',
    index=False,
)
print(formatted_statements_df)

                                   id  \
0    8d0b2621524162b7c900ebba92bf7019   
1    136c08fe01cb3e029b18b62997267810   
2    819890987657d108834837af52d7b42b   
3    033908492f55b9092e0fbdbcf49bab94   
4    b67f97a3fd9f4025d34f927a7a616512   
..                                ...   
245  3ZZAYRN1I857UKRI3FD7KHU86YXOTC-1   
246  3EQPA8A374UALWAD9WYK6BQVZSFZJY-1   
247  3B9J25CZ27R56VH0OAZQFC46G6YCSP-1   
248  3D4BBDG7ZJA1MEWLYVYP74WUODH3CG-2   
249  3KQC8JMJGCQD9X8U0P0QQX51CLEH3H-1   

                                             statement  
0    Bill sits down on a whoopee cushion, what soun...  
1    What is likely heard by those going to a party...  
2    A handsome prince is a stock character common ...  
3    What covers the largest percentage of the paci...  
4    When black and white balls are kicked with fee...  
..                                                 ...  
245  Jody wanted new mascara and eyeliner, either b...  
246  Joseph has an allergy to mangoes, but Nelson d