In [None]:
!pip install datasets

In [None]:
import json
import os
import re

import numpy as np
import pandas as pd
from datasets import load_dataset
from google.colab import drive

Please adjust the path below so that it points to the folder in your Google Drive that contains the HC3 and Reddit ELI5 data.

In [None]:
# Importing the HC3 dataset from HuggingFace raised an error, so a locally
# downloaded copy stored on Google Drive is used instead.
DRIVE_MOUNT_POINT = '/content/drive'
DATA_DIR = "/content/drive/MyDrive/AI"

drive.mount(DRIVE_MOUNT_POINT)
# Make the data folder the working directory.
os.chdir(DATA_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load HC3

In [None]:
# Read the locally stored HC3 file (JSON Lines, one record per line) into a DataFrame.
HC3_JSONL_FILE = "reddit_eli5.jsonl"
hc3_df = pd.read_json(HC3_JSONL_FILE, lines=True)
hc3_df

Unnamed: 0,question,human_answers,chatgpt_answers,index
0,"Why is every book I hear about a "" NY Times # ...","[Basically there are many categories of "" Best...",[There are many different best seller lists th...,
1,"If salt is so bad for cars , why do we use it ...",[salt is good for not dying in car crashes and...,[Salt is used on roads to help melt ice and sn...,
2,Why do we still have SD TV channels when HD lo...,[The way it works is that old TV stations got ...,[There are a few reasons why we still have SD ...,
3,Why has nobody assassinated Kim Jong - un He i...,[You ca n't just go around assassinating the l...,[It is generally not acceptable or ethical to ...,
4,How was airplane technology able to advance so...,[Wanting to kill the shit out of Germans drive...,[After the Wright Brothers made the first powe...,
...,...,...,...,...
17107,Why is the general Xbox One reveal feedback ne...,[A few items people do n't seem to like : 1 . ...,[It's difficult to say exactly why the general...,
17108,If infrared is thermal why does n't my remote ...,"[It 's very , very low power . Around the orde...",[Infrared light is a type of light that we can...,
17109,Why are cars all about their brake - horse pow...,[Brake horsepower has nothing to do with brake...,"[Cars are not just about how fast they can go,...",
17110,"What does a CO^2 scrubber do , and how does it...",[A [ scrubber ] ( URL_0 ) is a engineering pro...,[A CO2 scrubber is a device that removes carbo...,


Process HC3 so that each row holds exactly one human answer. In both datasets, a single question can have multiple human answers.

In [None]:
# The "index" column holds only NaN values, so it is removed up front.
hc3_without_index = hc3_df.drop(columns=["index"])

# Retain only the questions that have a non-empty list of ChatGPT answers.
has_chatgpt_ans = hc3_without_index["chatgpt_answers"].map(lambda answers: answers != [])
hc3_with_chatgpt = hc3_without_index[has_chatgpt_ans]

# Flatten to one row per human answer. The human answer at position `pos` is paired
# with the ChatGPT answer at the same position; when there are fewer ChatGPT answers
# than human answers (e.g. 2 human, 1 ChatGPT), the last ChatGPT answer is reused.
row_list = [
  [question, human_ans, chatgpt_answers[min(pos, len(chatgpt_answers) - 1)]]
  for question, human_answers, chatgpt_answers in zip(
    hc3_with_chatgpt["question"],
    hc3_with_chatgpt["human_answers"],
    hc3_with_chatgpt["chatgpt_answers"],
  )
  for pos, human_ans in enumerate(human_answers)
]

processed_hc3_df = pd.DataFrame(row_list, columns=["question", "human_answer", "chatgpt_answer"])
processed_hc3_df.head()

Unnamed: 0,question,human_answer,chatgpt_answer
0,"Why is every book I hear about a "" NY Times # ...","Basically there are many categories of "" Best ...",There are many different best seller lists tha...
1,"Why is every book I hear about a "" NY Times # ...","If you 're hearing about it , it 's because it...",There are many different best seller lists tha...
2,"Why is every book I hear about a "" NY Times # ...","One reason is lots of catagories . However , h...",There are many different best seller lists tha...
3,"If salt is so bad for cars , why do we use it ...",salt is good for not dying in car crashes and ...,Salt is used on roads to help melt ice and sno...
4,"If salt is so bad for cars , why do we use it ...","In Minnesota and North Dakota , they tend to u...",Salt is used on roads to help melt ice and sno...


Load ELI5 (Training)

In [None]:
# Download the ELI5 dataset through the `datasets` library; the training split
# ("train_eli5") is selected from it afterwards.
eli_5_dataset = load_dataset(path="eli5")

Downloading builder script:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading and preparing dataset eli5/LFQA_reddit to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa...


Downloading:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/576M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

Dataset eli5 downloaded and prepared to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa. Subsequent calls will reuse this data.


  0%|          | 0/9 [00:00<?, ?it/s]

In [None]:
# Pick the ELI5 training split and convert it to a pandas DataFrame.
TRAIN_SPLIT_NAME = "train_eli5"
train_eli_5_dataset = eli_5_dataset[TRAIN_SPLIT_NAME]
train_eli_5_df = train_eli_5_dataset.to_pandas()
train_eli_5_df.head(n=5)

Unnamed: 0,q_id,title,selftext,document,subreddit,answers,title_urls,selftext_urls,answers_urls
0,1oy5tc,in football whats the point of wasting the fir...,,,explainlikeimfive,"{'a_id': ['ccwtgnz', 'ccwtmho', 'ccwt946', 'cc...",{'url': []},{'url': []},{'url': []}
1,2lojul,Why are different tiers (regular < mid < premi...,I've noticed that the difference in price betw...,,explainlikeimfive,"{'a_id': ['clwqgxl', 'clwqpjq', 'clwuh3s'], 't...",{'url': []},{'url': []},{'url': []}
2,8v5e3s,Stars and Visibility,Why do stars in the night's sky seem to disapp...,,explainlikeimfive,"{'a_id': ['e1kpw6u'], 'text': ['It's a quirk o...",{'url': []},{'url': []},{'url': []}
3,1v3wij,How do we know all the money the government is...,We hear about these large billion dollar bank ...,,explainlikeimfive,"{'a_id': ['ceohakd', 'ceoikhs', 'ceoji15', 'ce...",{'url': []},{'url': ['http://dealbook.nytimes.com/2014/01/...,{'url': ['http://www.reuters.com/article/2013/...
4,2jlp6f,What are good and bad sides of manual and auto...,Please consider I'm not a driver. Automatic se...,,explainlikeimfive,"{'a_id': ['clcur3j'], 'text': ['Automatics wei...",{'url': []},{'url': []},{'url': []}


Expand answers column such that there is one answer per row

In [None]:
# Expand the nested "answers" column so that every answer gets its own row.
# NOTE: after this step the question ID (q_id) is no longer unique --
# refer to https://huggingface.co/datasets/eli5 for the schema.

# Drop the URL columns, which are not used in the rest of the notebook.
processed_train_eli_5_df = train_eli_5_df.drop(["title_urls", "selftext_urls", "answers_urls"], axis=1)

# One question can have several answers, so emit one output row per answer:
# the question's own column values followed by that answer's id, text and score.
row_list = []
for index, row in processed_train_eli_5_df.iterrows():
  answer_dict = row["answers"]
  id_list = answer_dict["a_id"]
  text_list = answer_dict["text"]
  score_list = answer_dict["score"]
  # The question-level values are identical for every answer of this row,
  # so convert them to a list once instead of once per answer.
  question_vals = row.values.flatten().tolist()
  for i in range(len(id_list)):
    # Named `answer_id` rather than `id`: cell variables are notebook globals, and
    # `id` would replace the builtin `id()` for every later cell.
    answer_id = id_list[i]
    text = text_list[i]
    score = score_list[i]
    row_list.append(question_vals + [answer_id, text, score])

new_colname_list = processed_train_eli_5_df.columns.to_list() + ["a_id", "text", "score"]

processed_train_eli_5_df = pd.DataFrame(row_list, columns=new_colname_list)
processed_train_eli_5_df.head()

Unnamed: 0,q_id,title,selftext,document,subreddit,answers,a_id,text,score
0,1oy5tc,in football whats the point of wasting the fir...,,,explainlikeimfive,"{'a_id': ['ccwtgnz', 'ccwtmho', 'ccwt946', 'cc...",ccwtgnz,"Keep the defense honest, get a feel for the pa...",3
1,1oy5tc,in football whats the point of wasting the fir...,,,explainlikeimfive,"{'a_id': ['ccwtgnz', 'ccwtmho', 'ccwt946', 'cc...",ccwtmho,"If you throw the ball all the time, then the d...",2
2,1oy5tc,in football whats the point of wasting the fir...,,,explainlikeimfive,"{'a_id': ['ccwtgnz', 'ccwtmho', 'ccwt946', 'cc...",ccwt946,In most cases the O-Line is supposed to make a...,2
3,1oy5tc,in football whats the point of wasting the fir...,,,explainlikeimfive,"{'a_id': ['ccwtgnz', 'ccwtmho', 'ccwt946', 'cc...",ccwvj0u,"I you don't like those type of plays, watch CF...",2
4,2lojul,Why are different tiers (regular < mid < premi...,I've noticed that the difference in price betw...,,explainlikeimfive,"{'a_id': ['clwqgxl', 'clwqpjq', 'clwuh3s'], 't...",clwqgxl,"As someone who uses quality Premium, I wish th...",14


Keep only the "Explain Like I'm Five" (r/explainlikeimfive) Reddit data in ELI5

In [None]:
# Keep only the rows whose subreddit is "explainlikeimfive".
# (A stray trailing `f` after the closing bracket previously made this cell a SyntaxError.)
is_explain_like_im_five = processed_train_eli_5_df["subreddit"] == "explainlikeimfive"
explain_like_im_five_eli5_df = processed_train_eli_5_df[is_explain_like_im_five]

Attempt to filter the HC3 data points out of ELI5, so that we do not poll ChatGPT for data that HC3 already contains

In [None]:
# Give the ELI5 columns the same names as in the processed HC3 frame
# ("title" -> "question", "text" -> "human_answer"), then keep only the
# question/answer IDs together with the question and answer text.
hc3_style_names = {"title": "question", "text": "human_answer"}
kept_columns = ['q_id', 'a_id', 'question', 'human_answer']

renamed_eli5_df = explain_like_im_five_eli5_df.rename(columns=hc3_style_names)
processed_for_hc3_eli5_df = renamed_eli5_df[kept_columns]

ISSUE: Unable to remove HC3 from ELI5 with exact match of questions due to adjustment of questions in HC3 and lack of common key

In [None]:
# Attempt to standardise the questions of both datasets so they can be compared by
# exact string match: collapse whitespace, strip the "explain like I'm five"
# phrases from the HC3 questions, and lower-case everything.
added_text = "Please explain like I'm five."
added_text_1 = "Explain like I'm five."


def _collapse_whitespace(text):
  """Return `text` with each run of whitespace replaced by one space and both ends trimmed."""
  return " ".join(text.split())


def _standardise_question(question, removed_phrases=()):
  """Return a lower-cased, whitespace-normalised copy of `question`.

  Args:
    question: the raw question string.
    removed_phrases: phrases deleted from the question, in the given order,
      after the first whitespace normalisation.

  Returns:
    The standardised question string.
  """
  cleaned = _collapse_whitespace(question)
  for phrase in removed_phrases:
    cleaned = cleaned.replace(phrase, "")
  # Deleting a phrase leaves its neighbouring spaces behind (typically a trailing
  # " "), which on its own would prevent an exact match with an ELI5 title, so the
  # whitespace is normalised once more after the removal.
  return _collapse_whitespace(cleaned).lower()


hc3_qn_list = [
  _standardise_question(qn, (added_text, added_text_1))
  for qn in processed_hc3_df["question"].to_list()
]
eli5_qn_list = [
  _standardise_question(qn)
  for qn in processed_for_hc3_eli5_df["question"].to_list()
]

# Compare the two question sets; `filtered_set` holds the ELI5 questions not found in HC3.
hc3_question_set = set(hc3_qn_list)
print(f"HC3 question count: {len(hc3_question_set)}")
eli5_question_set = set(eli5_qn_list)
print(f"Eli5 question count: {len(eli5_question_set)}")
filtered_set = eli5_question_set.difference(hc3_question_set)
print(f"Filtered set count: {len(filtered_set)}")

HC3 question count: 15172
Eli5 question count: 266804
Filtered set count: 266804


Attempt to filter out the HC3 data by matching on the human answer column instead, to improve the filtering

In [None]:
# Filter by human answer: an ELI5 row is dropped when its normalised answer text
# also occurs among the normalised HC3 human answers.
# (`re` is already imported in the notebook's import cell.)
_NON_ALPHANUMERIC = re.compile('[^A-Za-z0-9]+')


def _normalise_answer(answer):
  """Return `answer` trimmed, lower-cased and with every character outside A-Z, a-z, 0-9 removed."""
  return _NON_ALPHANUMERIC.sub('', answer.strip().lower())


hc3_ans_list = [_normalise_answer(ans) for ans in processed_hc3_df["human_answer"].to_list()]
hc3_ans_set = set(hc3_ans_list)

eli5_ans_list = [_normalise_answer(ans) for ans in processed_for_hc3_eli5_df["human_answer"].to_list()]
# `.assign` builds a new frame instead of writing a column into a frame that was
# itself derived by slicing, which is what triggers pandas' SettingWithCopyWarning.
processed_for_hc3_eli5_df = processed_for_hc3_eli5_df.assign(human_answer_normalised=eli5_ans_list)

not_in_hc3 = [ans not in hc3_ans_set for ans in eli5_ans_list]
# Explicit copy so that later cells can add columns to the result safely.
filtered_eli5_df = processed_for_hc3_eli5_df[not_in_hc3].copy()

hc3_question_set = set(hc3_qn_list)
print(f"HC3 question count: {len(hc3_question_set)}")
eli5_question_set = set(eli5_qn_list)
print(f"Eli5 question count: {len(eli5_question_set)}")
# NOTE(review): this count uses the raw "question" strings, whereas the two counts
# above use the standardised question lists -- confirm the numbers are meant to be compared.
filtered_set = set(filtered_eli5_df["question"])
print(f"Filtered set count: {len(filtered_set)}")

HC3 question count: 15172
Eli5 question count: 266804
Filtered set count: 263016


Split ELI5 dataset that is not in HC3 for us to poll ChatGPT API

In [None]:
# Write the ELI5 questions that were not found in HC3 to several CSV files.
# (numpy is imported as `np` in the notebook's import cell.)
NUM_CHUNKS = 4  # number of CSV files the question list is split into

# Only q_id and question are exported. q_id is kept because one question can have
# several answers, so the human answers have to be mapped back from the ELI5 data.
# The former empty "chatgpt_answer" placeholder column is no longer created: it was
# written onto `filtered_eli5_df` itself (raising SettingWithCopyWarning) and was
# discarded again by this column selection.
output_eli5_df = filtered_eli5_df[["q_id", "question"]]

# NOTE(review): keep=False removes *every* row whose question text occurs more than
# once, so a question with several remaining answers is dropped entirely instead of
# being reduced to one row. Confirm this is intended; keep="first" would retain one
# row per question.
output_eli5_df = output_eli5_df.drop_duplicates(subset=["question"], keep=False)

# Split by row position rather than passing the DataFrame to np.array_split; the
# chunk boundaries are the same (the first len % NUM_CHUNKS chunks get one extra row).
chunk_positions = np.array_split(np.arange(len(output_eli5_df)), NUM_CHUNKS)
chunked_df_list = [output_eli5_df.iloc[positions] for positions in chunk_positions]
for i, df in enumerate(chunked_df_list):
  name = f"Dataset{i}.csv"
  df.to_csv(name, index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_eli5_df["chatgpt_answer"] = ""


'\nchunked_df_list = np.array_split(output_eli5_df, 4)\nfor i in range(len(chunked_df_list)):\n  df = chunked_df_list[i]\n  name = f"Dataset{i}.csv"\n  df.to_csv(name, index=False)\n'