In [1]:
from google.colab import drive

# Mount Google Drive so that paths under /content/drive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [10]:
# csv for answering model, trained by margin loss -> answer_margin.csv

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full dataset; 'utf-8-sig' drops a leading BOM if the file has one.
df = pd.read_csv('/content/drive/MyDrive/CSEG321/dataset/total_dataset.csv', encoding='utf-8-sig')

# Cut each context at its first literal '[BLANK]' marker.
# str.partition always returns three columns (before, marker, after), so the
# 'suffix' column exists even if no context contains the marker (it is then an
# empty string), and the marker needs no regex escaping.
context_parts = df['context'].str.partition('[BLANK]')
df['prefix'], df['suffix'] = context_parts[0], context_parts[2]

# 80/10/10 train/valid/test split, stratified by grade, with a fixed seed.
train_df, tmp = train_test_split(df, test_size=0.2, stratify=df['grade'], random_state=42)
valid_df, test_df = train_test_split(tmp, test_size=0.5, stratify=tmp['grade'], random_state=42)

# Label each partition on a new frame instead of mutating the objects returned
# by train_test_split in place.
train_df = train_df.assign(split='train')
valid_df = valid_df.assign(split='valid')
test_df = test_df.assign(split='test')

# Stack the partitions in train/valid/test order and write them to one CSV.
answer_margin = pd.concat([train_df, valid_df, test_df], ignore_index=True)
answer_margin.to_csv('/content/drive/MyDrive/CSEG321/dataset/answer_margin.csv',
                     index=False, encoding='utf-8-sig')

# Quick sanity check of what was written.
print("Columns:", answer_margin.columns.tolist())
print("First row:\n", answer_margin.iloc[0].to_dict())
print("Split counts:\n", answer_margin['split'].value_counts())

Columns: ['problem', 'context', 'choice_1', 'choice_2', 'choice_3', 'choice_4', 'choice_5', 'explanation', 'answer', 'grade', 'prefix', 'suffix', 'split']
First row:
 {'problem': 'Choose from (1) to (5) the most appropriate words to be included in the [BLANK] of the following article, and explain in detail why the words are appropriate', 'context': 'Most mice in the wild are eaten or die before their life span of two years is over. They die from external causes, such as disease, starvation, or predators, not due to internal causes, such as aging. That is why nature has made mice to live, on average, for no longer than two years. Now we have arrived at an important point: The average life span of an animal species, or the rate at which it ages, is determined by [BLANK]. That explains why a bat can live to be 30 years old. In contrast to mice, bats can fly, which is why they can escape from danger much faster. Thanks to their wings, bats can also cover longer distances and are better abl

In [14]:
# csv for answering model, trained by context -> answer_context.csv

import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split

# Read total_dataset.csv from the mounted Drive folder (BOM-tolerant UTF-8).
TOTAL_DATASET_CSV = '/content/drive/MyDrive/CSEG321/dataset/total_dataset.csv'
df = pd.read_csv(TOTAL_DATASET_CSV, encoding='utf-8-sig')

def fill_full(row):
    """Return the row's context with each '[BLANK]' replaced by the correct choice.

    The 'answer' field is cast to int and used to select the matching
    'choice_<n>' field, whose text is substituted for the marker.
    """
    choice_column = 'choice_' + str(int(row['answer']))
    return row['context'].replace('[BLANK]', row[choice_column])
# Build the answer-filled passage for every row.
df['full_context'] = df.apply(fill_full, axis=1)

# 80/10/10 train/valid/test split, stratified by grade, with a fixed seed.
train_df, tmp = train_test_split(
    df, test_size=0.2, stratify=df['grade'], random_state=42
)
valid_df, test_df = train_test_split(
    tmp, test_size=0.5, stratify=tmp['grade'], random_state=42
)

# Tag each partition with its split name.
partitions = {'train': train_df, 'valid': valid_df, 'test': test_df}
for label, frame in partitions.items():
    frame['split'] = label

# Keep only the exported columns and stack the partitions in train/valid/test order.
export_columns = ['full_context', 'answer', 'grade', 'split']
answer_context = pd.concat(
    [frame[export_columns] for frame in partitions.values()],
    ignore_index=True,
)

answer_context.to_csv(
    '/content/drive/MyDrive/CSEG321/dataset/answer_context.csv',
    index=False,
    encoding='utf-8-sig',
)

print("Columns:", answer_context.columns.tolist())
print("First row:\n", answer_context.iloc[0].to_dict())
print("Split counts:\n", answer_context['split'].value_counts())

Columns: ['full_context', 'answer', 'grade', 'split']
First row:
 {'full_context': 'Most mice in the wild are eaten or die before their life span of two years is over. They die from external causes, such as disease, starvation, or predators, not due to internal causes, such as aging. That is why nature has made mice to live, on average, for no longer than two years. Now we have arrived at an important point: The average life span of an animal species, or the rate at which it ages, is determined by the average time that this animal species can survive in the wild. That explains why a bat can live to be 30 years old. In contrast to mice, bats can fly, which is why they can escape from danger much faster. Thanks to their wings, bats can also cover longer distances and are better able to find food. Every genetic change in the past that made it possible for a bat to live longer was useful, because bats are much better able than mice to flee from danger, find food, and survive.', 'answer': 2

In [16]:
# csv for explanation model, using only the correct choice -> explanation_only_answer.csv

import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Read total_dataset.csv from the mounted Drive folder (BOM-tolerant UTF-8).
TOTAL_DATASET_CSV = '/content/drive/MyDrive/CSEG321/dataset/total_dataset.csv'
df = pd.read_csv(TOTAL_DATASET_CSV, encoding='utf-8-sig')

def replace_refs(row):
    """Rewrite '(n)' choice references in the explanation as the choice text.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'explanation' and 'choice_1' .. 'choice_5'.

    Returns
    -------
    str
        The explanation with each '(1)'..'(5)' replaced by the text of the
        corresponding choice. A reference is left untouched when its digit is
        outside 1-5 or when the referenced choice cell is empty (NaN/None).
    """
    expl = row['explanation']

    # Build the digit -> choice-text lookup. Empty cells are skipped and any
    # other non-string value is stringified, because re.sub raises TypeError
    # when the replacement callback returns something that is not a str.
    choices = {}
    for i in range(1, 6):
        value = row[f'choice_{i}']
        if pd.isna(value):
            continue
        choices[str(i)] = str(value)

    def repl(m):
        # Fall back to the original '(n)' text when there is nothing to substitute.
        return choices.get(m.group(1), m.group(0))

    # Find every single digit wrapped in parentheses, e.g. '(3)', and substitute it.
    return re.sub(r'\((\d)\)', repl, expl)

# Resolve '(n)' references inside each explanation before using it as the target.
df['explanation_fixed'] = df.apply(replace_refs, axis=1)


def _answer_only_record(row):
    """Build one input/target/grade example from a row, using only the correct choice."""
    correct_text = row[f"choice_{int(row['answer'])}"]
    passage = row['context'].replace('[BLANK]', correct_text)
    return {
        'input':  f"Context: {passage}\nAnswer: {correct_text}\nExplain:",
        'target': row['explanation_fixed'],
        'grade':  row['grade'],
    }


records = [_answer_only_record(row) for _, row in df.iterrows()]
new_df = pd.DataFrame(records)

# 80/10/10 train/valid/test split, stratified by grade, with a fixed seed.
train_df, tmp_df = train_test_split(
    new_df, test_size=0.2, stratify=new_df['grade'], random_state=42
)
valid_df, test_df = train_test_split(
    tmp_df, test_size=0.5, stratify=tmp_df['grade'], random_state=42
)

# Tag each partition with its split name.
partitions = {'train': train_df, 'valid': valid_df, 'test': test_df}
for label, frame in partitions.items():
    frame['split'] = label

# Stack the partitions in train/valid/test order and write them to one CSV.
out = pd.concat(list(partitions.values()), ignore_index=True)
out.to_csv(
    '/content/drive/MyDrive/CSEG321/dataset/explanation_only_answer.csv',
    index=False,
    encoding='utf-8-sig',
)

print("Columns:", out.columns.tolist())
print("First row:\n", out.iloc[0].to_dict())
print("Split counts:\n", out['split'].value_counts())


Columns: ['input', 'target', 'grade', 'split']
First row:
 {'input': 'Context: Most mice in the wild are eaten or die before their life span of two years is over. They die from external causes, such as disease, starvation, or predators, not due to internal causes, such as aging. That is why nature has made mice to live, on average, for no longer than two years. Now we have arrived at an important point: The average life span of an animal species, or the rate at which it ages, is determined by the average time that this animal species can survive in the wild. That explains why a bat can live to be 30 years old. In contrast to mice, bats can fly, which is why they can escape from danger much faster. Thanks to their wings, bats can also cover longer distances and are better able to find food. Every genetic change in the past that made it possible for a bat to live longer was useful, because bats are much better able than mice to flee from danger, find food, and survive.\nAnswer: the avera

In [15]:
# csv for explanation model, regarding all choices -> explanation_all_options.csv

import pandas as pd
from sklearn.model_selection import train_test_split

# Read total_dataset.csv from the mounted Drive folder (BOM-tolerant UTF-8).
df = pd.read_csv(
    '/content/drive/MyDrive/CSEG321/dataset/total_dataset.csv',
    encoding='utf-8-sig',
)


def _all_options_record(row):
    """Build one input/target/grade example that lists all five choices in the prompt."""
    # Number the choices 1..5 and join them on a single line.
    option_texts = [row[f'choice_{n}'] for n in range(1, 6)]
    numbered = [f"{n}. {text}" for n, text in enumerate(option_texts, 1)]
    opts_str = " ".join(numbered)
    prompt = (
        f"Context: {row['context']}\n"
        f"Choices: {opts_str}\n"
        f"Answer: {row['answer']}\nExplain:"
    )
    return {'input': prompt, 'target': row['explanation'], 'grade': row['grade']}


records = [_all_options_record(row) for _, row in df.iterrows()]
all_opts_df = pd.DataFrame(records)

# 80/10/10 train/valid/test split, stratified by grade, with a fixed seed.
train_df, tmp = train_test_split(
    all_opts_df, test_size=0.2, stratify=all_opts_df['grade'], random_state=42
)
valid_df, test_df = train_test_split(
    tmp, test_size=0.5, stratify=tmp['grade'], random_state=42
)

# Tag each partition with its split name.
partitions = {'train': train_df, 'valid': valid_df, 'test': test_df}
for label, frame in partitions.items():
    frame['split'] = label

# Stack the partitions in train/valid/test order and write them to one CSV.
explanation_all = pd.concat(list(partitions.values()), ignore_index=True)
explanation_all.to_csv(
    '/content/drive/MyDrive/CSEG321/dataset/explanation_all_options.csv',
    index=False,
    encoding='utf-8-sig',
)

print("Columns:", explanation_all.columns.tolist())
print("First row:\n", explanation_all.iloc[0].to_dict())
print("Split counts:\n", explanation_all['split'].value_counts())


Columns: ['input', 'target', 'grade', 'split']
First row:
 {'input': 'Context: Most mice in the wild are eaten or die before their life span of two years is over. They die from external causes, such as disease, starvation, or predators, not due to internal causes, such as aging. That is why nature has made mice to live, on average, for no longer than two years. Now we have arrived at an important point: The average life span of an animal species, or the rate at which it ages, is determined by [BLANK]. That explains why a bat can live to be 30 years old. In contrast to mice, bats can fly, which is why they can escape from danger much faster. Thanks to their wings, bats can also cover longer distances and are better able to find food. Every genetic change in the past that made it possible for a bat to live longer was useful, because bats are much better able than mice to flee from danger, find food, and survive.\nChoices: 1. the distance that migrating species can travel for their surviv