In [None]:
!pip install openai
!pip install --upgrade openaiimport numpy as np 
import openai
import json
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

api_key = "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"
openai.api_key = api_key

!rm -rf ./*

In [None]:
def get_q(year, code, question):
    """Fetch one translated exam question from the GPT-4 translation CSV.

    Reads ``{year}_{code}_model4.csv`` from the ``taiwan-112-translated``
    Kaggle input and returns the cell at row ``question - 1``, column index 7
    (presumably the ``message_content`` column written by the translation
    loop -- verify against the dataset).

    Args:
        year: Exam year used in the file name (e.g. 112).
        code: Subject code used in the file name (e.g. 1301).
        question: 1-based question number.

    Returns:
        The value stored at column index 7 of the requested row.

    Raises:
        IndexError: If ``question`` is smaller than 1 or larger than the
            number of rows in the file.
    """
    translated = pd.read_csv(f'/kaggle/input/taiwan-112-translated/{year}_{code}_model4.csv')
    row_count = len(translated)

    # Guard clause: question numbers are 1-based and must fall inside the file.
    if question < 1 or question > row_count:
        raise IndexError(f"Question index out of range. Please enter a value between 1 and {row_count}.")

    return translated.iloc[question - 1, 7]

def get_a(year, code, question):
    """Fetch the previously recorded answer for one exam question.

    Reads ``{year}_{code}_4_p2_en.csv`` from the ``taiwan-112-answered``
    Kaggle input and returns the cell at row ``question - 1``, column index 7.

    Args:
        year: Exam year used in the file name.
        code: Subject code used in the file name.
        question: 1-based question number.

    Returns:
        The value stored at column index 7 of the requested row.

    Raises:
        IndexError: If ``question`` is outside ``1..number of rows``.
    """
    answered = pd.read_csv(f'/kaggle/input/taiwan-112-answered/{year}_{code}_4_p2_en.csv')
    total_rows = len(answered)

    too_large = question > total_rows
    if too_large or question < 1:
        raise IndexError(f"Question index out of range. Please enter a value between 1 and {total_rows}.")

    # Convert the 1-based question number to a 0-based row position.
    row_position = question - 1
    return answered.iloc[row_position, 7]

def gpt_ans(year, code, question, model):
    """Ask the chat model to re-check its earlier answer to one question.

    The conversation sent to the API has four messages: a system persona, the
    question text from ``get_q``, the earlier answer from ``get_a`` replayed as
    an assistant turn, and a user request to review that answer.

    Args:
        year: Exam year, forwarded to ``get_q`` / ``get_a``.
        code: Subject code, forwarded to ``get_q`` / ``get_a``.
        question: 1-based question number.
        model: ``4`` selects "gpt-4", ``3.5`` selects "gpt-3.5-turbo".

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns for the request.

    Raises:
        ValueError: If ``model`` is neither 4 nor 3.5.
    """
    # Resolve the numeric shorthand to an API model name (checked in order).
    supported = ((4, "gpt-4"), (3.5, "gpt-3.5-turbo"))
    model_name = next((name for key, name in supported if model == key), None)
    if model_name is None:
        raise ValueError("Invalid model value. Must be either 4 or 3.5")

    question_text = get_q(year, code, question)
    earlier_answer = get_a(year, code, question)
    review_request = "Please carefully review the question and your previous response for accuracy in accordance with medical knowledge, logical consistency, and proper interpretation. You can stand by your original answer or change the answer if you find any inconsistencies based on your training data. Update the consistency score on a scale of 1 to 10. Avoid repeating existing information from the question or prior response to save tokens."

    conversation = [
        {"role": "system", "content": "You are a medical doctor."},
        {"role": "user", "content": question_text},
        {"role": "assistant", "content": earlier_answer},
        {"role": "user", "content": review_request},
    ]

    # temperature=0 with top_p=1 and zero penalties, as in every call in this notebook.
    return openai.ChatCompletion.create(
        model=model_name,
        messages=conversation,
        max_tokens=2048,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

def gpt_ans_loop(year, code, q_start, q_fin, model):
    """Run ``gpt_ans`` over a range of questions and save the responses to CSV.

    For every question from ``q_start`` to ``q_fin`` (inclusive) the API
    response is flattened into one row. Whatever was collected is always
    written to ``{year}_{code}_{q_start}_{last}_{model}_ec.csv`` in the working
    directory, where ``last`` is the last question answered, and the estimated
    cost is printed via ``estimate_cost``.

    Error handling:
        * ``IndexError`` (question number past the end of the file) stops the
          loop quietly, as before.
        * Any other ``Exception`` (e.g. an API error) also stops the loop, but
          is now reported instead of being silently discarded. Previously a
          ``return`` inside ``finally`` swallowed every exception.
        * ``KeyboardInterrupt`` and other non-``Exception`` errors propagate
          after the partial results have been saved.

    Args:
        year: Exam year.
        code: Subject code.
        q_start: First question number (1-based, inclusive).
        q_fin: Last question number (inclusive).
        model: ``4`` or ``3.5``; forwarded to ``gpt_ans`` and ``estimate_cost``.

    Returns:
        pandas.DataFrame with one row per answered question.
    """
    # Column order is fixed explicitly so an empty result still has every column.
    columns = [
        'year', 'code', 'question_no', 'id', 'created', 'model', 'object',
        'message_content', 'message_role', 'finish_reason', 'index',
        'completion_tokens', 'prompt_tokens', 'total_tokens',
    ]
    rows = []

    last_answered_q = q_start - 1
    try:
        for question in range(q_start, q_fin + 1):
            response = gpt_ans(year, code, question, model)
            choice = response['choices'][0]
            usage = response['usage']

            # Build the row in one step so a missing key cannot leave
            # half-filled columns behind.
            rows.append({
                'year': year,
                'code': code,
                'question_no': question,
                'id': response['id'],
                'created': response['created'],
                'model': response['model'],
                'object': response['object'],
                'message_content': choice['message']['content'],
                'message_role': choice['message']['role'],
                'finish_reason': choice['finish_reason'],
                'index': choice['index'],
                'completion_tokens': usage['completion_tokens'],
                'prompt_tokens': usage['prompt_tokens'],
                'total_tokens': usage['total_tokens'],
            })

            print(f'Question {question} has been answered.')
            last_answered_q = question
    except IndexError:
        print("Question number out of range. Exporting the data up to the last valid question.")
    except Exception as exc:
        # Keep the best-effort behaviour (save what we have) but say what went wrong.
        print(f"Stopped after question {last_answered_q}: {type(exc).__name__}: {exc}. "
              "Exporting the data collected so far.")
    finally:
        # Runs on every exit path so partial results are never lost.
        df = pd.DataFrame(rows, columns=columns)
        filename = f'{year}_{code}_{q_start}_{last_answered_q}_{model}_ec.csv'
        df.to_csv(filename, index=False)
        print(f'Questions {q_start} to {last_answered_q} have been answered and saved as {filename}.')
        estimate_cost(df, model)

    # Returned outside ``finally`` so in-flight exceptions are not suppressed.
    return df


def estimate_cost(df, model):
    """Print the estimated API cost for the token counts in ``df``.

    Rates are hard-coded per 1,000 tokens: for model 4, $0.03 for prompt tokens
    and $0.06 for completion tokens; for model 3.5, $0.002 for all tokens.

    Args:
        df: DataFrame with ``prompt_tokens``, ``completion_tokens`` and
            ``total_tokens`` columns.
        model: ``4`` or ``3.5``.

    Raises:
        ValueError: If ``model`` is neither 4 nor 3.5. Previously this case
            failed with ``UnboundLocalError`` because no cost was computed.
    """
    if model == 4:
        cost_prompt = df['prompt_tokens'].sum() * 0.03 / 1000
        cost_completion = df['completion_tokens'].sum() * 0.06 / 1000
        total_cost = cost_prompt + cost_completion
    elif model == 3.5:
        total_cost = df['total_tokens'].sum() * 0.002 / 1000
    else:
        raise ValueError("Invalid model value. Must be either 4 or 3.5")

    print(f'Estimated API call cost: ${total_cost:.6f}')


In [None]:
# Review questions 37-100 of year 112, code 1301 with model 4 ("gpt-4").
# Makes one API call per question and writes the results to a CSV in the working directory.
gpt_ans_loop(112, 1301, 37, 100, 4)

In [None]:
# Spot check: display the raw API response for question 48 of year 112, code 1301 (model 4).
gpt_ans(112, 1301, 48, 4)

## Code for GPT translation

In [None]:
def get_q(year, code, question):
    """Fetch one untranslated exam question from the raw question CSV.

    Reads ``{year}_{code}.csv`` from the ``taiwan-112`` Kaggle input and
    returns the last column of row ``question - 1``.

    NOTE: this notebook defines ``get_q`` twice. Once this cell has run, this
    definition replaces the earlier one that reads the translated CSV, so
    ``gpt_ans`` will pick up this version as well.

    Args:
        year: Exam year used in the file name.
        code: Subject code used in the file name.
        question: 1-based question number.

    Returns:
        The value in the last column of the requested row.

    Raises:
        IndexError: If ``question`` is outside ``1..number of rows``.
    """
    raw_questions = pd.read_csv(f'/kaggle/input/taiwan-112/{year}_{code}.csv')
    n_rows = len(raw_questions)

    in_range = not (question > n_rows or question < 1)
    if in_range:
        # -1 selects the last column, whatever the file's width.
        return raw_questions.iloc[question - 1, -1]

    raise IndexError(f"Question index out of range. Please enter a value between 1 and {n_rows}.")

def gpt_trans(year, code, question, model):
    """Ask the chat model to translate one question from zh-tw to en-us.

    Args:
        year: Exam year, forwarded to ``get_q``.
        code: Subject code, forwarded to ``get_q``.
        question: 1-based question number.
        model: ``4`` selects "gpt-4", ``3.5`` selects "gpt-3.5-turbo".

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns for the request.

    Raises:
        ValueError: If ``model`` is neither 4 nor 3.5.
    """
    # Resolve the numeric shorthand to an API model name (checked in order).
    model_name = None
    for shorthand, api_name in ((4, "gpt-4"), (3.5, "gpt-3.5-turbo")):
        if model == shorthand:
            model_name = api_name
            break
    if model_name is None:
        raise ValueError("Invalid model value. Must be either 4 or 3.5")

    # The question text is only loaded after the model has been validated.
    source_text = get_q(year, code, question)
    system_message = {"role": "system", "content": "You are a professional medical translator."}
    user_message = {
        "role": "user",
        "content": f'translate the following question from zh-tw to en-us in proper medical terminology: {source_text}',
    }

    return openai.ChatCompletion.create(
        model=model_name,
        messages=[system_message, user_message],
        max_tokens=1024,
        temperature=0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )


def gpt_trans_loop(year, code, q_start, q_fin, model):
    """Run ``gpt_trans`` over a range of questions and save the responses to CSV.

    For every question from ``q_start`` to ``q_fin`` (inclusive) the API
    response is flattened into one row. Whatever was collected is always
    written to ``{year}_{code}_{q_start}_{last}_model{model}.csv`` in the
    working directory, where ``last`` is the last question translated, and the
    estimated cost is printed via ``estimate_cost``.

    Error handling:
        * ``IndexError`` (question number past the end of the file) stops the
          loop quietly, as before.
        * Any other ``Exception`` (e.g. an API error) also stops the loop, but
          is now reported instead of being silently discarded. Previously a
          ``return`` inside ``finally`` swallowed every exception.
        * ``KeyboardInterrupt`` and other non-``Exception`` errors propagate
          after the partial results have been saved.

    Args:
        year: Exam year.
        code: Subject code.
        q_start: First question number (1-based, inclusive).
        q_fin: Last question number (inclusive).
        model: ``4`` or ``3.5``; forwarded to ``gpt_trans`` and ``estimate_cost``.

    Returns:
        pandas.DataFrame with one row per translated question.
    """
    # Column order is fixed explicitly so an empty result still has every column.
    columns = [
        'year', 'code', 'question_no', 'id', 'created', 'model', 'object',
        'message_content', 'message_role', 'finish_reason', 'index',
        'completion_tokens', 'prompt_tokens', 'total_tokens',
    ]
    rows = []

    last_translated_q = q_start - 1
    try:
        for question in range(q_start, q_fin + 1):
            response = gpt_trans(year, code, question, model)
            choice = response['choices'][0]
            usage = response['usage']

            # Build the row in one step so a missing key cannot leave
            # half-filled columns behind.
            rows.append({
                'year': year,
                'code': code,
                'question_no': question,
                'id': response['id'],
                'created': response['created'],
                'model': response['model'],
                'object': response['object'],
                'message_content': choice['message']['content'],
                'message_role': choice['message']['role'],
                'finish_reason': choice['finish_reason'],
                'index': choice['index'],
                'completion_tokens': usage['completion_tokens'],
                'prompt_tokens': usage['prompt_tokens'],
                'total_tokens': usage['total_tokens'],
            })

            print(f'Question {question} has been translated.')
            last_translated_q = question
    except IndexError:
        print("Question number out of range. Exporting the data up to the last valid question.")
    except Exception as exc:
        # Keep the best-effort behaviour (save what we have) but say what went wrong.
        print(f"Stopped after question {last_translated_q}: {type(exc).__name__}: {exc}. "
              "Exporting the data collected so far.")
    finally:
        # Runs on every exit path so partial results are never lost.
        df = pd.DataFrame(rows, columns=columns)
        filename = f'{year}_{code}_{q_start}_{last_translated_q}_model{model}.csv'
        df.to_csv(filename, index=False)
        print(f'Questions {q_start} to {last_translated_q} have been translated and saved as {filename}.')
        estimate_cost(df, model)

    # Returned outside ``finally`` so in-flight exceptions are not suppressed.
    return df


def estimate_cost(df, model):
    """Print the estimated API cost for the token counts in ``df``.

    NOTE: this notebook defines ``estimate_cost`` twice; whichever cell ran
    last provides the definition in use.

    Rates are hard-coded per 1,000 tokens: for model 4, $0.03 for prompt tokens
    and $0.06 for completion tokens; for model 3.5, $0.002 for all tokens.

    Args:
        df: DataFrame with ``prompt_tokens``, ``completion_tokens`` and
            ``total_tokens`` columns.
        model: ``4`` or ``3.5``.

    Raises:
        ValueError: If ``model`` is neither 4 nor 3.5. Previously this case
            failed with ``UnboundLocalError`` because no cost was computed.
    """
    if model == 4:
        cost_prompt = df['prompt_tokens'].sum() * 0.03 / 1000
        cost_completion = df['completion_tokens'].sum() * 0.06 / 1000
        total_cost = cost_prompt + cost_completion
    elif model == 3.5:
        total_cost = df['total_tokens'].sum() * 0.002 / 1000
    else:
        raise ValueError("Invalid model value. Must be either 4 or 3.5")

    print(f'Estimated API call cost: ${total_cost:.6f}')


In [None]:
# Translate only question 75 of year 112, code 1301 with model 3.5 ("gpt-3.5-turbo")
# and write the result to a CSV in the working directory.
gpt_trans_loop(112, 1301, 75, 75, 3.5)

In [None]:
# Spot check: display the raw API response when translating question 75 with model 3.5.
gpt_trans(112, 1301, 75, 3.5)

In [None]:
# Spot check: display question 1 of year 112, code 1301 as returned by get_q.
get_q(112, 1301, 1)

## Code for checking translation

In [None]:
def compare(year, code, question):
    """Print a question next to its GPT-3.5 and GPT-4 translations.

    Three CSV files are loaded: the original questions and the two translation
    files (``_model3.5`` and ``_model4``). The original is matched on its third
    column; the translations are matched on their first three columns (year,
    code, question). For each file the matching text is printed, or a
    "no rows match" message when nothing matches.

    Args:
        year: Exam year (part of the file names and a match key).
        code: Subject code (part of the file names and a match key).
        question: Question number to look up.

    Returns:
        None. Returns early, after printing a message, when a file cannot be
        decoded as UTF-8.
    """
    stem = str(year) + "_" + str(code)
    original_path = "/kaggle/input/taiwan-112/" + stem + ".csv"
    gpt35_path = "/kaggle/input/taiwan-112-translated/" + stem + "_model3.5.csv"
    gpt4_path = "/kaggle/input/taiwan-112-translated/" + stem + "_model4.csv"

    # All three files are loaded before anything is printed.
    try:
        original = pd.read_csv(original_path, encoding='utf-8')
    except UnicodeDecodeError:
        print("Could not read the first file with the tried encoding.")
        return

    try:
        gpt35 = pd.read_csv(gpt35_path, encoding='utf-8')
        gpt4 = pd.read_csv(gpt4_path, encoding='utf-8')
    except UnicodeDecodeError:
        print("Could not read the second or third file with the tried encoding.")
        return

    # Original file: match on the third column, print the fourth.
    original_hits = original[original.iloc[:, 2] == question]
    if len(original_hits):
        print("ORIGINAL QUESTION: " + original_hits.iloc[0, 3])
    else:
        print("No rows match the given question in the first file.")
    print()

    # Translation files share one layout: match on (year, code, question),
    # print column index 7.
    translations = (
        (gpt35, "GPT-3.5 TRANSLATION: ",
         "No rows match the given year, code, and question in the second file."),
        (gpt4, "GPT-4   TRANSLATION: ",
         "No rows match the given year, code, and question in the third file."),
    )
    for table, label, not_found in translations:
        wanted = (
            (table.iloc[:, 0] == year)
            & (table.iloc[:, 1] == code)
            & (table.iloc[:, 2] == question)
        )
        hits = table[wanted]
        if len(hits):
            print(label + hits.iloc[0, 7])
        else:
            print(not_found)
        print()

In [None]:
# Spot check: print question 4 of year 112, code 2302 alongside its GPT-3.5 and GPT-4 translations.
compare(112, 2302, 4)

## Answer Key for 112

In [None]:
# Answer keys, one list per exam, named ans_{year}_{code} so that get_ans can
# look them up by name. Position i holds the answer to question i + 1.
# A nested list means more than one option is accepted; a nested list with all
# four options is reported by get_ans as '送分'.

# Year 112, code 1301: 100 questions (question 66 accepts B or D).
ans_112_1301 = [
    'A', 'D', 'A', 'C', 'A', 'D', 'D', 'A', 'A', 'C', 
    'C', 'D', 'C', 'D', 'B', 'B', 'C', 'A', 'C', 'D',
    'A', 'D', 'D', 'C', 'D', 'C', 'D', 'C', 'D', 'C', 
    'C', 'A', 'D', 'A', 'C', 'A', 'B', 'C', 'A', 'B',
    'B', 'C', 'A', 'B', 'B', 'B', 'D', 'B', 'A', 'A', 
    'A', 'D', 'D', 'A', 'C', 'A', 'D', 'A', 'B', 'D',
    'A', 'D', 'A', 'D', 'C', ['B', 'D'], 'D', 'A', 'D', 'D',
    'A', 'C', 'C', 'B', 'D', 'D', 'D', 'B', 'B', 'D',
    'D', 'C', 'B', 'B', 'A', 'D', 'C', 'B', 'C', 'D', 
    'B', 'A', 'C', 'D', 'D', 'B', 'C', 'B', 'C', 'B'
]

# Year 112, code 1302: 80 questions (question 47 accepts every option).
ans_112_1302 = [
    'D', 'B', 'C', 'D', 'D', 'C', 'C', 'B', 'D', 'C',
    'A', 'C', 'A', 'D', 'C', 'A', 'C', 'A', 'C', 'B',
    'A', 'B', 'C', 'B', 'D', 'A', 'B', 'C', 'B', 'D',
    'A', 'C', 'A', 'C', 'D', 'D', 'D', 'B', 'C', 'C',
    'B', 'D', 'C', 'A', 'B', 'C', ['A', 'B', 'C', 'D'], 'C', 'C', 'B',
    'D', 'B', 'D', 'B', 'A', 'B', 'C', 'A', 'A', 'A', 
    'A', 'D', 'C', 'D', 'C', 'A', 'D', 'D', 'B', 'B', 
    'B', 'C', 'A', 'A', 'D', 'D', 'D', 'A', 'C', 'D'
]

# Year 112, code 2301: 100 questions (questions 36 and 69 accept every option).
ans_112_2301 = [
    'B', 'B', 'D', 'C', 'D', 'B', 'C', 'B', 'B', 'C',
    'B', 'A', 'B', 'C', 'B', 'A', 'D', 'A', 'B', 'C',
    'B', 'C', 'A', 'D', 'A', 'D', 'D', 'B', 'C', 'A',
    'B', 'A', 'D', 'C', 'C', ['A', 'B', 'C', 'D'], 'B', 'B', 'C', 'B',
    'A', 'D', 'B', 'D', 'A', 'B', 'B', 'D', 'C', 'A',
    'B', 'A', 'C', 'B', 'B', 'C', 'B', 'B', 'A', 'C',
    'A', 'C', 'C', 'A', 'D', 'D', 'B', 'B', ['A', 'B', 'C', 'D'], 'D',
    'A', 'A', 'A', 'A', 'A', 'A', 'D', 'C', 'D', 'D',
    'A', 'D', 'C', 'B', 'B', 'D', 'A', 'C', 'A', 'C',
    'A', 'D', 'C', 'D', 'B', 'B', 'D', 'D', 'D', 'B'
]

# Year 112, code 2302: 80 questions.
ans_112_2302 = [
    'B', 'A', 'C', 'D', 'B', 'B', 'C', 'D', 'B', 'B',
    'A', 'D', 'A', 'A', 'C', 'B', 'D', 'B', 'C', 'B',
    'B', 'C', 'D', 'D', 'B', 'A', 'C', 'C', 'B', 'B',
    'A', 'A', 'D', 'B', 'D', 'B', 'D', 'C', 'B', 'B',
    'B', 'B', 'C', 'A', 'B', 'C', 'A', 'D', 'D', 'B',
    'C', 'D', 'A', 'B', 'D', 'B', 'B', 'C', 'C', 'C',
    'C', 'C', 'D', 'C', 'C', 'A', 'C', 'C', 'A', 'B',
    'D', 'D', 'C', 'C', 'B', 'A', 'C', 'C', 'C', 'D'
]

# Year 112, code 3302: 80 questions.
ans_112_3302 = [
    'B', 'B', 'C', 'C', 'B', 'A', 'D', 'A', 'D', 'B',
    'D', 'D', 'D', 'D', 'A', 'B', 'C', 'B', 'B', 'A',
    'C', 'D', 'A', 'C', 'B', 'B', 'D', 'B', 'A', 'D',
    'D', 'D', 'A', 'B', 'D', 'D', 'C', 'B', 'A', 'B',
    'D', 'D', 'B', 'B', 'B', 'A', 'C', 'A', 'D', 'C',
    'C', 'C', 'D', 'C', 'B', 'D', 'B', 'A', 'C', 'A',
    'A', 'A', 'C', 'C', 'A', 'A', 'A', 'A', 'B', 'C',
    'D', 'C', 'B', 'B', 'B', 'D', 'B', 'A', 'B', 'C'
]

# Year 112, code 4302: 80 questions.
ans_112_4302 = [
    'A', 'D', 'B', 'B', 'C', 'C', 'D', 'C', 'D', 'A', 
    'A', 'A', 'B', 'D', 'D', 'B', 'B', 'C', 'A', 'A', 
    'B', 'A', 'B', 'D', 'A', 'C', 'D', 'B', 'B', 'C', 
    'A', 'D', 'A', 'A', 'B', 'D', 'A', 'C', 'A', 'A', 
    'A', 'D', 'C', 'D', 'C', 'D', 'C', 'A', 'C', 'A', 
    'A', 'B', 'D', 'B', 'D', 'B', 'D', 'A', 'A', 'B', 
    'A', 'C', 'A', 'B', 'D', 'C', 'D', 'C', 'C', 'D', 
    'C', 'A', 'C', 'B', 'D', 'D', 'C', 'C', 'D', 'A'
]

def get_ans(year, code, question):
    """Return the official answer for one question as a display string.

    The answer key is the module-level list named ``ans_{year}_{code}``.

    Args:
        year: Exam year (e.g. 112).
        code: Subject code (e.g. 1301).
        question: 1-based question number.

    Returns:
        * the option letter for a single-answer question;
        * ``'X or Y'`` when several (but not all) options are accepted;
        * ``'送分'`` when all four options are accepted.

    Raises:
        KeyError: If no ``ans_{year}_{code}`` list is defined.
        IndexError: If ``question`` is outside ``1..len(answer key)``.
    """
    answers = globals()[f"ans_{year}_{code}"]

    # Without this check, question 0 or a negative number would wrap around
    # and silently return an answer from the end of the list.
    if not 1 <= question <= len(answers):
        raise IndexError(f"Question index out of range. Please enter a value between 1 and {len(answers)}.")

    ans = answers[question - 1]
    if isinstance(ans, list):
        if set(ans) == {'A', 'B', 'C', 'D'}:
            return '送分'
        return ' or '.join(ans)
    return ans

In [None]:
# Spot check: look up the answer to question 48 of year 112, code 1302.
get_ans(112,1302, 48)

## Code for Q bank parsing

In [None]:
import re
import pandas as pd

def parse_question_bank(text):
    """Split pasted exam text into one record per question.

    The year is the first run of digits followed by "年"; the subject code is
    the digits after "代 號:". Questions are recognised by "N." markers, where
    N must be exactly the next expected question number (1, 2, 3, ...).
    Text before the first question is discarded.

    Known limitation: a number inside the text that is followed by a dot and
    happens to equal the next expected question number is still treated as the
    start of that question.

    Args:
        text: Raw text of the exam paper.

    Returns:
        Tuple ``(year, subject_code, records)``. ``year`` and ``subject_code``
        are strings, or ``None`` when the header pattern is not found.
        ``records`` is a list of dicts with the keys 'Year', 'Subject Code',
        'Question Number' and 'Question and Options'.
    """
    # A plain \d+ captures the same digits as the former (\d+)+, without the
    # nested quantifier that can backtrack exponentially on long digit runs.
    year_match = re.search(r"(\d+)年", text)
    code_match = re.search(r"代 號:(\d+)", text)
    year = year_match.group(1) if year_match else None
    # Previously a missing subject code left the variable undefined (NameError).
    subject_code = code_match.group(1) if code_match else None

    records = []
    expected_number = 1
    # With a capturing group, re.split alternates text, number, text, number, ...
    # so odd positions are always the captured digits of an "N." marker.
    for position, part in enumerate(re.split(r"(\d+)\.", text)):
        is_number_token = position % 2 == 1
        if is_number_token and int(part) == expected_number:
            records.append({
                'Year': year,
                'Subject Code': subject_code,
                'Question Number': expected_number,
                'Question and Options': '',
            })
            expected_number += 1
        elif records:
            # A number that is not the next question belongs to the text
            # (e.g. "3.5 mg"); restore the dot that the split consumed.
            fragment = (part + ".") if is_number_token else part.strip()
            records[-1]['Question and Options'] += fragment
    return year, subject_code, records


# In a notebook kernel __name__ is "__main__", so this runs when the cell is
# executed; the guard only keeps parse_question_bank importable without prompting.
if __name__ == "__main__":
    text = input()

    year, subject_code, question_list = parse_question_bank(text)
    if year is None or subject_code is None:
        print("Warning: could not find the year and/or subject code in the text; "
              "the output file name will contain 'None'.")

    df = pd.DataFrame(question_list)
    display(df)

    file_name = f"{year}_{subject_code}.csv"
    df.to_csv(file_name, index=False)