### Setting Environment

In [None]:
! pip install pandas
! pip install openai
! pip install langchain

In [1]:
import os
import json
import openai
import pandas as pd
import csv
import re
import time

In [2]:
# Read the OpenAI key from a local text file and expose it both to the
# openai module and through the OPENAI_API_KEY environment variable.
key_path = 'openai_api_key.txt'
with open(key_path, 'r') as key_file:
    api_key = key_file.read().strip()

openai.api_key = api_key
os.environ["OPENAI_API_KEY"] = api_key
# os.environ.get("OPENAI_API_KEY")  # uncomment to check which key is set

# Uncomment to list the GPT models available to this key:
# models = dict(openai.Model.list())
# for i in models['data']:
#     if i['id'].startswith('gpt'):
#         print(i['id'])

## Preparing required files

In [None]:
# Read the curated scene list and preview its first five rows
scene_csv = '_output_scene/scene_curated_0831.csv'
df = pd.read_csv(scene_csv)
print(df.head())

In [3]:
# Load the three prompt templates (reasoning, query, question type) from disk
prompt_texts = []
for prompt_path in ('rprompt.txt', 'qprompt.txt', 'qtype_prompt.txt'):
    with open(prompt_path, 'r') as prompt_file:
        prompt_texts.append(prompt_file.read())

rprompt, qprompt, qt_prompt = prompt_texts

In [None]:
# Spot-check a slice of the question-type prompt (characters 450-599).
# The other prompts can be previewed the same way, e.g. rprompt[:50] or qprompt[:50].
qt_preview = qt_prompt[450:600]
print(qt_preview)

## Acquiring reasoning and queries from scene list

In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage

# Chat client used for every model call in this notebook; temperature 0
# keeps the completions as repeatable as possible.
chat = ChatOpenAI(temperature=0, model="gpt-4-0613")

### First loop for reasoning and questioning from scene lists

In [None]:
# First loop: for every scene, ask the model for its reasoning and for a
# semicolon-separated list of queries.
intermediate_results = []
for _row_idx, row in df.iterrows():
    row_dict = row.to_dict()
    # The scene description is taken from the second column of the CSV.
    # Use .iloc for explicit positional access: plain row[1] on a
    # label-indexed Series relies on a deprecated positional fallback.
    SceneDescription = row.iloc[1]

    # Reasoning about the scene
    messages = [SystemMessage(content=f"'{rprompt}'"), HumanMessage(content=f"'{SceneDescription}'")]
    reasoning = chat(messages).content

    # Queries about the scene; the reply is split on ';' into single queries
    messages = [SystemMessage(content=f"'{qprompt}'"), HumanMessage(content=f"'{SceneDescription}'")]
    query = chat(messages)
    query_value = [s.strip() for s in query.content.split(';')]

    # Keep the original row next to the model output for the second loop
    intermediate_results.append({
        'row_dict': row_dict,
        'reasoning': reasoning,
        'query_value': query_value
    })

### Saving intermediate file

In [5]:
# Tag inserted into the name of every output file (intermediate and final)
FileName = "0831v1"

In [None]:
# Save intermediate_results to a JSON file so the second loop can start
# from disk instead of repeating the model calls of the first loop.
# Create the output directory first: with a missing folder open() would
# raise FileNotFoundError after all the API calls have already been made.
os.makedirs('_output_result', exist_ok=True)

with open(f'_output_result/intermediate_{FileName}.json', 'w') as json_file:
    json.dump(intermediate_results, json_file)

### Second loop for qtyping from the intermediate file

In [6]:
# Reload the intermediate results that were saved to disk
intermediate_path = f'_output_result/intermediate_{FileName}.json'
with open(intermediate_path, 'r') as saved:
    intermediate_json = json.loads(saved.read())

In [None]:
# Second loop, throttled to stay clear of RateLimitError:
# classify the question type of every query and assemble the final records.
output = []

paren_group = re.compile(r"\(([^)]+)\)")  # compiled once, reused for every reply
chunk_size = 5       # items processed between two pauses
pause_seconds = 10   # length of the pause between chunks

# Work through the intermediate results in chunks of five
for start in range(0, len(intermediate_json), chunk_size):
    chunk = intermediate_json[start:start + chunk_size]

    for item in chunk:
        row_dict = item['row_dict']
        reasoning = item['reasoning']
        query_value = item['query_value']

        # Ask the model for the question types of this scene's queries
        messages = [SystemMessage(content=f"'{qt_prompt}'"), HumanMessage(content=f"'{query_value}'")]
        qtype_str = chat(messages).content

        # Each parenthesised "number, label" group in the reply becomes one
        # pair. Split on the first comma only, so a label that itself
        # contains a comma stays intact, and skip groups without a comma:
        # either case would otherwise break the 2-tuple unpacking below and
        # abort the whole run.
        qtype = []
        for match in paren_group.findall(qtype_str):
            num_part, comma, text_part = match.partition(',')
            if not comma:
                continue
            qtype.append((num_part.strip().strip("'"), text_part.strip().strip("'")))

        # zip() below stops at the shorter sequence; make that visible
        if len(qtype) != len(query_value):
            print(
                f"Warning: scene {row_dict['scn_id']} has {len(query_value)} queries "
                f"but {len(qtype)} parsed question types; unmatched entries are dropped."
            )

        # Build the per-question dicts; keys carry a two-digit question index
        questions_list = []
        for idx, (query, (qtype_num, qtype_text)) in enumerate(zip(query_value, qtype), start=1):
            question_dict = {
                # Same two-digit padding as the keys (the hard-coded "Q0"
                # prefix produced "Q010" for the tenth question)
                f"qid{idx:02}": f"{row_dict['scn_id']}Q{idx:02}",
                f"question{idx:02}": query,
                f"q_type_num{idx:02}": qtype_num,
                f"q_type{idx:02}": qtype_text
            }
            questions_list.append(question_dict)

        # Final record for this scene
        output_dict = {
            "scn_id": row_dict["scn_id"],
            "scn_cls": row_dict["scn_cls"],
            "scn_sentence": row_dict["scn_sentence"],
            "reasoning": reasoning,
            "question": questions_list
        }
        output.append(output_dict)

    # Pause between chunks; nothing follows the last chunk, so skip it there
    if start + chunk_size < len(intermediate_json):
        time.sleep(pause_seconds)



## Saving files

### Saving output to JSON

In [11]:
# Persist the final records as a JSON file
result_json_path = f'_output_result/result_{FileName}.json'
with open(result_json_path, 'w') as out_file:
    out_file.write(json.dumps(output))

### Saving CSV with qid and qtypes of each question

In [17]:
# Flatten each record: keep the scene-level fields and lift the keys of every
# per-question dict to the top level, giving one wide row per scene.
transformed_output = []
for record in output:
    flat_row = {key: value for key, value in record.items() if key != 'question'}
    for question_fields in record['question']:
        flat_row.update(question_fields)
    transformed_output.append(flat_row)

# Build a DataFrame from the flattened rows and write it out as CSV
df_output = pd.DataFrame(transformed_output)
result_csv_path = f'_output_result/result_{FileName}.csv'
df_output.to_csv(result_csv_path, index=False)