In [1]:
import pandas as pd
# Question set to be answered; each row is later formatted into the prompt
# and sent through the LLM chain.
df = pd.read_csv(filepath_or_buffer="submit.csv")

In [2]:
import os
from dotenv import load_dotenv

from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

# Load environment variables from a local .env file
# (presumably the Google API credentials the client needs -- TODO confirm).
load_dotenv()

# Gemini chat client with a low sampling temperature. Blocking is switched
# off (BLOCK_NONE) for each harm category listed below.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.1,
    safety_settings={
        category: HarmBlockThreshold.BLOCK_NONE
        for category in (
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
        )
    },
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Quick sanity check on the loaded question set: row count and number of
# distinct values in the "task" column (missing values count as one value).
print(f'Number of questions: {df.shape[0]}')
print(f'Number of unique tasks: {df["task"].nunique(dropna=False)}')

Number of questions: 2061
Number of unique tasks: 10


In [4]:
from langchain.prompts.prompt import PromptTemplate

# Prompt variants tried so far. Each one used to be assigned to the same name
# (`template`), so only the last assignment ever took effect; they now have
# distinct names and the active one is selected explicitly at the bottom.
#
# All variants use the placeholders {task}, {input}, {A}, {B}, {C}, {D}.

# Chain-of-thought: analysis first, angle-bracketed letter at the very end.
TEMPLATE_COT_ANSWER_LAST = '''
You are a high school student you are well prepared for the SAT exam on {task}.

The following is a question that fits your subject:

{input}

A: {A}

B: {B}

C: {C}

D: {D}

In order to get the best score on the SAT exam, you must choose the best option that makes your score higher.
It's very important to your future.

You should think about the question step by step.
Show your work and demonstrate your understanding of the question starting with **Analysis:**.

At the end, choose the best option corresponding to the question and give the letter with angle brackets, i.e., <A>, <B>, <C>, or <D>.
'''

# Chain-of-thought: same idea, with the answer instruction folded into the
# closing motivation paragraph.
TEMPLATE_COT_ANSWER_INLINE = '''
You are a high school student you are well prepared for the SAT exam on {task}.

The following is a question that fits your subject:

{input}

A: {A}

B: {B}

C: {C}

D: {D}

You should think about the question step by step.
Show your work and demonstrate your understanding of the question starting with **Analysis:**.

In order to get the best score on the SAT exam, you must choose the best option that makes your score higher,
and give the letter with angle brackets, i.e., <A>, <B>, <C>, or <D>.
It's very important to your future.
'''

# Letter-only: no analysis requested, the reply should be a single letter.
# ("collge" in the original prompt text was a typo for "college".)
TEMPLATE_LETTER_ONLY_V1 = '''
You are a high school student you are well prepared for the college exam on {task}.

The following is a question that fits your subject:

{input}

A: {A}

B: {B}

C: {C}

D: {D}

In order to get the best score on the college exam, you must choose the best option that makes your score higher.
It's very important to your future, your family will be proud of you if you get a good score.

Choose the best option and just answer with only one letter.
'''

# Letter-only: motivation and answer instruction merged into the last lines.
TEMPLATE_LETTER_ONLY_V2 = '''
You are a high school student you are well prepared for the college exam on {task}.

The following is a question that fits your subject:

{input}

A: {A}

B: {B}

C: {C}

D: {D}

It's very important to your future, your family will be proud of you if you get a good score.
In order to get the best score on the college exam, you must choose the best option that makes your score higher, and then just answer with only the letter.
'''

# Active prompt: the same variant the original cell ended up with
# (its last assignment to `template`).
template = TEMPLATE_LETTER_ONLY_V2

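# --- Discarded prompt fragments from earlier experiments (comments only; not part of `template`) ---
# Choose the best option and just answer with only one letter.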

# As for answer, you should think about the question step by step.
# Show your work and demonstrate your understanding of the question starting with **Analysis:**.

# At the end, choose the best option and answer the letter with <A>, <B>, <C>, or <D>, begin with **Answer:**.



# I will give you an example question to help you understand the subject and the format of the exam.
# **Example:**
# {example}
# **End of example.**

# **Statement:**
# The answer is very important to your application.

# You should think about the question step by step.
# Show your work and demonstrate your understanding of the question starting with **Analysis:**.

# At the end, choose the best option corresponding to the question and give the letter with <A>, <B>, <C>, or <D>, begin with **Answer:**.

# answer the letter of the correct answer

# Choose the best option and just answer with only one letter.

# Choose the best option and answer the letter

# Wrap the active template string. The declared variables match the
# placeholders that actually appear in the template; "example" was dropped
# because no template contains an {example} placeholder and no caller
# supplies that key.
prompt = PromptTemplate(
    input_variables=["input", "task", "A", "B", "C", "D"],
    template=template,
)

# Pipe the formatted prompt into the chat model; `chain.invoke(mapping)`
# formats the prompt from the mapping and returns the model's message.
chain = prompt | llm

In [5]:
# Sample questions with known answers (the template below expects a
# "target" field alongside the question and the four options).
sample_df = pd.read_csv(filepath_or_buffer="mmlu_sample.csv")

# Layout for rendering one worked example: question, options, a placeholder
# analysis, and the answer letter in angle brackets.
example_template = '''
{input}

A: {A}

B: {B}

C: {C}

D: {D}  

**Analysis:**
Your thoughts here...

**Answer:**
<{target}>
'''

example_prompt = PromptTemplate(
    template=example_template,
    input_variables=["input", "target", "A", "B", "C", "D"],
)

# Number of distinct tasks covered by the sample file.
print(sample_df["task"].nunique(dropna=False))

In [6]:
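# NOTE: superseded version of the inference loop, kept for reference only.
# It parsed an angle-bracketed letter (e.g. <B>) out of the reply with a regex.
# from time import sleep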
# import re

# result = {'ID': [], 'target': []}

# for index, row in df.iterrows():
#     d = row.to_dict()
    
#     # task_df = sample_df["task"] == d['task']
#     # first_example = sample_df[task_df].iloc[0].to_dict()
    
#     d['task'] = d['task'].replace('_', ' ')    
    
#     try:
#         answer = chain.invoke(row.to_dict()).content
#     except:
#         ...
    
#     # final_answer = answer
    
#     try:
#         final_answer = re.findall(r'<([A-D|a-d])>', answer)[0].upper()
#     except:
#         print("ERROR: ", answer)
#         final_answer = "A"
    
#     print(row.iloc[0], final_answer)
    
#     result['ID'].append(row.iloc[0])
#     result['target'].append(final_answer)
#     sleep(1)

In [7]:
from time import sleep
import re

# Letter submitted when no option can be parsed from the model's reply
# (same default the earlier regex-based version of this loop used).
FALLBACK_CHOICE = "A"
# Attempts per question before an API error is allowed to propagate.
MAX_ATTEMPTS = 3


def _extract_choice(answer):
    """Pull the chosen option letter out of a model reply.

    Args:
        answer: Raw text returned by the model (may be empty or ``None``).

    Returns:
        The upper-cased letter ``"A"``-``"D"``, or ``None`` when the reply
        contains no recognisable option.
    """
    text = answer or ""
    # Preferred: the reply starts with the letter, optionally preceded by
    # whitespace or markup such as "<B>" or "**B**".
    leading = re.match(r'\W*([A-Da-d])\b', text)
    if leading:
        return leading.group(1).upper()
    # Otherwise accept the first standalone capital letter,
    # e.g. "The answer is C."
    standalone = re.search(r'\b([A-D])\b', text)
    return standalone.group(1) if standalone else None


result = {'ID': [], 'target': []}

for _, row in df.iterrows():
    question_id = row.iloc[0]  # the first column is used as the submission ID

    d = row.to_dict()
    # Task names contain underscores; present them as plain words in the prompt.
    d['task'] = d['task'].replace('_', ' ')

    # Send the *normalised* dict. The original passed row.to_dict() again,
    # which silently discarded the underscore replacement above.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            answer = chain.invoke(d).content
            break
        except Exception as exc:
            # Re-raise once the retry budget is spent so persistent failures
            # (bad credentials, quota) still stop the run loudly.
            if attempt == MAX_ATTEMPTS:
                raise
            print(f"RETRY {question_id} (attempt {attempt}/{MAX_ATTEMPTS}): {exc}")
            sleep(2 ** attempt)

    # answer[0] crashed on empty replies and accepted any character;
    # validate against A-D and fall back explicitly instead.
    final_answer = _extract_choice(answer)
    if final_answer is None:
        print("ERROR: ", answer)
        final_answer = FALLBACK_CHOICE

    print(question_id, answer)

    result['ID'].append(question_id)
    result['target'].append(final_answer)
    sleep(1)  # crude rate limiting between API calls


0 C
1 A
3 A
4 B
5 C
6 C
7 B
8 C
9 C
12 C
13 A
14 A
16 A
18 B
19 D
20 C
21 A
23 B
24 D
25 B
26 D
27 A
32 D
33 C
34 A
35 C
36 A
37 B
38 D
39 A
40 A
41 D
42 C
43 C
44 D
45 B
46 C
47 C
48 A
49 C
50 C
51 A
52 A
54 B
56 D
57 A
59 D
60 A
61 B
62 B
63 C
64 D
65 B
66 C
67 D
68 D
69 B
70 D
71 D
72 D
73 C
74 B
75 B
76 B
77 A
78 A
79 B
80 C
82 D
83 A
84 A
87 C
88 C
90 C
91 A
92 D
94 C
95 B
96 D
97 A
98 A
99 D
100 C
101 D
103 C
104 D
105 C
106 B
107 A
109 D
110 C
111 C
112 A
113 D
115 C
116 A
117 B
118 B
119 D
120 A
122 B
123 A
124 B
125 C
126 B
127 A
128 B
129 B
130 C
131 D
132 D
133 D
134 B
135 D
136 C
139 C
140 A
142 D
143 C
144 D
145 B
146 A
147 D
148 C
150 D
151 B
152 D
153 D
154 C
155 C
156 A
157 D
158 B
159 C
160 A
161 A
162 C
163 C
164 B
633 B
635 A
639 B
640 B
641 C
642 C
643 A
644 B
645 C
646 B
647 B
649 D
651 C
652 B
653 A
654 D
660 D
661 C
662 A
663 D
665 C
666 A
667 A
668 A
669 B
670 A
671 D
673 B
674 C
675 D
676 A
677 C
679 A
680 C
681 D
683 D
684 D
685 C
687 A
688 C
689 D
690 A
692 B

In [8]:
# Turn the collected answers into a two-column table (ID, target) and
# write it out without the pandas index.
result_df = pd.DataFrame(data=result, columns=['ID', 'target'])
result_df.to_csv(path_or_buf="answer.csv", index=False)

print("Dumped to answer.csv")

Dumped to answer.csv
