# Import Libraries

In [None]:
!pip install transformers
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
import sys
import gc
import os
import subprocess
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import partial
from scipy.special import softmax
import torch
from torch import nn
from transformers import LlamaTokenizer,  AutoTokenizer, AutoModelForCausalLM
import pickle

import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Download Resources

In [None]:
# Download the test/train/valid CSVs of both ARC subsets into ./ARC/<subset>/.
repo_id = "dpquoc/ARC"

file_names = [
    'test.csv',
    'train.csv',
    'valid.csv',
]

for subset in ('ARC-Challenge', 'ARC-Easy'):
    # The destination is handed to wget through cwd= instead of os.chdir(), so the
    # kernel's working directory is never left changed if something raises midway.
    target_dir = os.path.join('ARC', subset)
    os.makedirs(target_dir, exist_ok=True)

    # Start every wget at once so the files of one subset download in parallel.
    processes = []
    for file_name in file_names:
        link = f'https://huggingface.co/datasets/{repo_id}/resolve/main/{subset}/{file_name}'
        command = ["wget", link]

        # wget's own output is discarded; failures are detected through the exit code below.
        processes.append(subprocess.Popen(command, cwd=target_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

    # Wait for all processes to complete and report every non-zero exit status
    # instead of letting a failed download surface later as a missing file.
    for process in processes:
        if process.wait() != 0:
            print(f"WARNING: download failed (wget exit code {process.returncode}): {process.args[-1]}")

    print("All downloads initiated.")

All downloads initiated.
All downloads initiated.


In [None]:
# Download the ARC CSVs that carry retrieved context into ./ARC-RAG/.
target_dir = 'ARC-RAG'
os.makedirs(target_dir, exist_ok=True)

file_names = [
    "ARC-C_Enhance-RAG.csv",
    "ARC-C_RAG.csv",
    "ARC-E_Enhance-RAG.csv",
    "ARC-E_RAG.csv"
]

repo_id = "dpquoc/ARC-RAG"

# Start every wget at once; cwd= selects the destination without changing the
# kernel's working directory (no os.chdir() to undo if something raises).
processes = []

for file_name in file_names:
    link = f'https://huggingface.co/datasets/{repo_id}/resolve/main/{file_name}'
    command = ["wget", link]

    # wget's own output is discarded; failures are detected through the exit code below.
    processes.append(subprocess.Popen(command, cwd=target_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

# Wait for all processes to complete and report every non-zero exit status
for process in processes:
    if process.wait() != 0:
        print(f"WARNING: download failed (wget exit code {process.returncode}): {process.args[-1]}")

print("All downloads initiated.")

All downloads initiated.


In [None]:
# Download the model and tokenizer files into ./Mistral-7B/.
target_dir = 'Mistral-7B'
os.makedirs(target_dir, exist_ok=True)

file_names = [
    ".gitattributes",
    "README.md",
    "config.json",
    "generation_config.json",
    "pytorch_model-00001-of-00003.bin",
    "pytorch_model-00002-of-00003.bin",
    "pytorch_model-00003-of-00003.bin",
    "pytorch_model.bin.index.json",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer.model",
    "tokenizer_config.json"
]

# Mirror repository holding the model files, used because they cannot be
# downloaded publicly straight from mistralai.
repo_id = "dpquoc/Mistral-7B-Instruct-v0.2"

# Start every wget at once; cwd= selects the destination without changing the
# kernel's working directory (no os.chdir() to undo if something raises).
processes = []

for file_name in file_names:
    link = f'https://huggingface.co/{repo_id}/resolve/main/{file_name}'
    command = ["wget", link]

    # wget's own output is discarded; failures are detected through the exit code below.
    processes.append(subprocess.Popen(command, cwd=target_dir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

# Wait for all processes to complete and report every non-zero exit status,
# so an incomplete checkpoint is noticed here rather than at model load time.
for process in processes:
    if process.wait() != 0:
        print(f"WARNING: download failed (wget exit code {process.returncode}): {process.args[-1]}")

print("All downloads initiated.")

# Load Model

In [None]:
# Directory the checkpoint is loaded from.
# NOTE(review): presumably the folder filled by the download cell when the
# notebook runs from /content — confirm the two paths agree.
model_name = '/content/Mistral-7B'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in bfloat16 and move the whole model onto the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [None]:
# Token ids of the answer letters A-D. The first encoded id is dropped
# (presumably the BOS token the tokenizer prepends — confirm).
encoded_letters = tokenizer.encode("A B C D")
answers_token_id = encoded_letters[1:]
answers_token_id

In [None]:
# Token ids of "yes" and "no", in that order. The first encoded id is dropped
# (presumably the BOS token the tokenizer prepends — confirm).
encoded_yes_no = tokenizer.encode("yes no")
yn_token_id = encoded_yes_no[1:]
yn_token_id

# 1. Baseline approach

In [None]:
# Load the ARC-Easy test split, drop the id column, replace missing cells with a
# single space and turn every column into strings.
# To evaluate ARC-Challenge instead, read "/content/ARC/ARC-Challenge/test.csv".
df = (
    pd.read_csv("/content/ARC/ARC-Easy/test.csv")
    .drop("id", axis=1)
    .fillna(' ')
    .astype(str)
)

# Debug switch: when enabled, only one row (picked by position) is kept.
get_sample = False
if get_sample:
    sample_index = 123
    df = df.iloc[[sample_index]]

# Text substituted into the prompt template: the question followed by its four options
df['instruction'] = (
    'Question: ' + df['question']
    + '\n\nA. ' + df['A']
    + '\n\nB. ' + df['B']
    + '\n\nC. ' + df['C']
    + ' \n\nD. ' + df['D']
)

In [None]:
# Show the first question of the frame with its options and gold answer.
sample = df.iloc[0]

print("Sample Question and Options:")
print("----------------------------\n")

print(f"Question: {sample['question']}")
for letter in "ABCD":
    print(f"{letter}. {sample[letter]}")
print(f"True Answer: {sample['answer']}")

Sample Question and Options:
----------------------------

Question: Which statement best explains why photosynthesis is the foundation of most food webs?
A. Sunlight is the source of energy for nearly all ecosystems.
B. Most ecosystems are found on land instead of in water.
C. Carbon dioxide is more available than other gases.
D. The producers in all ecosystems are plants.
True Answer: A


In [None]:
# Note: every instruction-tuned LLM has its own prompt format, which may differ from the
#       one used here. If results look wrong after switching to another LLM, adapt the
#       prompt to that model's instruction format.
# NOTE(review): the template spells out `<s>` while the tokenizer is called with its
#       default special-token handling — confirm the BOS token is not duplicated.

prompt = """
<s> [INST] Your task is to analyze the question and answer options below.
query [/INST] """

# Vocabulary ids of the letters A-D come from the tokenizer (computed in the
# "Get token ids" cell) instead of hard-coded indices, which are only valid for
# one specific vocabulary.
option_token_ids = list(answers_token_id)
assert len(option_token_ids) == 4, "expected exactly one token id per answer letter A-D"
option_labels = np.array(["A", "B", "C", "D"])

preds = []   # per question: the four letters, best first, joined by spaces
logits = []  # per question: scores of A, B, C, D for the first generated token

for _, row in tqdm(df.iterrows(), total=len(df)):
    inputs = tokenizer(prompt.replace('query', row['instruction']), return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Only one token is generated; its scores are all that is needed.
        output = model.generate(
            **inputs,
            max_new_tokens=1,
            return_dict_in_generate=True,
            output_scores=True,
        )

    # scores[0] belongs to the single generated position, [0] to the only sequence in the batch.
    first_token_logits = output.scores[0][0]
    option_logits = first_token_logits[option_token_ids].float().cpu().numpy()

    logits.append(option_logits)
    # Rank the letters from the highest to the lowest score.
    preds.append(' '.join(option_labels[np.argsort(option_logits)[::-1]]))

# Calculate probabilities over the four options using the stored logits
probs = [softmax(logit) for logit in logits]

100%|██████████| 2376/2376 [03:10<00:00, 12.47it/s]


In [None]:
# Keep each row's ranking string (letters joined by spaces, best first) next to its question.
df['pred_answer'] = preds

In [None]:
# Top-ranked letter of every ranking string (its first whitespace-separated token)
first_preds = [ranking.split()[0] for ranking in preds]

print("Sample Question Result:\n")
for caption, value in (
    ("True Answer", df['answer'].values[0]),
    ("Answer Prediction", first_preds[0]),
    ("Logits", logits[0]),
    ("Probabilities", probs[0]),
):
    print(f"{caption}: {value}")

Sample Question Result:

True Answer: A
Answer Prediction: A
Logits: [21.375 20.625 20.5   20.875]
Probabilities: [0.40067968 0.18926767 0.16702813 0.24302451]


In [None]:
# CALCULATE ACCURACY OF PREDICTION

# Put predictions and gold answers on the same fresh positional index so the
# element-wise comparison lines up row by row.
first_preds_series = pd.Series(first_preds, index=df.index).reset_index(drop=True)
answers_series = df['answer'].reset_index(drop=True)

# Number of rows whose top-ranked letter equals the gold answer
is_correct = first_preds_series == answers_series
correct_preds = is_correct.sum()

# Share of correct rows over the whole frame
accuracy = correct_preds / len(df)

print(f'Accuracy: {accuracy * 100:.2f}%')

# 2. Baseline approach + RAG

In [None]:
# ARC + Context from RAG
# df = pd.read_csv("/content/ARC-RAG/ARC-E_RAG.csv")
df = pd.read_csv("/content/ARC-RAG/ARC-C_RAG.csv")

# # ARC + Context from Enhanced-query RAG
# df = pd.read_csv("/content/ARC-RAG/ARC-E_Enhance-RAG.csv")
# df = pd.read_csv("/content/ARC-RAG/ARC-C_Enhance-RAG.csv")

# Replace missing cells with a single space and turn every column into strings.
df = df.fillna(' ').astype(str)

# Debug switch: when enabled, only one row (picked by position) is kept.
get_sample = False
if get_sample:
    sample_index = 312
    df = df.iloc[[sample_index]]

# Maximum number of context characters copied into the prompt
MAX_CONTEXT = 3000

# Text substituted into the prompt template: the truncated context between ####
# delimiters, then the question followed by its four options
df['instruction'] = (
    '\n####\nContext: ' + df['context'].str[:MAX_CONTEXT]
    + '\n####\n\nQuestion: ' + df['question']
    + '\n\nA. ' + df['A']
    + '\n\nB. ' + df['B']
    + '\n\nC. ' + df['C']
    + ' \n\nD. ' + df['D']
)

In [None]:
# Show the first question of the frame with its options and gold answer.
sample = df.iloc[0]

print("Sample Question and Options:")
print("----------------------------\n")

print(f"Question: {sample['question']}")
for letter in "ABCD":
    print(f"{letter}. {sample[letter]}")
print(f"True Answer: {sample['answer']}")

Sample Question and Options:
----------------------------

Question: Energy appears in many forms. What form of energy is lightning?
A. electrical energy
B. mechanical energy
C. magnetic energy
D. sound energy
True Answer: A


In [None]:
# NOTE(review): the template spells out `<s>` while the tokenizer is called with its
#       default special-token handling — confirm the BOS token is not duplicated.
prompt = """
<s> [INST] Your task is to analyze the question and answer options below. As a potential aid to your answer, background context from Wikipedia articles is at your disposal, delimited by #### , even if they might not always be relevant.
query [/INST] """

# Vocabulary ids of the letters A-D come from the tokenizer (computed in the
# "Get token ids" cell) instead of hard-coded indices, which are only valid for
# one specific vocabulary.
option_token_ids = list(answers_token_id)
assert len(option_token_ids) == 4, "expected exactly one token id per answer letter A-D"
option_labels = np.array(["A", "B", "C", "D"])

preds = []   # per question: the four letters, best first, joined by spaces
logits = []  # per question: scores of A, B, C, D for the first generated token

for _, row in tqdm(df.iterrows(), total=len(df)):

    inputs = tokenizer(prompt.replace('query', row['instruction']), return_tensors="pt").to(model.device)
    with torch.no_grad():
        # Only one token is generated; its scores are all that is needed.
        output = model.generate(
            **inputs,
            max_new_tokens=1,
            return_dict_in_generate=True,
            output_scores=True,
        )

    # Release the prompt tensors and cached GPU blocks after every question.
    del inputs
    gc.collect()
    torch.cuda.empty_cache()

    # scores[0] belongs to the single generated position, [0] to the only sequence in the batch.
    first_token_logits = output.scores[0][0]
    option_logits = first_token_logits[option_token_ids].float().cpu().numpy()

    logits.append(option_logits)
    # Rank the letters from the highest to the lowest score.
    preds.append(' '.join(option_labels[np.argsort(option_logits)[::-1]]))

# Calculate probabilities over the four options using the stored logits
probs = [softmax(logit) for logit in logits]

 91%|█████████▏| 1070/1172 [07:03<00:40,  2.49it/s]

In [None]:
# Keep each row's ranking string (letters joined by spaces, best first) next to its question.
df['pred_answer'] = preds

In [None]:
# Top-ranked letter of every ranking string (its first whitespace-separated token)
first_preds = [ranking.split()[0] for ranking in preds]

print("Sample Question Result:\n")
for caption, value in (
    ("True Answer", df['answer'].values[0]),
    ("Answer Prediction", first_preds[0]),
    ("Logits", logits[0]),
    ("Probabilities", probs[0]),
):
    print(f"{caption}: {value}")

Sample Question Result:

True Answer: A
Answer Prediction: A
Logits: [10.1875  -5.6875  -4.5     -3.71875]
Probabilities: [9.9999845e-01 1.2751886e-07 4.1811822e-07 9.1325427e-07]


In [None]:
# CALCULATE ACCURACY OF PREDICTION

# Predictions are given the frame's own index, so they compare row by row
# against the gold answers.
first_preds_series = pd.Series(first_preds, index=df.index)
answers_series = df['answer']

# Number of rows whose top-ranked letter equals the gold answer
is_correct = first_preds_series == answers_series
correct_preds = is_correct.sum()

# Share of correct rows over the whole frame
accuracy = correct_preds / len(df)

print(f'Accuracy: {accuracy * 100:.2f}%')


# 3. STL approach

In [None]:
# Load the ARC-Easy test split, drop the id column, replace missing cells with a
# single space and turn every column into strings.
# To evaluate ARC-Challenge instead, read "/content/ARC/ARC-Challenge/test.csv".
df = (
    pd.read_csv("/content/ARC/ARC-Easy/test.csv")
    .drop("id", axis=1)
    .fillna(' ')
    .astype(str)
)

# Debug switch: when enabled, only one row (picked by position) is kept.
get_sample = False
if get_sample:
    sample_index = 223
    df = df.iloc[[sample_index]]


In [None]:
# Show the first question of the frame with its options and gold answer.
sample = df.iloc[0]

print("Sample Question and Options:")
print("----------------------------\n")

print(f"Question: {sample['question']}")
for letter in "ABCD":
    print(f"{letter}. {sample[letter]}")
print(f"True Answer: {sample['answer']}")

Sample Question and Options:
----------------------------

Question: Which statement best explains why photosynthesis is the foundation of most food webs?
A. Sunlight is the source of energy for nearly all ecosystems.
B. Most ecosystems are found on land instead of in water.
C. Carbon dioxide is more available than other gases.
D. The producers in all ecosystems are plants.
True Answer: A


In [None]:
def get_prompts(row):
    """Build one yes/no verification prompt per answer option of a question.

    Args:
        row: mapping-like record indexed by 'question', 'A', 'B', 'C' and 'D'.

    Returns:
        A list of four prompt strings in the order A, B, C, D. Each one holds
        the instruction with the question, followed by that option's text and
        a "### Response:" marker.
    """
    template = """<s> [INST] Your task is to analyze the question and answer below. If the answer is correct, respond yes, if it is not correct respond no.
  {question} [/INST]"""
    question_block = f"\nQuestion: {row['question']}\nProposed answer: "

    # The filled-in instruction is the same for all four options, so render it once.
    header = template.format(question=question_block)

    return [f"{header}{row[letter]}\n\n### Response:\n" for letter in "ABCD"]

In [None]:
# Flatten to one list holding four consecutive prompts (A, B, C, D) per question,
# in row order. Iterating the rows directly replaces the no-op partial() wrapper
# and the apply(...).values round-trip, and also yields [] for an empty frame.
inputs = [option_prompt for _, row in df.iterrows() for option_prompt in get_prompts(row)]

In [None]:
# Pad batches on the left (presumably so every prompt ends at the last position
# and the first generated token directly follows it — confirm).
tokenizer.padding_side = "left"

# No separate padding token is configured: reuse the EOS token as PAD on both
# the tokenizer and the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [None]:
batch_size = 4
yes_logits = []  # one score per prompt: the "yes" logit of the first generated token

# Vocabulary id of "yes", taken from the tokenizer (computed in the "Get token ids"
# cell) instead of a hard-coded index that is only valid for one vocabulary.
assert len(yn_token_id) == 2, "expected exactly one token id each for 'yes' and 'no'"
yes_token_id = yn_token_id[0]

with torch.no_grad():
    for i in range(0, len(inputs), batch_size):
        batch = inputs[i:i+batch_size]
        batch_tokens = tokenizer(batch, return_tensors="pt", return_attention_mask=True, padding=True).to(model.device)

        # Only one token is generated; its scores are all that is needed.
        batch_outputs = model.generate(
            **batch_tokens,
            max_new_tokens=1,
            return_dict_in_generate=True,
            output_scores=True,
        )

        # scores[0] holds one row of vocabulary scores per prompt in the batch.
        first_token_logits = batch_outputs.scores[0]

        # Take the "yes" column for the whole batch at once; converting through
        # tolist() yields plain Python floats without calling float() on an array.
        yes_logits.extend(first_token_logits[:, yes_token_id].float().cpu().tolist())

        del batch
        del batch_tokens
        del batch_outputs

        # Call the garbage collector and release cached GPU memory
        gc.collect()
        torch.cuda.empty_cache()


  yes_logits.append(float(scores[[4874]].float().cpu().numpy())) # yes, no


In [None]:
# `yes_logits` holds four consecutive values per question (options A-D):
# reshape it to one row per question and one column per option.
output_np = np.array(yes_logits)
reshaped_output = output_np.reshape(-1, 4)

# Row-wise softmax over the four options. Passing axis=1 avoids a Python-level
# call per row and also works when there are no rows at all.
probs_output = softmax(reshaped_output, axis=1)

In [None]:
# Letters matching the four columns of `reshaped_output`
labels = np.array(["A", "B", "C", "D"])

# For every question: order the options from the highest to the lowest score,
# keep the top 4 and join their letters into one space-separated string.
preds = [
    ' '.join(labels[np.argsort(option_scores)[::-1][:4]])
    for option_scores in reshaped_output
]

In [None]:
# Top-ranked letter of every ranking string (its first whitespace-separated token)
first_preds = [ranking.split()[0] for ranking in preds]

In [None]:
# Show gold answer, prediction, raw "yes" scores and probabilities of the first question.
print("Sample Question Result:\n")
for caption, value in (
    ("True Answer", df['answer'].values[0]),
    ("Answer Prediction", first_preds[0]),
    ("Yes Logits", reshaped_output[0]),
    ("Probabilities", probs_output[0]),
):
    print(f"{caption}: {value}")

Sample Question Result:

True Answer: A
Answer Prediction: A
Yes Logits: [13.375   7.8125 10.3125 12.4375]
Probabilities: [0.69337769 0.00266199 0.03242971 0.27153061]


In [None]:
# CALCULATE ACCURACY OF PREDICTION

# Predictions are given the frame's own index, so they compare row by row
# against the gold answers.
first_preds_series = pd.Series(first_preds, index=df.index)
answers_series = df['answer']

# Number of rows whose top-ranked letter equals the gold answer
is_correct = first_preds_series == answers_series
correct_preds = is_correct.sum()

# Share of correct rows over the whole frame
accuracy = correct_preds / len(df)

print(f'Accuracy: {accuracy * 100:.2f}%')


In [None]:
# Keep each row's ranking string (letters joined by spaces, best first) next to its question.
df['pred_answer'] = preds

# 4. STL approach + RAG

In [None]:
# ARC + Context from RAG
df = pd.read_csv("/content/ARC-RAG/ARC-E_RAG.csv")
# df = pd.read_csv("/content/ARC-RAG/ARC-C_RAG.csv")

# # ARC + Context from Enhanced-query RAG
# df = pd.read_csv("/content/ARC-RAG/ARC-E_Enhance-RAG.csv")
# df = pd.read_csv("/content/ARC-RAG/ARC-C_Enhance-RAG.csv")

# Replace missing cells with a single space and turn every column into strings.
df = df.fillna(' ').astype(str)

# Debug switch: when enabled, only one row (picked by position) is kept.
# It stays off by default, like in the other three sections, so the accuracy
# below is computed on the whole split rather than on a single question.
get_sample = False
if get_sample:
    sample_index = 312
    df = df.iloc[[sample_index]]

# Maximum number of context characters copied into each prompt
MAX_CONTEXT = 3000

In [None]:
# Show the first question of the frame with its options and gold answer.
sample = df.iloc[0]

print("Sample Question and Options:")
print("----------------------------\n")

print(f"Question: {sample['question']}")
for letter in "ABCD":
    print(f"{letter}. {sample[letter]}")
print(f"True Answer: {sample['answer']}")

Sample Question and Options:
----------------------------

Question: Energy appears in many forms. What form of energy is lightning?
A. electrical energy
B. mechanical energy
C. magnetic energy
D. sound energy
True Answer: A


In [None]:
def get_prompts(row):
    """Build one context-augmented yes/no verification prompt per answer option.

    Args:
        row: mapping-like record indexed by 'context', 'question', 'A', 'B',
            'C' and 'D'. The context is cut to the first MAX_CONTEXT characters
            (MAX_CONTEXT is read from module scope).

    Returns:
        A list of four prompt strings in the order A, B, C, D. Each one holds
        the instruction with the delimited context and the question, followed
        by that option's text and a "Response:" marker.
    """
    template = """<s> [INST] Your task is to analyze the question and answer below. If the proposed answer is correct then respond yes, if it is wrong then respond no. As a potential aid to your answer, background context from Wikipedia articles is at your disposal, delimited by #### , even if they might not always be relevant.
  {question} [/INST]"""
    question_block = f"\n####\nContext: {row['context'][:MAX_CONTEXT]}.\n####\nQuestion: {row['question']}\nProposed answer: "

    # The filled-in instruction is the same for all four options, so render it once.
    header = template.format(question=question_block)

    return [f"{header}{row[letter]}\n\n Response:" for letter in "ABCD"]



In [None]:
# Flatten to one list holding four consecutive prompts (A, B, C, D) per question,
# in row order. Iterating the rows directly replaces the no-op partial() wrapper
# and the apply(...).values round-trip, and also yields [] for an empty frame.
inputs = [option_prompt for _, row in df.iterrows() for option_prompt in get_prompts(row)]

In [None]:
# Pad batches on the left (presumably so every prompt ends at the last position
# and the first generated token directly follows it — confirm).
tokenizer.padding_side = "left"

# No separate padding token is configured: reuse the EOS token as PAD on both
# the tokenizer and the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id


In [None]:
# Score every prepared prompt in `inputs` (four per question, options A-D).

batch_size = 1
yes_logits = []  # one score per prompt: the "yes" logit of the first generated token

# Vocabulary id of "yes", taken from the tokenizer (computed in the "Get token ids"
# cell) instead of a hard-coded index that is only valid for one vocabulary.
assert len(yn_token_id) == 2, "expected exactly one token id each for 'yes' and 'no'"
yes_token_id = yn_token_id[0]

with torch.no_grad():
    with tqdm(total=len(inputs)) as pbar:  # progress counted in prompts
        for i in range(0, len(inputs), batch_size):
            batch = inputs[i:i+batch_size]
            n_prompts = len(batch)
            batch_tokens = tokenizer(batch, return_tensors="pt", return_attention_mask=True, padding=True).to(model.device)

            # Only one token is generated; its scores are all that is needed.
            batch_outputs = model.generate(
                **batch_tokens,
                max_new_tokens=1,
                return_dict_in_generate=True,
                output_scores=True,
            )

            # scores[0] holds one row of vocabulary scores per prompt in the batch.
            first_token_logits = batch_outputs.scores[0]

            # Take the "yes" column for the whole batch at once; converting through
            # tolist() yields plain Python floats without calling float() on an array.
            yes_logits.extend(first_token_logits[:, yes_token_id].float().cpu().tolist())

            del batch
            del batch_tokens
            del batch_outputs

            # Call the garbage collector and release cached GPU memory
            gc.collect()
            torch.cuda.empty_cache()

            # Advance by the real batch length so a shorter final batch does not overshoot.
            pbar.update(n_prompts)

  yes_logits.append(float(scores[[5081]].float().cpu().numpy()))  # yes, no
100%|██████████| 4/4 [00:02<00:00,  1.52it/s]


In [None]:
# Run Python's garbage collector, then ask PyTorch to release its cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# `yes_logits` holds four consecutive values per question (options A-D):
# reshape it to one row per question and one column per option.
output_np = np.array(yes_logits)
reshaped_output = output_np.reshape(-1, 4)

# Row-wise softmax over the four options. Passing axis=1 avoids a Python-level
# call per row and also works when there are no rows at all.
probs_output = softmax(reshaped_output, axis=1)

In [None]:
# Letters matching the four columns of `reshaped_output`
labels = np.array(["A", "B", "C", "D"])

# For every question: order the options from the highest to the lowest score,
# keep the top 3 and join their letters into one space-separated string.
preds = [
    ' '.join(labels[np.argsort(option_scores)[::-1][:3]])
    for option_scores in reshaped_output
]

In [None]:
# Top-ranked letter of every ranking string (its first whitespace-separated token)
first_preds = [ranking.split()[0] for ranking in preds]

print("Sample Question Result:\n")
for caption, value in (
    ("True Answer", df['answer'].values[0]),
    ("Answer Prediction", first_preds[0]),
    ("Logits", reshaped_output[0]),
    ("Probabilities", probs_output[0]),
):
    print(f"{caption}: {value}")

Sample Question Result:

True Answer: A
Answer Prediction: A
Logits: [18.25        0.42773438  0.5859375   0.39453125]
Probabilities: [9.99999943e-01 1.81923314e-08 2.13105718e-08 1.75982071e-08]


In [None]:
# CALCULATE ACCURACY OF PREDICTION

# Put predictions and gold answers on the same fresh positional index so the
# element-wise comparison lines up row by row.
first_preds_series = pd.Series(first_preds, index=df.index).reset_index(drop=True)
answers_series = df['answer'].reset_index(drop=True)

# Number of rows whose top-ranked letter equals the gold answer
is_correct = first_preds_series == answers_series
correct_preds = is_correct.sum()

# Share of correct rows over the whole frame
accuracy = correct_preds / len(df)

print(f'Accuracy: {accuracy * 100:.2f}%')
