In [2]:
!pip install pandas google-generativeai chardet tqdm


Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
   ---------------------------------------- 0.0/199.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/199.4 kB ? eta -:--:--
   ------------ -------------------------- 61.4/199.4 kB 825.8 kB/s eta 0:00:01
   ------------------------ --------------- 122.9/199.4 kB 1.0 MB/s eta 0:00:01
   ---------------------------------- ----- 174.1/199.4 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 199.4/199.4 kB 1.1 MB/s eta 0:00:00
Installing collected packages: chardet
Successfully installed chardet-5.2.0



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
import pandas as pd
import google.generativeai as genai
import time
import chardet
from tqdm import tqdm
import os 

In [37]:
with open('cti-mcq.tsv', 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"Detected encoding: {encoding}")

Detected encoding: MacRoman


In [44]:
df = pd.read_csv('cti-mcq.tsv', sep='\t', encoding=encoding)
# Read the API key from the file
def get_api_key(file_path):
    with open(file_path, 'r') as file:
        line = file.readline().strip()
        # Assuming the file contains only one line with the format KEY=value
        key, value = line.split('=', 1)
        return value

# Use the API key
api_key = get_api_key('api_key.txt')



In [45]:
# Define the start and end lines for processing
start_line = 594
end_line =601

# Select the subset of the DataFrame based on the start and end lines
df_limited = df.iloc[start_line:end_line]
genai.configure(api_key=api_key)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

# Path to the output TSV file
output_file = 'response3.tsv'

# Prepare to write to the TSV file
with open(output_file, 'a') as file:
    
    # Iterate over the limited dataset and get responses
    for index, row in tqdm(df_limited.iterrows(), total=df_limited.shape[0], desc="Processing Prompts"):
        prompt = row['Prompt']

        # Get response from the model
        response = model.generate_content(prompt, safety_settings="BLOCK_NONE")
        response_text = response.text.strip()

        # Write only the response to the file
        cleaned_response = response_text.replace("\n", " ").replace("\t", " ")
        file.write(f'{cleaned_response}\n')

       
        time.sleep(3)

print("TSV file created successfully.")


Processing Prompts:   0%|                                                                        | 0/7 [00:01<?, ?it/s]


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

In [30]:
import re
import pandas as pd

# Define a function to extract the first standalone uppercase letter A, B, C, or D
def extract_first_letter(content):
    if isinstance(content, str):
        # Use regex to find the first uppercase letter A, B, C, or D that is alone
        match = re.search(r'\b[A-D]\b', content)
        return match.group(0) if match else ''
    return ''

# Path to the existing TSV file with responses
input_file = 'response2.tsv'

# Attempt to load the responses from the TSV file with different encodings
encodings = ['utf-8', 'latin1', 'utf-16']  # List of encodings to try
df = None

for encoding in encodings:
    try:
        df = pd.read_csv(input_file, sep='\t', header=None, names=['response'], encoding=encoding)
        print(f"Successfully loaded with encoding: {encoding}")
        break
    except UnicodeDecodeError as e:
        print(f"Failed to load with encoding {encoding}: {e}")

if df is None:
    raise ValueError("Failed to load the file with all attempted encodings.")

# Prepare a list to hold the results
letters = []

# Extract the first standalone letter from each response
for content in df['response']:
    letter = extract_first_letter(content)
    letters.append(letter)

# Convert results to DataFrame
letters_df = pd.DataFrame({'first_letter': letters})

# Save to a new TSV file
output_file = 'first_letters_extracted2.tsv'
letters_df.to_csv(output_file, sep='\t', index=False)

print("First letters extracted and saved successfully!")


Successfully loaded with encoding: utf-8
First letters extracted and saved successfully!


In [34]:
import pandas as pd

# Load the first TSV file
file1 = 'first_letters_extracted.tsv'
df1 = pd.read_csv(file1, sep='\t')

# Load the second TSV file
file2 = 'first_letters_extracted2.tsv'
df2 = pd.read_csv(file2, sep='\t')

# Concatenate the two DataFrames
concatenated_df = pd.concat([df1, df2])

# Save the concatenated DataFrame to a new TSV file
output_file = 'Responses_file2.tsv'
concatenated_df.to_csv(output_file, sep='\t', index=False)

print(f"Concatenated file saved as {output_file}")


Concatenated file saved as Responses_file2.tsv


In [5]:
import pandas as pd

# Charger les datasets
responses_file_path = "Responses_file.tsv"
cti_responses_file_path = "cti-mcq-responses.tsv"

responses_df = pd.read_csv(responses_file_path, sep='\t')
cti_responses_df = pd.read_csv(cti_responses_file_path, sep='\t')

# Vérifier que les colonnes nécessaires existent
if 'first_letter' not in responses_df.columns:
    raise ValueError("La colonne 'first_letter' est absente du fichier Responses_file.tsv.")
if 'Gemini-1.5' not in cti_responses_df.columns:
    raise ValueError("La colonne 'Gemini-1.5' est absente du fichier cti-mcq-responses.tsv.")

# Vérifier si le dataset cti_responses_df contient au moins 591 lignes
cti_responses_len = len(cti_responses_df)
responses_len = len(responses_df)

print(f"Nombre de lignes dans 'Responses_file.tsv': {responses_len}")
print(f"Nombre de lignes dans 'cti-mcq-responses.tsv': {cti_responses_len}")

# Comparer seulement les 591 premières lignes
if responses_len > 591:
    responses_df = responses_df.head(591)  # Prendre seulement les 591 premières lignes
if cti_responses_len > 591:
    cti_responses_df = cti_responses_df.head(591)  # Prendre seulement les 591 premières lignes

# Comparer les colonnes pour les 591 premières lignes
matches = 0
for i in range(responses_len):  # ou cti_responses_len, car les deux ont maintenant 591 lignes
    if responses_df.loc[i, 'first_letter'] == cti_responses_df.loc[i, 'Gemini-1.5']:
        matches += 1

# Calculer le pourcentage de correspondance
percentage = (matches / responses_len) * 100

# Afficher les résultats
print(f"Total correspondances : {matches}/{responses_len}")
print(f"Pourcentage de correspondance : {percentage:.2f}%")


Nombre de lignes dans 'Responses_file.tsv': 591
Nombre de lignes dans 'cti-mcq-responses.tsv': 2500
Total correspondances : 144/591
Pourcentage de correspondance : 24.37%
