In [19]:
def get_api_key(file_path):
    """Read an API key from the first line of a ``NAME=VALUE`` text file.

    Note: this helper is defined again in later cells of this notebook;
    keep the copies in sync.

    Args:
        file_path: Path to a text file whose first line has the form
            ``NAME=VALUE``. Only the first line is read.

    Returns:
        The text after the first ``=`` on the first line, with surrounding
        whitespace removed (so ``NAME = value`` yields ``"value"``).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first line does not contain ``=``.
    """
    with open(file_path, 'r') as file:
        line = file.readline().strip()

    # partition() never raises, which lets us report a clear error instead of
    # an opaque tuple-unpacking failure when the separator is missing.
    _name, separator, value = line.partition('=')
    if not separator:
        # The line content is deliberately left out of the message: it may be the secret.
        raise ValueError(f"Expected a 'NAME=VALUE' first line in {file_path!r}")

    # Whitespace around '=' must not end up inside the key.
    return value.strip()

# Load the Groq API key from the local key file (get_api_key reads the first
# line and returns the part after '='). Keeping the key in a file avoids
# hard-coding the secret in the notebook.
GROQ_API_KEY = get_api_key('api_key_llama2.txt')

#### Génération des réponses de Llama3 sur le dataset RCM

In [8]:
import pandas as pd
import time
from groq import Groq
from tqdm import tqdm

# Function to read the API key
def get_api_key(file_path):
    """Read an API key from the first line of a ``NAME=VALUE`` text file.

    Note: this helper is also defined in other cells of this notebook;
    keep the copies in sync.

    Args:
        file_path: Path to a text file whose first line has the form
            ``NAME=VALUE``. Only the first line is read.

    Returns:
        The text after the first ``=`` on the first line, with surrounding
        whitespace removed (so ``NAME = value`` yields ``"value"``).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first line does not contain ``=``.
    """
    with open(file_path, 'r') as file:
        line = file.readline().strip()

    # partition() never raises, which lets us report a clear error instead of
    # an opaque tuple-unpacking failure when the separator is missing.
    _name, separator, value = line.partition('=')
    if not separator:
        # The line content is deliberately left out of the message: it may be the secret.
        raise ValueError(f"Expected a 'NAME=VALUE' first line in {file_path!r}")

    # Whitespace around '=' must not end up inside the key.
    return value.strip()

# Standard-library helpers needed only by this cell's output handling.
import csv
import os

# Use the API key
GROQ_API_KEY = get_api_key('api_key_llama2.txt')
client = Groq(api_key=GROQ_API_KEY)

# Load the cti-rcm dataset
df = pd.read_csv('cti-rcm.tsv', sep='\t', encoding='ISO-8859-1')

# Select only the 'Prompt' column; fail early with a readable message if it is missing
if 'Prompt' not in df.columns:
    raise KeyError(f"'Prompt' column not found in cti-rcm.tsv; available columns: {list(df.columns)}")
df_prompts = df[['Prompt']]

# Define the range of rows to process (positional, end exclusive)
start_line = 0  # Adjust start line
end_line = 400
df_limited = df_prompts.iloc[start_line:end_line]

# Path to the output TSV file
output_file = 'llama3_cti_rcm_R.tsv'

# The extraction step reads this file with a header row and looks up a
# 'Responses' column, so the header must be written once when the file is
# new or empty. The file is opened in append mode so an interrupted run can
# be resumed by adjusting start_line.
needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

# Number of prompts whose API call failed (reported after the loop)
error_count = 0

# Write responses to the TSV file.
# - encoding matches the ISO-8859-1 encoding used when the file is read back;
#   errors='replace' substitutes unencodable characters instead of losing the
#   whole response.
# - csv.writer quotes fields containing '"' and writes an empty response as
#   '""', so every response stays on exactly one parseable row.
with open(output_file, 'a', newline='', encoding='ISO-8859-1', errors='replace') as file:
    writer = csv.writer(file, delimiter='\t', lineterminator='\n')
    if needs_header:
        writer.writerow(['Responses'])

    # Iterate through each row in the dataset
    for index, row in tqdm(df_limited.iterrows(), total=df_limited.shape[0], desc="Processing Prompts"):
        # Extract the prompt
        prompt = row['Prompt']

        # Get the response from the llama3 model using Groq.
        # temperature=1 samples stochastically, so reruns may give different answers.
        try:
            completion = client.chat.completions.create(
                model="llama3-8b-8192",
                messages=[{"role": "user", "content": prompt}],
                temperature=1,
                max_tokens=50,
                top_p=1,
                stream=False
            )

            # Extract the response and flatten it onto a single line
            response_text = completion.choices[0].message.content.strip()
            cleaned_response = (
                response_text.replace("\r", " ").replace("\n", " ").replace("\t", " ")
            )

        except Exception as e:
            # Keep one output row per prompt so rows stay aligned with the dataset
            print(f"Error processing row {index + 1}: {e}")
            cleaned_response = 'ERROR'
            error_count += 1

        # Exactly one row per prompt; flush so a crash loses at most the current row
        writer.writerow([cleaned_response])
        file.flush()

        # Sleep after every request (including failed ones) to avoid rate-limiting
        time.sleep(1)

if error_count:
    print(f"{error_count} prompt(s) failed and were recorded as 'ERROR'.")
print("Responses saved successfully in:", output_file)


Processing Prompts: 100%|████████████████████████████████████████████████████████████| 400/400 [18:58<00:00,  2.85s/it]

Responses saved successfully in: llama3_cti_rcm_R.tsv





In [None]:
import pandas as pd
import time
from groq import Groq
from tqdm import tqdm

# Function to read the API key
def get_api_key(file_path):
    """Read an API key from the first line of a ``NAME=VALUE`` text file.

    Note: this helper is also defined in earlier cells of this notebook;
    keep the copies in sync.

    Args:
        file_path: Path to a text file whose first line has the form
            ``NAME=VALUE``. Only the first line is read.

    Returns:
        The text after the first ``=`` on the first line, with surrounding
        whitespace removed (so ``NAME = value`` yields ``"value"``).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the first line does not contain ``=``.
    """
    with open(file_path, 'r') as file:
        line = file.readline().strip()

    # partition() never raises, which lets us report a clear error instead of
    # an opaque tuple-unpacking failure when the separator is missing.
    _name, separator, value = line.partition('=')
    if not separator:
        # The line content is deliberately left out of the message: it may be the secret.
        raise ValueError(f"Expected a 'NAME=VALUE' first line in {file_path!r}")

    # Whitespace around '=' must not end up inside the key.
    return value.strip()

# Standard-library helpers needed only by this cell's output handling.
import csv
import os

# Use the API key
GROQ_API_KEY = get_api_key('api_key_llama2.txt')
client = Groq(api_key=GROQ_API_KEY)

# Load the cti-rcm dataset
df = pd.read_csv('cti-rcm.tsv', sep='\t', encoding='ISO-8859-1')

# Select only the 'Prompt' column; fail early with a readable message if it is missing
if 'Prompt' not in df.columns:
    raise KeyError(f"'Prompt' column not found in cti-rcm.tsv; available columns: {list(df.columns)}")
df_prompts = df[['Prompt']]

# Define the range of rows to process (positional, end exclusive)
start_line = 400  # Adjust start line
end_line = 800
df_limited = df_prompts.iloc[start_line:end_line]

# Path to the output TSV file
output_file = 'llama3_cti_rcm_R2.tsv'

# Write a 'Responses' header once when the file is new or empty, matching the
# column name the extraction step of this notebook looks up. The file is
# opened in append mode so an interrupted run can be resumed by adjusting
# start_line.
needs_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

# Number of prompts whose API call failed (reported after the loop)
error_count = 0

# Write responses to the TSV file.
# - encoding matches the ISO-8859-1 encoding this notebook uses when reading
#   TSV files; errors='replace' substitutes unencodable characters instead of
#   losing the whole response.
# - csv.writer quotes fields containing '"' and writes an empty response as
#   '""', so every response stays on exactly one parseable row.
with open(output_file, 'a', newline='', encoding='ISO-8859-1', errors='replace') as file:
    writer = csv.writer(file, delimiter='\t', lineterminator='\n')
    if needs_header:
        writer.writerow(['Responses'])

    # Iterate through each row in the dataset
    for index, row in tqdm(df_limited.iterrows(), total=df_limited.shape[0], desc="Processing Prompts"):
        # Extract the prompt
        prompt = row['Prompt']

        # Get the response from the llama3 model using Groq.
        # temperature=1 samples stochastically, so reruns may give different answers.
        try:
            completion = client.chat.completions.create(
                model="llama3-8b-8192",
                messages=[{"role": "user", "content": prompt}],
                temperature=1,
                max_tokens=50,
                top_p=1,
                stream=False
            )

            # Extract the response and flatten it onto a single line
            response_text = completion.choices[0].message.content.strip()
            cleaned_response = (
                response_text.replace("\r", " ").replace("\n", " ").replace("\t", " ")
            )

        except Exception as e:
            # Keep one output row per prompt so rows stay aligned with the dataset
            print(f"Error processing row {index + 1}: {e}")
            cleaned_response = 'ERROR'
            error_count += 1

        # Exactly one row per prompt; flush so a crash loses at most the current row
        writer.writerow([cleaned_response])
        file.flush()

        # Sleep after every request (including failed ones) to avoid rate-limiting
        time.sleep(1)

if error_count:
    print(f"{error_count} prompt(s) failed and were recorded as 'ERROR'.")
print("Responses saved successfully in:", output_file)


Processing Prompts:  80%|████████████████████████████████████████████████▎           | 322/400 [14:07<03:41,  2.84s/it]

#### Extraction de la réponse

In [12]:
import pandas as pd
import re

# Load the raw model responses produced by the generation step.
# skip_blank_lines=False keeps an empty response as a NaN row instead of
# silently dropping it; the results are later compared position-by-position
# with the benchmark file, so a dropped row would shift every following row.
input_file = 'llama3_cti_rcm_R.tsv'
df = pd.read_csv(input_file, sep='\t', encoding='ISO-8859-1', skip_blank_lines=False)

# Fonction pour extraire les expressions commençant par "CWE-"
def extract_cwe(text):
    """Return the first ``CWE-<digits>`` identifier found in ``text``.

    ``text`` is converted with ``str()`` first, so non-string values (such as
    NaN) are accepted. The match is case-sensitive and must start at a word
    boundary. Returns the string ``"N/A"`` when no identifier is present.
    """
    first_hit = re.search(r'\bCWE-\d+', str(text))
    if first_hit is None:
        return "N/A"
    return first_hit.group(0)

# Derive the CWE identifier for every stored response
df['Extracted_CWE'] = df['Responses'].map(extract_cwe)

# Keep only the raw response and the CWE extracted from it
Llamma3_reponses_rcm = df.loc[:, ['Responses', 'Extracted_CWE']]

# Persist the result as a tab-separated file without the index column
output_file = 'Llamma3_reponses_rcm.tsv'
Llamma3_reponses_rcm.to_csv(output_file, index=False, sep='\t')

print(f"Extraction terminée. Les résultats sont sauvegardés dans {output_file}.")


Extraction terminée. Les résultats sont sauvegardés dans Llamma3_reponses_rcm.tsv.


#### Comparaison

In [18]:
import pandas as pd

# Inputs: the CWE answers extracted in this notebook and the benchmark file.
responses_file = 'Llamma3_reponses_rcm.tsv'
# Benchmark dataset; presumably its 'LLAMA3-8B' column holds the benchmark's
# own Llama3 answers -- TODO confirm against the benchmark documentation.
cti_file = 'cti-rcm-responses.tsv'

df_responses = pd.read_csv(responses_file, sep='\t', encoding='ISO-8859-1')
df_cti = pd.read_csv(cti_file, sep='\t', encoding='ISO-8859-1')

# Number of leading rows to compare (the first generation cell processes rows 0-400)
n_rows = 400
df_responses_limited = df_responses.head(n_rows)
df_cti_limited = df_cti.head(n_rows)

# Comparing two Series with `==` requires identical index labels, so a file
# with fewer rows than n_rows would raise ValueError. Fall back to the rows
# both files have and make the truncation visible.
n_common = min(len(df_responses_limited), len(df_cti_limited))
if len(df_responses_limited) != len(df_cti_limited):
    print(
        f"Warning: {responses_file} has {len(df_responses_limited)} row(s) and "
        f"{cti_file} has {len(df_cti_limited)} row(s); comparing the first {n_common}."
    )
    df_responses_limited = df_responses_limited.head(n_common)
    df_cti_limited = df_cti_limited.head(n_common)

# Compare the two columns row by row.
# NOTE: the "N/A" placeholder written by the extraction step is parsed back as
# NaN by read_csv's default NA handling, and NaN never compares equal, so such
# rows always count as mismatches.
comparison = df_responses_limited['Extracted_CWE'] == df_cti_limited['LLAMA3-8B']

# Percentage of matching rows (0.0 when there is nothing to compare)
percentage_correct = (comparison.sum() / len(comparison)) * 100 if len(comparison) else 0.0

print(f"Pourcentage de réponses correctes : {percentage_correct:.2f}%")

# Build a side-by-side table of both answers and the match flag
comparison_results = pd.DataFrame({
    'Llamma3_Responses': df_responses_limited['Extracted_CWE'],
    'CTI_Llamma3': df_cti_limited['LLAMA3-8B'],
    'Is_Correct': comparison
})

# Save the comparison table to a TSV file
output_file = 'comparison_results.tsv'
comparison_results.to_csv(output_file, sep='\t', index=False)

print(f"Les résultats de la comparaison ont été sauvegardés dans {output_file}.")


Pourcentage de réponses correctes : 38.75%
Les résultats de la comparaison ont été sauvegardés dans comparison_results.tsv.
