In [1]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Assurer que NLTK est prêt
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/pionner01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
def load_csv(file_path):
    """
    Charge un fichier CSV et le renvoie sous forme de DataFrame.
    """
    return pd.read_csv(file_path)

# Exemple d'utilisation
real_annotations_file = "/home/pionner01/auto-annotation/african_data_collector/storage/meta_data_image.csv"  # Remplace avec le bon chemin
generated_annotations_file = "/home/pionner01/auto-annotation/annotation/storage/annotations_blip2.csv"  # Remplace avec le bon chemin
real_annotations_df = load_csv(real_annotations_file)
generated_annotations_df = load_csv(generated_annotations_file)

real_annotations_df.head(), generated_annotations_df.head()


(       filename                                        description  \
 0  image_1.jpeg  Several freshly caught tilapia fish are placed...   
 1  image_2.jpeg  View of a cultivated field of leafy vegetables...   
 2  image_3.jpeg  An agronomist is giving an interview in a spin...   
 3  image_4.jpeg  A woman holds a machete and harvests spinach (...   
 4  image_5.jpeg  A group of cattle moves through the forest, wh...   
 
        category  
 0  fish farming  
 1   agriculture  
 2   agriculture  
 3    rural work  
 4      wildlife  ,
       image_name                                         annotation
 0   image_7.jpeg  a woman in yellow shirt standing in the middle...
 1  image_13.jpeg        a man is walking with a goat on a dirt road
 2   image_9.jpeg                 children washing water in a stream
 3   image_8.jpeg  a man is using a hose to clean a hole in the g...
 4  image_10.jpeg       two boys are playing in the water in a river)

In [4]:
def merge_annotations(real_df, generated_df):
    """
    Fusionne les DataFrames d'annotations réelles et générées sur la clé de l'image.
    """
    return pd.merge(real_df, generated_df, left_on='filename', right_on='image_name', suffixes=('_real', '_generated'))

# Fusionner les DataFrames
merged_df = merge_annotations(real_annotations_df, generated_annotations_df)

# Afficher un échantillon des résultats fusionnés
merged_df[['filename', 'description', 'annotation']].head()


Unnamed: 0,filename,description,annotation
0,image_1.jpeg,Plusieurs poissons tilapia fraîchement pêchés ...,a bucket full of fish sitting on top of a scale
1,image_2.jpeg,Vue d’un champ cultivé de légumes-feuilles (pr...,a field of green vegetables with people workin...
2,image_3.jpeg,Un agronome donne une interview en plein champ...,a man standing in a field with lettuce plants
3,image_4.jpeg,Une femme tient une machette et récolte des ép...,a woman is working in a field of green plants
4,image_5.jpeg,"Un groupe de bœufs se déplace dans la forêt, t...",cattle grazing in a field with birds flying ar...


In [11]:
from nltk.translate.bleu_score import sentence_bleu

# Calculer le score BLEU pour chaque paire d'annotation
def calculate_bleu_score(reference, hypothesis):
    # Tokenisation des textes
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    return sentence_bleu([reference_tokens], hypothesis_tokens)

# Appliquer le calcul du score BLEU à chaque ligne
merged_df['bleu_score'] = merged_df.apply(lambda row: calculate_bleu_score(row['description'], row['annotation']), axis=1)

# Afficher les résultats avec les scores BLEU
merged_df[['filename', 'description', 'annotation', 'bleu_score']].head()


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Unnamed: 0,filename,description,annotation,bleu_score
0,african_village_0.jpg,A girl sent on an errand.,a young girl carrying a basket of dirty clothes,1.051835e-231
1,african_village_1.jpg,"Sunset tree in Kenya Safari, Africa",a lone tree stands in the middle of a field,1.2183320000000001e-231
2,african_village_2.jpg,Taken on a trip in 2016 with World Vision to S...,women in african traditional dress standing in...,2.339639e-232
3,african_village_3.jpg,"Road in Mukono, Uganda",a busy street with many people walking and rid...,0.0
4,african_village_4.jpg,Taken on a trip in 2016 with World Vision to S...,a young girl smiles as she is surrounded by ot...,2.8017009999999998e-232


In [13]:
from rouge_score import rouge_scorer

# Créer un scoreur ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculer les scores ROUGE
def calculate_rouge_score(reference, hypothesis):
    scores = scorer.score(reference, hypothesis)
    return scores['rouge1'].fmeasure  # Par exemple, on peut se concentrer sur le score ROUGE-1

# Appliquer le calcul du score ROUGE à chaque ligne
merged_df['rouge_score'] = merged_df.apply(lambda row: calculate_rouge_score(row['description'], row['annotation']), axis=1)

# Afficher les résultats avec les scores ROUGE
merged_df[['filename', 'description', 'annotation', 'rouge_score']].head()


Unnamed: 0,filename,description,annotation,rouge_score
0,african_village_0.jpg,A girl sent on an errand.,a young girl carrying a basket of dirty clothes,0.266667
1,african_village_1.jpg,"Sunset tree in Kenya Safari, Africa",a lone tree stands in the middle of a field,0.25
2,african_village_2.jpg,Taken on a trip in 2016 with World Vision to S...,women in african traditional dress standing in...,0.113208
3,african_village_3.jpg,"Road in Mukono, Uganda",a busy street with many people walking and rid...,0.0
4,african_village_4.jpg,Taken on a trip in 2016 with World Vision to S...,a young girl smiles as she is surrounded by ot...,0.036364


In [14]:
# Filtrer les annotations avec un score BLEU faible
low_bleu_scores = merged_df[merged_df['bleu_score'] < 0.2]  # Par exemple, choisir un seuil de 0.2
low_bleu_scores[['filename', 'description', 'annotation', 'bleu_score']].head()


Unnamed: 0,filename,description,annotation,bleu_score
0,african_village_0.jpg,A girl sent on an errand.,a young girl carrying a basket of dirty clothes,1.051835e-231
1,african_village_1.jpg,"Sunset tree in Kenya Safari, Africa",a lone tree stands in the middle of a field,1.2183320000000001e-231
2,african_village_2.jpg,Taken on a trip in 2016 with World Vision to S...,women in african traditional dress standing in...,2.339639e-232
3,african_village_3.jpg,"Road in Mukono, Uganda",a busy street with many people walking and rid...,0.0
4,african_village_4.jpg,Taken on a trip in 2016 with World Vision to S...,a young girl smiles as she is surrounded by ot...,2.8017009999999998e-232
