In [38]:
import os
import subprocess
import pandas as pd
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import nltk
from collections import defaultdict
import warnings
import treetaggerwrapper
import seaborn as sns
# Silence specific warning categories (FutureWarning and SyntaxWarning).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=SyntaxWarning)


In [39]:
# Show which Matplotlib style names are available in this environment.
print(plt.style.available)
# Select Matplotlib's 'default' style, then apply the Seaborn theme.
plt.style.use('default')
sns.set_theme()  # Sets Seaborn's default themes for Matplotlib

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'petroff10', 'seaborn-v0_8', 'seaborn-v0_8-bright', 'seaborn-v0_8-colorblind', 'seaborn-v0_8-dark', 'seaborn-v0_8-dark-palette', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8-deep', 'seaborn-v0_8-muted', 'seaborn-v0_8-notebook', 'seaborn-v0_8-paper', 'seaborn-v0_8-pastel', 'seaborn-v0_8-poster', 'seaborn-v0_8-talk', 'seaborn-v0_8-ticks', 'seaborn-v0_8-white', 'seaborn-v0_8-whitegrid', 'tableau-colorblind10']


In [43]:
import subprocess

def run_treetagger(input_file, output_file, TAGPARFILE='penn.par'):
    """
    Run the ``tree-tagger`` executable on ``input_file`` and write to ``output_file``.

    Args:
        input_file: Path of the text file handed to TreeTagger.
        output_file: Path TreeTagger is asked to write its result to.
        TAGPARFILE: Parameter file passed as TreeTagger's first argument.

    Returns:
        bool: True when the command completed successfully; False when it
        could not be started or exited with a non-zero status. Failures are
        printed rather than raised so a batch run can continue.
    """
    cmd = ["tree-tagger", TAGPARFILE, input_file, output_file]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ``tree-tagger`` binary,
        # which subprocess reports before the command ever runs.
        print(f"Erreur lors de l'exécution de Treetagger : {e}")
        return False
    print(f"Treetagger a traité avec succès {input_file}")
    return True

def evaluate_precision(tagged_file, reference_file):
    """
    Compute the share of tokens whose tag matches between two ``word/TAG`` files.

    Both files are read in parallel, line by line. Each line is split on
    whitespace and tokens are paired by position. A pair is scored only when
    both tokens contain a ``/``; the tag is the text after the last ``/``.
    Extra lines or tokens on either side are ignored (``zip`` stops at the
    shorter input).

    Args:
        tagged_file: Path of the file holding the predicted tags.
        reference_file: Path of the file holding the reference tags.

    Returns:
        float: matching pairs divided by scored pairs, or 0.0 when no pair
        could be scored (a warning is emitted in that case, since it usually
        means the files are not in ``word/TAG`` format).
    """
    total_tags = 0
    correct_tags = 0

    # Explicit UTF-8 so the result does not depend on the platform's locale encoding.
    with open(tagged_file, 'r', encoding='utf-8') as tf, \
            open(reference_file, 'r', encoding='utf-8') as rf:
        for tagged_line, reference_line in zip(tf, rf):
            for tagged_token, reference_token in zip(tagged_line.split(), reference_line.split()):
                if '/' not in tagged_token or '/' not in reference_token:
                    continue
                total_tags += 1
                if tagged_token.rsplit('/', 1)[1] == reference_token.rsplit('/', 1)[1]:
                    correct_tags += 1

    if total_tags == 0:
        # Make the "nothing was comparable" case visible instead of reporting a silent 0.
        warnings.warn(
            f"Aucune paire mot/étiquette comparable entre {tagged_file} et {reference_file} ; "
            "vérifiez le format des fichiers."
        )
        return 0.0
    return correct_tags / total_tags

def run_tests_on_files(input_files, output_files, tagset='penn.par', reference_files=None):
    """
    Run Treetagger on every input file and print the precision for each test file.

    The lists are paired by position; iteration stops at the shortest list.

    Args:
        input_files: Paths of the files to tag.
        output_files: Paths where the tagger output is written.
        tagset: Parameter file forwarded to ``run_treetagger``.
        reference_files: Paths of the reference files. When None, the files
            are tagged but no precision is computed.
    """
    if reference_files is None:
        # The declared default used to crash inside zip(); with no references
        # there is nothing to score, so only the tagging step is performed.
        cases = ((src, dst, None) for src, dst in zip(input_files, output_files))
    else:
        cases = zip(input_files, output_files, reference_files)

    for input_file, output_file, reference_file in cases:
        print(f"\nTraitement de {input_file} avec le tagset {tagset}...")
        # Run Treetagger
        run_treetagger(input_file, output_file, tagset)
        if reference_file is None:
            continue
        if not os.path.isfile(output_file):
            # The tagger reports its own failure; don't follow up with a
            # FileNotFoundError from the evaluation step.
            print(f"Fichier de sortie introuvable, évaluation ignorée : {output_file}")
            continue
        # Compute the precision
        precision = evaluate_precision(output_file, reference_file)
        print(f"Précision pour {input_file} : {precision:.4f}")

# Test files for the different tag sets.
# The three lists below are paired by position: entry i of each list is
# passed together to run_tests_on_files.
input_files = [
    "Datasets/input_files/NNC_test_text.txt", 
    "Datasets/input_files/that_adv.txt",
    "Datasets/input_files/that_conjunction.txt", 
    "Datasets/input_files/that_determiner.txt",
    "Datasets/input_files/that_pronoun.txt"
]
# Paths the tagger output is written to, one per input file.
# NOTE(review): "conjountion" (here and in reference_files) differs from the
# input file's "conjunction" spelling — confirm against the files on disk
# before renaming anything.
output_files = [
    "Datasets/output_files/test_NNC_output.txt", 
    "Datasets/output_files/test_that_adv_output.txt", 
    "Datasets/output_files/test_that_conjountion_output.txt",
    "Datasets/output_files/test_that_determiner_output.txt",
    "Datasets/output_files/test_that_pronoun_output.txt"
]
# Reference files the tagger output is compared against.
# NOTE(review): the first entry ends in ".text" while the others end in
# ".txt" — confirm the file name.
reference_files = [
    "Datasets/reference_files/that_NNC_reference.text",
    "Datasets/reference_files/that_adv_reference.txt", 
    "Datasets/reference_files/that_conjountion_reference.txt", 
    "Datasets/reference_files/that_determiner_reference.txt",
    "Datasets/reference_files/that_pronoun_reference.txt"
]

# Run the tests with the 'penn.par' parameter file.
run_tests_on_files(input_files, output_files, 'penn.par', reference_files)
# Disabled alternative run with the 'bnc.par' parameter file:
# run_tests_on_files(input_files, output_files, 'bnc.par', reference_files)



Traitement de Datasets/input_files/NNC_test_text.txt avec le tagset penn.par...
Treetagger a traité avec succès Datasets/input_files/NNC_test_text.txt
Précision pour Datasets/input_files/NNC_test_text.txt : 0.0000

Traitement de Datasets/input_files/that_adv.txt avec le tagset penn.par...
Treetagger a traité avec succès Datasets/input_files/that_adv.txt
Précision pour Datasets/input_files/that_adv.txt : 0.0000

Traitement de Datasets/input_files/that_conjunction.txt avec le tagset penn.par...
Treetagger a traité avec succès Datasets/input_files/that_conjunction.txt
Précision pour Datasets/input_files/that_conjunction.txt : 0.0000

Traitement de Datasets/input_files/that_determiner.txt avec le tagset penn.par...
Treetagger a traité avec succès Datasets/input_files/that_determiner.txt
Précision pour Datasets/input_files/that_determiner.txt : 0.0000

Traitement de Datasets/input_files/that_pronoun.txt avec le tagset penn.par...
Treetagger a traité avec succès Datasets/input_files/that_pr

In [None]:
# Step 1: Define helper functions for Treetagger

def run_treetagger(input_file, output_file, language="english"):
    """
    Runs Treetagger on the input file and saves the output to output_file.

    Note: this redefines the ``run_treetagger`` declared in an earlier cell;
    whichever cell ran last wins.

    Args:
        input_file: Path of the text file handed to TreeTagger.
        output_file: Path TreeTagger is asked to write its result to.
        language: Parameter file to use. A bare name such as ``"english"`` is
            resolved to ``"<name>.par"`` when that file exists and the bare
            name does not; otherwise the value is passed through unchanged.

    Returns:
        bool: True when the command completed successfully; False when it
        could not be started or exited with a non-zero status. Failures are
        printed rather than raised.
    """
    # tree-tagger's first positional argument is a parameter *file*, so a bare
    # language name only works if it can be mapped to one.
    param_file = language
    if not os.path.isfile(param_file) and os.path.isfile(f"{language}.par"):
        param_file = f"{language}.par"

    # -token and -lemma add the token and lemma columns, giving the
    # "token<TAB>tag<TAB>lemma" lines that parse_treetagger_output unpacks.
    cmd = ["tree-tagger", "-token", "-lemma", param_file, input_file, output_file]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ``tree-tagger`` binary.
        print(f"Error running Treetagger: {e}")
        return False
    print(f"Treetagger successfully processed {input_file}")
    return True


def parse_treetagger_output(output_file):
    """
    Parses the Treetagger output into a structured DataFrame.

    Each line is expected to hold three tab-separated fields: word, tag and
    lemma. Blank lines and lines with any other number of fields are skipped
    (and counted in a printed notice) instead of aborting the parse.

    Args:
        output_file: Path of the tagger output file, read as UTF-8.

    Returns:
        pandas.DataFrame: columns ``Word``, ``Tag`` and ``Lemma`` (always
        present, even when no line could be parsed), one row per parsed line.
    """
    columns = ["Word", "Tag", "Lemma"]
    rows = []
    skipped = 0
    with open(output_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            fields = stripped.split("\t")
            if len(fields) != len(columns):
                # Not a word/tag/lemma triple; unpacking it would raise ValueError.
                skipped += 1
                continue
            rows.append(dict(zip(columns, fields)))
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in {output_file}")
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# Step 2: Load and preprocess datasets
def load_datasets(folder_path):
    """
    Loads datasets for evaluation from the specified folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict: Maps each ``.txt`` file's name without its extension to the
        file's path (``folder_path`` joined with the file name). Entries are
        inserted in sorted file-name order so runs are reproducible.
    """
    datasets = {}
    # os.listdir returns entries in arbitrary order; sort for deterministic runs.
    for file in sorted(os.listdir(folder_path)):
        # splitext removes only the final extension, so "a.b.txt" keeps the
        # name "a.b" and cannot collide with "a.txt".
        dataset_name, extension = os.path.splitext(file)
        if extension == ".txt":
            datasets[dataset_name] = os.path.join(folder_path, file)
    return datasets

# Step 3: Evaluate the precision of tags
def evaluate_tags(predictions, ground_truth):
    """
    Score predicted tags against the ground-truth tags.

    Args:
        predictions: Object whose ``"Tag"`` entry holds the predicted tags.
        ground_truth: Object whose ``"Tag"`` entry holds the reference tags.

    Returns:
        tuple: ``(precision, report)`` where ``precision`` is the weighted
        precision from ``precision_score`` and ``report`` is the output of
        ``classification_report``; both are computed with ``zero_division=0``.
    """
    gold_tags = ground_truth["Tag"]
    predicted_tags = predictions["Tag"]

    weighted_precision = precision_score(
        gold_tags, predicted_tags, average="weighted", zero_division=0
    )
    report = classification_report(gold_tags, predicted_tags, zero_division=0)
    return weighted_precision, report

# Step 4: Visualization
def plot_precision_results(results):
    """
    Plots precision results for different datasets.

    Args:
        results: Records with ``"Dataset"`` and ``"Precision"`` entries
            (anything ``pd.DataFrame`` accepts). When it yields no rows, a
            notice is printed and no figure is drawn.
    """
    df_results = pd.DataFrame(results)
    if df_results.empty:
        # An empty frame has no "Dataset"/"Precision" columns to plot.
        print("No precision results to plot.")
        return

    ax = df_results.plot(kind="bar", x="Dataset", y="Precision", color="skyblue", legend=False)
    ax.set_title("Precision Results by Dataset")
    ax.set_xlabel("Dataset")
    ax.set_ylabel("Precision")
    plt.tight_layout()
    plt.show()

# Step 5: Main Workflow
def main():
    """
    Tag every dataset in the input folder, score it, and plot the precisions.

    For each ``.txt`` dataset found by ``load_datasets``: run Treetagger,
    parse its output, read ``<dataset>_ground_truth.csv`` from the input
    folder, compute precision and print the classification report. A dataset
    whose tagger output or ground-truth file is missing, or whose row count
    differs from the ground truth, is skipped with a message so the remaining
    datasets are still processed.
    """
    # Paths for input/output
    input_folder = "./datasets/test_files"  # Replace with your folder path
    output_folder = "./datasets/outputs"
    os.makedirs(output_folder, exist_ok=True)

    if not os.path.isdir(input_folder):
        print(f"Input folder not found: {input_folder}")
        return

    # Load datasets
    datasets = load_datasets(input_folder)

    # Initialize results storage
    results = []

    for dataset_name, input_file in datasets.items():
        print(f"Processing dataset: {dataset_name}")

        # Run Treetagger
        output_file = os.path.join(output_folder, f"{dataset_name}_output.txt")
        run_treetagger(input_file, output_file)
        if not os.path.isfile(output_file):
            # The tagger prints its own error; there is nothing to parse.
            print(f"Skipping {dataset_name}: no Treetagger output at {output_file}")
            continue

        # Parse Treetagger output
        predictions = parse_treetagger_output(output_file)

        # Ground truth comparison (simulated, replace with actual ground truth file)
        ground_truth_file = os.path.join(input_folder, f"{dataset_name}_ground_truth.csv")
        if not os.path.isfile(ground_truth_file):
            print(f"Skipping {dataset_name}: ground truth file not found ({ground_truth_file})")
            continue
        ground_truth = pd.read_csv(ground_truth_file)

        if len(predictions) != len(ground_truth):
            # The metrics compare tags position by position, so the row counts must agree.
            print(
                f"Skipping {dataset_name}: {len(predictions)} predicted rows vs "
                f"{len(ground_truth)} ground truth rows"
            )
            continue

        # Evaluate precision
        precision, report = evaluate_tags(predictions, ground_truth)

        # Store results
        results.append({"Dataset": dataset_name, "Precision": precision})

        # Print classification report
        print(f"Classification Report for {dataset_name}:\n{report}")

    # Plot results
    if results:
        plot_precision_results(results)
    else:
        print("No dataset could be evaluated; nothing to plot.")

# Run the workflow only when this code is executed as the main module.
if __name__ == "__main__":
    main()
