-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b4f64f9
commit b1e6d84
Showing
9 changed files
with
783 additions
and
828 deletions.
There are no files selected for viewing
517 changes: 466 additions & 51 deletions
517
6_rentability_study/notebook/1_Evaluate_rentability.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
340 changes: 0 additions & 340 deletions
340
6_rentability_study/notebook/DRAFTS_Business_relevance_sandbox.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
211 changes: 211 additions & 0 deletions
211
6_rentability_study/notebook/annotation_agreement_score.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
* Name: annotation_agreement_score | ||
* Description: Compute annotation agreement score between annotator and previous clustering. | ||
* Author: Erwan Schild | ||
* Created: 10/03/2023 | ||
* Licence: CeCILL (https://cecill.info/licences.fr.html) | ||
""" | ||
|
||
# ============================================================================== | ||
# IMPORT PYTHON DEPENDENCIES | ||
# ============================================================================== | ||
|
||
from typing import List, Dict, Optional, Tuple | ||
import json | ||
import numpy as np | ||
from scipy import stats as scipystats | ||
from matplotlib import pyplot as plt | ||
from matplotlib.figure import Figure | ||
|
||
# ============================================================================== | ||
# 1. COMPUTE ANNOTATION AGREEMENT SCORE | ||
# ============================================================================== | ||
def compute_annotation_agreement_score(
    clustering: Dict[str, int],
    annotations: List[Tuple[str, str, str]],
) -> Optional[float]:
    """
    Measure how often constraint annotations agree with a previous clustering.

    An annotation agrees with the clustering when it is a `"MUST_LINK"` between
    two data of the same cluster, or a `"CANNOT_LINK"` between two data of
    different clusters. Annotations with any other constraint type are ignored.

    NB: a low score (near `0.0`) means that the annotated constraints contradict
    the previous clustering, so the next clustering should change a lot;
    a high score means that the clustering is stable.

    Args:
        clustering (Dict[str, int]): The cluster ID of each data ID, from the previous iteration.
        annotations (List[Tuple[str, str, str]]): The annotations of the current iteration, as `(data ID 1, data ID 2, constraint type)`.

    Raises:
        KeyError: If an annotated data ID is not in `clustering`.

    Returns:
        Optional[float]: The proportion of agreeing annotations among the `"MUST_LINK"` and `"CANNOT_LINK"` ones, or `None` if there is none of them.
    """
    agreements: int = 0
    disagreements: int = 0

    for annotation in annotations:
        # Compare the clusters of the two annotated data.
        in_same_cluster: bool = clustering[annotation[0]] == clustering[annotation[1]]
        constraint_type: str = annotation[2]

        # Translate the constraint into the cluster relation it expects.
        if constraint_type == "MUST_LINK":
            expects_same_cluster: bool = True
        elif constraint_type == "CANNOT_LINK":
            expects_same_cluster = False
        else:
            # Not annotated: this pair is not taken into account.
            continue

        if in_same_cluster == expects_same_cluster:
            agreements += 1
        else:
            disagreements += 1

    # The score is undefined when no annotation can be compared.
    compared: int = agreements + disagreements
    if compared == 0:
        return None
    return agreements / compared
|
||
# ============================================================================== | ||
# 2. DISPLAY ANNOTATION AGREEMENT SCORE EVOLUTION | ||
# ============================================================================== | ||
def display_annotation_agreement_score(
    implementation: str,
    list_of_experiments: List[str],
    list_of_iterations: Optional[List[str]] = None,
    plot_label: str = "Accord annotation/clustering.",
    plot_color: str = "black",
    graph_filename: str = "annotation_agreement_score.png",
) -> Figure:
    """
    Plot the evolution of the annotation agreement score over iterations.

    For each iteration, the scores of all experiments are averaged; the curve
    shows this mean and the shaded band shows the mean +/- the standard error
    of the mean. An experiment that has no score for an iteration counts as
    `1.0` for this iteration.

    Args:
        implementation (str): The subfolder of `../experiments/` that contains the experiment files.
        list_of_experiments (List[str]): The experiment file suffixes, appended to `previous_results___` and `annotation_agreement_score___`.
        list_of_iterations (Optional[List[str]]): The iterations to display. If `None`, they are deduced from the clustering results of the experiments. Defaults to `None`.
        plot_label (str): The label of the plot. Defaults to `"Accord annotation/clustering."`.
        plot_color (str): The color of the plot. Defaults to `"black"`.
        graph_filename (str): The file name used to store the graph in `../results/`. Nothing is stored if `None`. Defaults to `"annotation_agreement_score.png"`.

    Returns:
        Figure: The figure of the annotation agreement score evolution.
    """

    # Deduce the iterations from the stored clustering results when they are not given.
    if list_of_iterations is None:
        last_iteration: str = "0001"
        for results_name in list_of_experiments:
            results_path: str = "../experiments/" + implementation + "/previous_results___" + results_name
            with open(results_path, "r") as results_file:
                clustering_by_iteration: Dict[str, Dict[str, int]] = json.load(results_file)["dict_of_clustering_results"]

            # Iteration IDs are zero-padded strings, so they are compared as strings.
            highest_iteration: str = max(clustering_by_iteration.keys())
            if highest_iteration > last_iteration:
                last_iteration = highest_iteration

        # NOTE: `range` stops before the maximum iteration itself.
        list_of_iterations = [
            str(number).zfill(4)
            for number in range(int(last_iteration))
        ]

    # Iteration "0000" is never displayed.
    # NOTE(review): this filter is also applied to a caller-supplied list; confirm against the original layout.
    list_of_iterations = [
        iteration
        for iteration in list_of_iterations
        if iteration != "0000"
    ]

    # Gather, for each iteration, the score of every experiment.
    scores_by_iteration: Dict[str, List[float]] = {
        iteration: [] for iteration in list_of_iterations
    }
    for scores_name in list_of_experiments:
        scores_path: str = "../experiments/" + implementation + "/annotation_agreement_score___" + scores_name
        with open(scores_path, "r") as scores_file:
            experiment_scores: Dict[str, float] = json.load(scores_file)

        for iteration in list_of_iterations:
            # An iteration that is not reached by this experiment counts as 1.0.
            scores_by_iteration[iteration].append(
                experiment_scores.get(iteration, 1.0)
            )

    # Summarize the scores of each iteration: mean, then standard error of the mean.
    mean_scores: List[float] = [
        np.mean(scores_by_iteration[iteration])
        for iteration in list_of_iterations
    ]
    sem_scores: List[float] = [
        scipystats.sem(scores_by_iteration[iteration])
        for iteration in list_of_iterations
    ]

    # Create a new figure and set the range of the y axis.
    fig_plot: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
    axis_plot = fig_plot.gca()
    axis_plot.set_ylim(ymin=-0.05, ymax=1.05)

    # Plot the average annotation agreement score evolution.
    x_values: List[int] = [int(iteration) for iteration in list_of_iterations]
    axis_plot.plot(
        x_values,
        mean_scores,
        label=plot_label,
        marker="",
        markerfacecolor=plot_color,
        markersize=5,
        color=plot_color,
        linewidth=2,
        linestyle="-",
    )

    # Plot the band of the standard error of the mean around the average.
    lower_bounds: List[float] = [mean - sem for mean, sem in zip(mean_scores, sem_scores)]
    upper_bounds: List[float] = [mean + sem for mean, sem in zip(mean_scores, sem_scores)]
    axis_plot.fill_between(
        x=x_values,
        y1=lower_bounds,
        y2=upper_bounds,
        color=plot_color,
        alpha=0.2,
    )

    # Set the axis names, the legend and the grid.
    axis_plot.set_xlabel("itération (#)", fontsize=18)
    axis_plot.set_ylabel("accord annotation/clustering (%)", fontsize=18)
    axis_plot.legend(fontsize=15, loc="lower right")
    axis_plot.grid(True)

    # Store the graph.
    if graph_filename is not None:
        fig_plot.savefig(
            "../results/" + graph_filename,
            dpi=300,
            transparent=True,
            bbox_inches="tight",
        )

    return fig_plot
106 changes: 106 additions & 0 deletions
106
6_rentability_study/notebook/clustering_similarity_moving_average.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
* Name: clustering_similarity_moving_average | ||
* Description: Compute clustering similarity moving average with MACD (_moving average convergence divergence_). | ||
* Author: Erwan Schild | ||
* Created: 10/03/2023 | ||
* Licence: CeCILL (https://cecill.info/licences.fr.html) | ||
""" | ||
|
||
# ============================================================================== | ||
# IMPORT PYTHON DEPENDENCIES | ||
# ============================================================================== | ||
|
||
from typing import List, Dict, Optional, Tuple | ||
import json | ||
import pandas as pd | ||
from sklearn import metrics | ||
import numpy as np | ||
from scipy import stats as scipystats | ||
from matplotlib import pyplot as plt | ||
from matplotlib.figure import Figure | ||
|
||
# ============================================================================== | ||
# 1. COMPUTE CLUSTERING SIMILARITY MOVING AVERAGE | ||
# ============================================================================== | ||
def compute_clustering_similarity_moving_average(
    dict_of_clustering_results: Dict[str, Dict[str, int]],
    short_average: int = 3,
    long_average: int = 5,
) -> Dict[str, Dict[str, float]]:
    """
    Compute the v-measure between consecutive clusterings and its moving averages.

    Each clustering is compared to the clustering of the previous iteration
    (in the order of the dictionary). Two exponentially weighted moving averages
    of these v-measures are then computed, and their difference gives the MACD
    (_moving average convergence divergence_), used to estimate when the
    clustering converges.

    Args:
        dict_of_clustering_results (Dict[str, Dict[str, int]]): The clustering result (cluster ID of each text ID) of each iteration.
        short_average (int): The span of the short moving average of v-measure. Defaults to `3`.
        long_average (int): The span of the long moving average of v-measure. Defaults to `5`.

    Raises:
        ValueError: If two consecutive clusterings are not defined on the same text IDs.

    Returns:
        Dict[str, Dict[str, float]]: For each of `"vmeasure"`, `"short_average"`, `"long_average"` and `"MACD"`, the value at each compared iteration.
    """

    # Compute the v-measure between each clustering and the previous one.
    vmeasures_evolution: Dict[str, float] = {}
    previous_iteration: Optional[str] = None
    for iteration, current_clustering in dict_of_clustering_results.items():
        # Iteration "0000", or a first iteration without predecessor, has nothing to be compared to.
        if iteration != "0000" and previous_iteration is not None:
            previous_clustering: Dict[str, int] = dict_of_clustering_results[previous_iteration]
            if previous_clustering.keys() != current_clustering.keys():
                raise ValueError(
                    "Clusterings of iterations `{0}` and `{1}` are not defined on the same texts.".format(
                        previous_iteration, iteration
                    )
                )
            # Use one shared list of text IDs: the labels are compared position by position,
            # so both label lists must follow the same order whatever the order of each dictionary.
            text_ids: List[str] = list(previous_clustering)
            vmeasures_evolution[iteration] = metrics.v_measure_score(
                labels_true=[str(previous_clustering[text_id]) for text_id in text_ids],
                labels_pred=[str(current_clustering[text_id]) for text_id in text_ids],
            )
        # Update temporary variables.
        previous_iteration = iteration

    # Compute moving averages (explicit float dtype keeps the empty case numeric).
    vmeasure_series = pd.Series(vmeasures_evolution, dtype=float)
    short_series = vmeasure_series.ewm(span=short_average).mean()
    long_series = vmeasure_series.ewm(span=long_average).mean()
    macd_series = short_series - long_series

    # Return results.
    return {
        "vmeasure": vmeasures_evolution,
        "short_average": short_series.to_dict(),
        "long_average": long_series.to_dict(),
        "MACD": macd_series.to_dict(),
    }
|
||
# ============================================================================== | ||
# 2. DISPLAY CLUSTERING SIMILARITY MOVING AVERAGE | ||
# ============================================================================== | ||
def display_clustering_similarity_moving_average(
    implementation: str,
    list_of_experiments: List[str],
    list_of_iterations: Optional[List[str]] = None,
    plot_label: str = "MACD.",
    plot_color: str = "black",
    graph_filename: str = "MACD.png",
) -> Figure:
    """
    Display clustering similarity moving average per iteration.

    NB: this function is not implemented yet. All arguments are currently
    ignored and `None` is returned instead of a figure.

    Args:
        implementation (str): The folder of the experiments to display.
        list_of_experiments (List[str]): The list of files that represent experiments to analyze.
        list_of_iterations (Optional[List[str]]): The list of iterations used for display. Defaults to `None`.
        plot_label (str): The label of the plot. Defaults to `"MACD."`.
        plot_color (str): The color of plot. Defaults to `"black"`.
        graph_filename (str): The graph filename. Defaults to `"MACD.png"`.

    Returns:
        Figure: Figure of clustering similarity moving average evolution (currently always `None`).
    """
    return None
    # TODO: Compute mean.
    # TODO: Display short average.
    # TODO: Display long average.
    # TODO: Display average v-measure with groundtruth.
    # TODO: Display average v-measure between clusterings.
Oops, something went wrong.