# ==== INTERACTIVE CLUSTERING : ANNOTATION TIME STUDY ====
> ### Stage 1 : Model annotation time with the Interactive Clustering methodology and plot some figures.

-----

## READ-ME BEFORE RUNNING

### Quick Description

This notebook **aims to model the annotation time observed in interactive clustering experiments**.
- Environments are represented by subdirectories in the `/experiments` folder.
- Each subdirectory of the `/experiments` folder represents an annotation experiment with several annotators.

### Description of each step

First of all, **load the experiment synthesis XLSX file** that was produced during the annotation experiment.
- It contains sessions of annotation for each annotator.
- Each session contains the number of constraints annotated and the time needed for it.

Then, several analyses are performed:
1. Check the hypotheses required for parametric modeling
2. Model annotation time as a function of the number of constraints
3. Model annotation speed as a function of the session number

-----

## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import openpyxl
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.cm as cm
from matplotlib.colors import Normalize
from scipy import stats as scipystats
import statistics
import statsmodels
import statsmodels.api
import statsmodels.formula.api

-----

## 2. LOAD DATA

### 2.1. Load data from XLSX file.

In [None]:
# Read the "time" sheet of the experiment synthesis workbook into a DataFrame.
df_annotation_time: pd.DataFrame = pd.read_excel(
    engine="openpyxl",
    sheet_name="time",
    io="../experiments/mlsum_fr_train_subset_v1.0.0.schild/results.xlsx",
)
# Disabled conversion of comma-decimal rate columns to float, kept for reference.
# NOTE(review): `Series.replace(",", ".")` replaces whole cell values equal to ",",
# not substrings; if this is ever re-enabled, `.str.replace` is probably what is meant.
#df_annotation_time["CONSTRAINTS_PER_MINUTE"] = df_annotation_time["CONSTRAINTS_PER_MINUTE"].replace(",", ".").astype(float)
#df_annotation_time["CONSTRAINTS_PER_HOUR"] = df_annotation_time["CONSTRAINTS_PER_HOUR"].replace(",", ".").astype(float)
#df_annotation_time["SECONDS_PER_CONSTRAINT"] = df_annotation_time["SECONDS_PER_CONSTRAINT"].replace(",", ".").astype(float)
# Display the first rows as the cell output.
df_annotation_time.head()

In [None]:
# Summarize the CONSTRAINTS_NUMBER column for the rows of experiment 1.
constraints_number_exp1 = df_annotation_time[df_annotation_time["EXPERIMENT_ID"] == 1]["CONSTRAINTS_NUMBER"]
print("Constraints number: mean={0:.2f}, median={1:.2f}, min={2:.2f}, max={3:.2f}, sigma={4:.2f}".format(
    # Statistics are applied in the order expected by the format placeholders.
    *(statistic(constraints_number_exp1) for statistic in (np.mean, np.median, min, max, np.std))
))

In [None]:
# Summarize the NEEDED_SECONDS column for the rows of experiment 1.
# Every statistic is divided by 60, so the printed values are minutes; the label
# says so explicitly (it previously read "Needed seconds").
needed_seconds_exp1 = df_annotation_time[df_annotation_time["EXPERIMENT_ID"] == 1]["NEEDED_SECONDS"]
print("Needed minutes: mean={0:.2f}, min={1:.2f}, max={2:.2f}, sigma={3:.2f}".format(
    # Statistics are applied in the order expected by the format placeholders.
    *(statistic(needed_seconds_exp1) / 60 for statistic in (np.mean, min, max, np.std))
))

In [None]:
# Summarize the SESSION_ID column for the rows of experiment 1.
session_ids_exp1 = df_annotation_time[df_annotation_time["EXPERIMENT_ID"] == 1]["SESSION_ID"]
print("Session number: mean={0:.2f}, min={1:.2f}, max={2:.2f}, sigma={3:.2f}".format(
    # Statistics are applied in the order expected by the format placeholders.
    *(statistic(session_ids_exp1) for statistic in (np.mean, min, max, np.std))
))

In [None]:
# Summarize the CONSTRAINTS_PER_MINUTE column for the rows of experiment 1.
annotation_speed_exp1 = df_annotation_time[df_annotation_time["EXPERIMENT_ID"] == 1]["CONSTRAINTS_PER_MINUTE"]
print("Annotation speed: mean={0:.2f}, min={1:.2f}, max={2:.2f}, sigma={3:.2f}".format(
    # Statistics are applied in the order expected by the format placeholders.
    *(statistic(annotation_speed_exp1) for statistic in (np.mean, min, max, np.std))
))

### 2.2. Check hypotheses to run parametric modelization

The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution.

In [None]:
# Run the Shapiro-Wilk test on the annotation speed column and display its p-value.
# NOTE(review): unlike the other cells, no EXPERIMENT_ID filter is applied here — confirm this is intended.
shapiro_result_annotation_speed = scipystats.shapiro(x=df_annotation_time["CONSTRAINTS_PER_MINUTE"])
shapiro_result_annotation_speed.pvalue
# 4.17e-05 => "CONSTRAINTS_PER_MINUTE" wasn't drawn from a normal distribution.

The Kolmogorov-Smirnov test tests the null hypothesis that the data was drawn from a given distribution (here: a normal distribution).

In [None]:
# Kolmogorov-Smirnov test of the annotation speed against a normal distribution
# whose location and scale are taken from the sample itself.
# Without `args`, `scipystats.norm.cdf` is the standard normal N(0, 1): the test
# would then reject any sample whose mean/scale differ from 0/1, whatever its shape.
# NOTE(review): estimating the parameters from the tested sample makes the p-value
# approximate (too lenient); a Lilliefors-corrected test would be stricter.
annotation_speed_sample = df_annotation_time["CONSTRAINTS_PER_MINUTE"]
scipystats.kstest(
    rvs=annotation_speed_sample,
    cdf=scipystats.norm.cdf,
    args=(
        np.mean(annotation_speed_sample),  # loc
        np.std(annotation_speed_sample, ddof=1),  # scale (sample standard deviation)
    ),
).pvalue
# NOTE(review): the previously recorded p-value (2.71e-251) came from the comparison
# against N(0, 1); re-run this cell to read the p-value of the corrected test.

> Conclusion: A non-parametric model is needed.

-----

## 3. ANALYZE DATA

### 3.1. Analyze annotation time per constraint

In [None]:
# Fit the model to the data and print results.
# Regress the annotation time on the number of constraints (experiment 1 only).
# No `family` is given, so the statsmodels default family is used.
annotation_time_rows = df_annotation_time[df_annotation_time["EXPERIMENT_ID"] == 1]
model_annotation_time = statsmodels.formula.api.glm(
    data=annotation_time_rows,
    formula="NEEDED_SECONDS ~ 1 + CONSTRAINTS_NUMBER",
)

# Estimate the coefficients, then show the full regression report.
results_annotation_time = model_annotation_time.fit()
print(results_annotation_time.summary())

In [None]:
# Print the modelization.
# Print the fitted equation: NEEDED_SECONDS ~ intercept +/- slope*CONSTRAINTS_NUMBER.
# The operator between the two terms is printed explicitly (it was missing before),
# using the sign of the slope so that a negative slope reads "- |slope|*...".
annotation_time_intercept = results_annotation_time.params["Intercept"]
annotation_time_slope = results_annotation_time.params["CONSTRAINTS_NUMBER"]
print(
    "NEEDED_SECONDS ~",
    "{0:.2E}".format(annotation_time_intercept),
    "-" if annotation_time_slope < 0 else "+",
    "{0:.2E}*{1}".format(abs(annotation_time_slope), "CONSTRAINTS_NUMBER"),
)

In [None]:
# Define the interpolation function.
def interpolation_annotation_time(constraints_number) -> Tuple[float, float, float]:
    """Evaluate the fitted annotation-time model for a number of constraints.

    Reads the coefficients (`params`) and their standard errors (`bse`) from the
    notebook-level `results_annotation_time` object.

    Args:
        constraints_number: Number of constraints at which the model is evaluated.

    Returns:
        A `(low, central, high)` tuple. `central` uses the fitted coefficients;
        `low` and `high` use each coefficient shifted by minus / plus one
        standard error.
    """
    # Fitted coefficients and their standard errors.
    intercept = results_annotation_time.params["Intercept"]
    intercept_error = results_annotation_time.bse["Intercept"]
    slope = results_annotation_time.params["CONSTRAINTS_NUMBER"]
    slope_error = results_annotation_time.bse["CONSTRAINTS_NUMBER"]

    # Same line evaluated with the lowered, fitted and raised coefficients.
    low = (intercept - intercept_error) + (slope - slope_error) * constraints_number
    central = intercept + slope * constraints_number
    high = (intercept + intercept_error) + (slope + slope_error) * constraints_number
    return low, central, high

In [None]:
# Create a new figure.
# Build the figure and grab its axes.
fig_plot_annotation_time: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_annotation_time = fig_plot_annotation_time.gca()

# Fix the displayed window of both axes.
axis_plot_annotation_time.set_xlim(xmin=0, xmax=575)
axis_plot_annotation_time.set_ylim(ymin=0, ymax=100)

# Observed annotation time (converted to minutes) for experiment 1, as red crosses.
observed_rows_exp1 = df_annotation_time[df_annotation_time["EXPERIMENT_ID"] == 1]
axis_plot_annotation_time.plot(
    observed_rows_exp1["CONSTRAINTS_NUMBER"],  # x
    observed_rows_exp1["NEEDED_SECONDS"] / 60,  # y
    label="Temps d'annotation observé",
    marker="x",
    markerfacecolor="red",
    markersize=5,
    color="red",
    linewidth=0,
    linestyle="",
)

# Evaluate the model once per abscissa and split the (low, central, high) triples,
# converting seconds to minutes.
model_abscissas = range(0, 550, 10)
model_minutes = [
    tuple(seconds / 60 for seconds in interpolation_annotation_time(abscissa))
    for abscissa in model_abscissas
]
model_low = [low for low, _, _ in model_minutes]
model_central = [central for _, central, _ in model_minutes]
model_high = [high for _, _, high in model_minutes]

# Modelled annotation time: dashed red line for the central estimate.
axis_plot_annotation_time.plot(
    model_abscissas,  # x
    model_central,  # y
    label="Temps d'annotation modélisé",
    marker="",
    markerfacecolor="red",
    markersize=3,
    color="red",
    linewidth=2,
    linestyle="--",
)
# Shaded band between the low and high estimates.
axis_plot_annotation_time.fill_between(
    x=model_abscissas,  # x
    y1=model_low,  # y1
    y2=model_high,  # y2
    color="red",
    alpha=0.2,
)

# Axis titles.
axis_plot_annotation_time.set_xlabel("nombre de contraintes [#]", fontsize=18)
axis_plot_annotation_time.set_ylabel("temps d'annotation [m]", fontsize=18)

# Legend and grid.
axis_plot_annotation_time.legend(loc="upper left", fontsize=15)
axis_plot_annotation_time.grid(True)

# Save the figure to disk.
fig_plot_annotation_time.savefig(
    "../results/etude-temps-annotation-1-modelisation-temps.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

### 3.2. Modelize annotation speed per session

In [None]:
# Fit the model to the data and print results.
# Fit annotation speed against the session number with a random intercept per annotator.
#
# The previous formula contained the term `(1|ANNOTATOR_ID)`. That is lme4-style
# random-effect syntax, which the formula parser used by statsmodels does not
# support: `|` is evaluated as Python's bitwise OR between 1 and the annotator ids,
# producing a meaningless numeric regressor. A linear mixed model with
# `groups=ANNOTATOR_ID` expresses the intended random intercept; it is a Gaussian
# model with identity link, like the GLM it replaces.
# NOTE(review): the commented-out log-link alternative of the former GLM has no
# direct equivalent here; transform the response explicitly if it is needed.
annotation_speed_rows = df_annotation_time[df_annotation_time["EXPERIMENT_ID"] == 1]
model_annotation_speed = statsmodels.formula.api.mixedlm(
    formula="CONSTRAINTS_PER_MINUTE ~ 1 + SESSION_ID",
    data=annotation_speed_rows,
    groups=annotation_speed_rows["ANNOTATOR_ID"],  # one random intercept per annotator
)

# Estimate the model, then show the full report (fixed effects and group variance).
results_annotation_speed = model_annotation_speed.fit()
print(results_annotation_speed.summary())

> Conclusion: The inter-annotator variance is too high, so no conclusion can be drawn about the effect of the session id.

### 3.3. Case study of some annotators

> Specific study of annotators `3`,`7`,`9` ; `1`,`5`.

In [None]:
# Create a new figure.
# Build the figure and grab its axes.
fig_plot_annotator_speed_study: Figure = plt.figure(figsize=(15, 7.5), dpi=300)
axis_plot_annotator_speed_study = fig_plot_annotator_speed_study.gca()

# Fix the displayed window of both axes.
axis_plot_annotator_speed_study.set_xlim(xmin=0.75, xmax=9.25)
axis_plot_annotator_speed_study.set_ylim(ymin=0, ymax=15)

# One colour and one marker per studied annotator, in the same order as the ids below.
colors = [
    "orange", "red",
    "blue", "purple",
]
markers = [
    ">", ">",
    "^", "^",
]
studied_annotator_ids = [
    1, 5,  # constant slope
    7, 9,  # increasing slope
]

# Draw the annotation speed per session of each studied annotator (experiment 1 only).
for annotator_id, annotator_color, annotator_marker in zip(studied_annotator_ids, colors, markers):
    annotator_rows = df_annotation_time[
        (df_annotation_time["ANNOTATOR_ID"] == annotator_id)
        & (df_annotation_time["EXPERIMENT_ID"] == 1)
    ]
    axis_plot_annotator_speed_study.plot(
        annotator_rows["SESSION_ID"],  # x
        annotator_rows["CONSTRAINTS_PER_MINUTE"],  # y
        label="Vitesse d'annotation observée pour l'annotateur " + str(annotator_id),
        marker=annotator_marker,
        markerfacecolor=annotator_color,
        markersize=3,
        color=annotator_color,
        linewidth=1,
        linestyle="-",
    )

# Axis titles.
axis_plot_annotator_speed_study.set_xlabel("session d'annotation [#]", fontsize=18)
axis_plot_annotator_speed_study.set_ylabel("vitesse d'annotation [#/m]", fontsize=18)

# Legend and grid.
axis_plot_annotator_speed_study.legend(loc="lower right", fontsize=15)
axis_plot_annotator_speed_study.grid(True)

# Save the figure to disk.
fig_plot_annotator_speed_study.savefig(
    "../results/etude-temps-annotation-3-etude-de-cas.png",
    dpi=300,
    transparent=True,
    bbox_inches="tight",
)

----
## Discussion

1. hypothèse temps annotation est linéaire
    - OK: afficher temps/constraint

2. hypothèse vitesse augmente en fonction du nombre de session
    - KO: variation inter-annotateur trop forte
    - Stats descriptives
    - Discussion de quelques cas : un qui augmente, un qui stagne ?