# ==== INTERACTIVE CLUSTERING : BUSINESS RELEVANCE STUDY ====
> ### [DRAFT] FMC comparison sandbox

------------------------------
## READ-ME BEFORE RUNNING

### Quick Description

TODO

### Description of each step

TODO

------------------------------
## 1. IMPORT PYTHON DEPENDENCIES

In [None]:
from typing import Any, Dict, List, Optional, Tuple
from scipy.sparse import csr_matrix
from cognitivefactory.features_maximization_metric.fmc import FeaturesMaximizationMetric
import pandas as pd
import numpy as np
import math

------------------------------
## 2. DEFINE DATA

In [None]:
# First toy modelization: 6 samples described by 3 features, labelled with 2 classes.
fmc_computer_1: FeaturesMaximizationMetric = FeaturesMaximizationMetric(
    # One row per sample, one column per feature (same order as `list_of_possible_features`).
    data_vectors=csr_matrix(
        [
            [9, 5, 5], [9, 10, 5], [9, 20, 6],  # samples labelled "Man"
            [5, 15, 5], [6, 25, 6], [5, 25, 5],  # samples labelled "Woman"
        ]
    ),
    data_classes=["Man"] * 3 + ["Woman"] * 3,
    list_of_possible_features=["Shoes size", "Hair size", "Nose size"],
    amplification_factor=1,
)
# Last expression of the cell: show the features activation as a table.
pd.DataFrame(fmc_computer_1.features_activation)

In [None]:
# Second toy modelization: the same 6 samples as the first one, plus 3 samples labelled "??".
fmc_computer_2: FeaturesMaximizationMetric = FeaturesMaximizationMetric(
    # One row per sample, one column per feature (same order as `list_of_possible_features`).
    data_vectors=csr_matrix(
        [
            [9, 5, 5], [9, 10, 5], [9, 20, 6],  # samples labelled "Man"
            [5, 15, 5], [6, 25, 6], [5, 25, 5],  # samples labelled "Woman"
            [5, 15, 9], [6, 10, 8], [9, 20, 8],  # samples labelled "??"
        ]
    ),
    data_classes=["Man"] * 3 + ["Woman"] * 3 + ["??"] * 3,
    list_of_possible_features=["Shoes size", "Hair size", "Nose size"],
    amplification_factor=1,
)
# Last expression of the cell: show the features activation as a table.
pd.DataFrame(fmc_computer_2.features_activation)

------------------------------
## 3. DRAFT OF METRICS

-----
### 3.A. Test with activation probability

In [None]:
def compare_fmc_modelization_v1(
    fmc_computed: FeaturesMaximizationMetric,
    fmc_reference: FeaturesMaximizationMetric,
) -> Tuple[
    float,
    float,
    float,
    Dict[str, Dict[str, float]],
    Dict[str, Dict[str, float]],
    Dict[str, Dict[str, float]],
]:
    """
    Gives a similarity score in agreement with a reference FMC modelization.
    Data classes can be different, but vector features must be similar.

    A feature is considered "exclusive" to a class when
    `get_most_activated_classes_by_a_feature` returns exactly `[that class]` for it.
    The activation probability of a source class on a target class is the share of
    F-measure (taken in the target modelization) of the features exclusive to the target
    class that are also exclusive to the source class in the source modelization.
    It is `0.0` when the target class has no exclusive feature.

    Args:
        fmc_computed (FeaturesMaximizationMetric): Computed Features Maximization modelization.
        fmc_reference (FeaturesMaximizationMetric): Reference Features Maximization modelization.

    Raises:
        ValueError: if `list_of_possible_features` are different.

    Returns:
        Tuple[
            float,
            float,
            float,
            Dict[str, Dict[str, float]],
            Dict[str, Dict[str, float]],
            Dict[str, Dict[str, float]]
        ]: Computation of features activation equivalence and modelization similarity, in this order:
            (1) similarity of computed on reference: sum of all "computed on reference"
                activation probabilities, divided by the number of reference classes ;
            (2) similarity of reference on computed: for each computed class, mean of its
                non-zero "reference on computed" activation probabilities (`0.0` if it has
                none), averaged over the computed classes ;
            (3) harmonic mean of (1) and (2) ;
            (4) activation probabilities of computed on reference ;
            (5) activation probabilities of reference on computed ;
            (6) harmonic mean of (4) and (5), cell by cell.
            The three dictionaries are indexed by `[classe_computed][classe_reference]`.
    """

    # Both modelizations have to describe data with the same features (order is irrelevant
    # here since every lookup is done by feature name).
    if set(fmc_computed.list_of_possible_features) != set(fmc_reference.list_of_possible_features):
        raise ValueError(
            "The list of features `list_of_possible_features` must be the same for both FMC modelizations."
        )

    def _safe_ratio(numerator: float, denominator: float) -> float:
        # Every ratio of this metric is defined as 0.0 when its denominator is null.
        return 0.0 if denominator == 0 else numerator / denominator

    def _harmonic_mean(first: float, second: float) -> float:
        # Harmonic mean of two scores, 0.0 when both are null.
        return _safe_ratio(2 * (first * second), first + second)

    ###
    ### Features activation Equivalence computations.
    ### (probability of activation of source on target)
    ###

    # The most activated classes of a feature do not depend on the compared classes:
    # compute them once per modelization instead of once per pair of classes.
    most_activated_classes_computed: Dict[str, List[str]] = {
        feature: fmc_computed.get_most_activated_classes_by_a_feature(feature=feature)
        for feature in fmc_computed.list_of_possible_features
    }
    most_activated_classes_reference: Dict[str, List[str]] = {
        feature: fmc_reference.get_most_activated_classes_by_a_feature(feature=feature)
        for feature in fmc_reference.list_of_possible_features
    }

    def _compute_activation_probability(
        most_activated_classes_source: Dict[str, List[str]],
        classe_source: str,
        fmc_target: FeaturesMaximizationMetric,
        most_activated_classes_target: Dict[str, List[str]],
        classe_target: str,
    ) -> float:

        numerator: float = 0.0
        denominator: float = 0.0

        for feature_target in fmc_target.list_of_possible_features:

            # Only keep features that are exclusive to the target class.
            # NOTE: a looser criterion explored in this draft was to keep any feature activated
            # by the class (`features_activation[feature][classe]`), exclusive or not.
            if most_activated_classes_target[feature_target] != [classe_target]:
                continue

            fmeasure: float = fmc_target.features_fmeasure[feature_target][classe_target]
            denominator += fmeasure

            # Count the feature when it is also exclusive to the source class.
            if most_activated_classes_source[feature_target] == [classe_source]:
                numerator += fmeasure

        return _safe_ratio(numerator, denominator)

    activation_probability_of_computed_on_reference: Dict[str, Dict[str, float]] = {
        classe_computed: {
            classe_reference: _compute_activation_probability(
                most_activated_classes_source=most_activated_classes_computed,
                classe_source=classe_computed,
                fmc_target=fmc_reference,
                most_activated_classes_target=most_activated_classes_reference,
                classe_target=classe_reference,
            )
            for classe_reference in fmc_reference.list_of_possible_classes
        }
        for classe_computed in fmc_computed.list_of_possible_classes
    }

    activation_probability_of_reference_on_computed: Dict[str, Dict[str, float]] = {
        classe_computed: {
            classe_reference: _compute_activation_probability(
                most_activated_classes_source=most_activated_classes_reference,
                classe_source=classe_reference,
                fmc_target=fmc_computed,
                most_activated_classes_target=most_activated_classes_computed,
                classe_target=classe_computed,
            )
            for classe_reference in fmc_reference.list_of_possible_classes
        }
        for classe_computed in fmc_computed.list_of_possible_classes
    }

    activation_probability_reciprocity: Dict[str, Dict[str, float]] = {
        classe_computed: {
            classe_reference: _harmonic_mean(
                activation_probability_of_computed_on_reference[classe_computed][classe_reference],
                activation_probability_of_reference_on_computed[classe_computed][classe_reference],
            )
            for classe_reference in fmc_reference.list_of_possible_classes
        }
        for classe_computed in fmc_computed.list_of_possible_classes
    }

    ###
    ### Modelization similarity computations.
    ### (average of probability of activation of source on target)
    ###

    # NOTE(review): unlike the other direction below, the inner sum over computed classes
    # is not normalized here - confirm this asymmetry is intended.
    similarity_of_computed_on_reference: float = _safe_ratio(
        sum(
            sum(
                activation_probability_of_computed_on_reference[classe_computed][classe_reference]
                for classe_computed in fmc_computed.list_of_possible_classes
            )
            for classe_reference in fmc_reference.list_of_possible_classes
        ),
        len(fmc_reference.list_of_possible_classes),
    )

    def _mean_of_non_zero_probabilities(classe_computed: str) -> float:
        # Mean of the non-zero probabilities of a computed class. A computed class with no
        # exclusive feature has only null probabilities: score it 0.0 instead of dividing by zero.
        probabilities: List[float] = [
            activation_probability_of_reference_on_computed[classe_computed][classe_reference]
            for classe_reference in fmc_reference.list_of_possible_classes
        ]
        return _safe_ratio(
            sum(probabilities),
            len([probability for probability in probabilities if probability != 0]),
        )

    similarity_of_reference_on_computed: float = _safe_ratio(
        sum(
            _mean_of_non_zero_probabilities(classe_computed)
            for classe_computed in fmc_computed.list_of_possible_classes
        ),
        len(fmc_computed.list_of_possible_classes),
    )

    similarity_reciprocity: float = _harmonic_mean(
        similarity_of_computed_on_reference,
        similarity_of_reference_on_computed,
    )

    return (
        similarity_of_computed_on_reference,
        similarity_of_reference_on_computed,
        similarity_reciprocity,
        activation_probability_of_computed_on_reference,
        activation_probability_of_reference_on_computed,
        activation_probability_reciprocity,
    )

In [None]:
# Score modelization 1 (as computed) against modelization 2 (as reference).
compare_fmc_modelization_v1(fmc_computed=fmc_computer_1, fmc_reference=fmc_computer_2)

In [None]:
# Sanity check: score modelization 1 against itself.
compare_fmc_modelization_v1(fmc_computed=fmc_computer_1, fmc_reference=fmc_computer_1)

-----
### 3.B. Test with presence and concentration of features activation

In [None]:
def compare_fmc_modelization_v2(
    fmc_computed: FeaturesMaximizationMetric,
    fmc_reference: FeaturesMaximizationMetric,
) -> Tuple[
    float,
    float,
]:
    """
    Measures how much of a reference FMC modelization is found in a computed one.

    (1) Presence = (
        Sum of FMC metric for features active in ref. and comp.
    )/(
        Sum of FMC metric for features active in ref.
    )
    where a feature is "active" in a modelization when
    `get_most_activated_classes_by_a_feature` returns exactly one of its classes for it,
    and the "FMC metric" is the F-measure of the feature for that class in the reference.
    Presence is `0.0` when no feature is active in the reference.
    (2) Concentration: not implemented yet in this draft.

    Args:
        fmc_computed (FeaturesMaximizationMetric): Computed Features Maximization modelization.
        fmc_reference (FeaturesMaximizationMetric): Reference Features Maximization modelization.

    Raises:
        ValueError: if `list_of_possible_features` are different.

    Returns:
        Tuple[
            float,
            float
        ]: The presence score, then the concentration score (always `0.0` for now, as placeholder).
    """

    # Both modelizations have to describe data with the same features (order is irrelevant
    # here since every lookup is done by feature name).
    if set(fmc_computed.list_of_possible_features) != set(fmc_reference.list_of_possible_features):
        raise ValueError(
            "The list of features `list_of_possible_features` must be the same for both FMC modelizations."
        )

    ###
    ### PRESENCE OF MODEL REFERENCE IN MODEL COMPUTED
    ###

    # The most activated classes of a feature do not depend on the class being looped on:
    # compute them once per feature instead of once per pair of classes.
    most_activated_classes_reference: Dict[str, List[str]] = {
        feature: fmc_reference.get_most_activated_classes_by_a_feature(feature=feature)
        for feature in fmc_reference.list_of_possible_features
    }

    # Features that are exclusive to exactly one class of the computed modelization.
    features_active_in_computed = set()
    for feature in fmc_reference.list_of_possible_features:
        most_activated_classes_computed = fmc_computed.get_most_activated_classes_by_a_feature(feature=feature)
        if any(
            most_activated_classes_computed == [classe_comp]
            for classe_comp in fmc_computed.list_of_possible_classes
        ):
            features_active_in_computed.add(feature)

    num: float = 0.0
    den: float = 0.0

    for classe_ref in fmc_reference.list_of_possible_classes:

        for feature_ref in fmc_reference.list_of_possible_features:

            # Only features exclusive to the current reference class contribute.
            if most_activated_classes_reference[feature_ref] != [classe_ref]:
                continue

            fmeasure: float = fmc_reference.features_fmeasure[feature_ref][classe_ref]

            # update DENOMINATOR
            den += fmeasure

            # update NUMERATOR
            if feature_ref in features_active_in_computed:
                num += fmeasure

    presence: float = 0.0 if den == 0 else num / den

    ###
    ### CONCENTRATION OF MODEL REFERENCE IN MODEL COMPUTED
    ###

    # TODO: concentration is not defined yet, return a placeholder.
    concentration: float = 0.0

    return presence, concentration

In [None]:
# Score modelization 1 (as computed) against modelization 2 (as reference).
compare_fmc_modelization_v2(fmc_computed=fmc_computer_1, fmc_reference=fmc_computer_2)

In [None]:
# Sanity check: score modelization 1 against itself.
compare_fmc_modelization_v2(fmc_computed=fmc_computer_1, fmc_reference=fmc_computer_1)

-----
### 3.C. Test with VMeasure and features activation as clustering results

Definition of `Entropy` :

$
H(C)
= - \sum _{C_i \subset C} \frac{|C_i|}{N} \log \frac{|C_i|}{N}
= - \sum _{C_i \subset C} \frac{|C_i|}{N} \left( \log |C_i| - \log N \right)
$

where :
- $C=\{C_i\}$ = the labeling formatted as a list of sets of data with the same label.
- $N$ = nb of data.

Definition of `Mutual Information` :

$
MI(C,K)
= \sum _{C_i \subset C} \sum _{K_j \subset K} \frac{|C_i \cap K_j|}{N} \log \frac{ N |C_i \cap K_j|}{|C_i||K_j|}
= \sum _{C_i \subset C} \sum _{K_j \subset K} \frac{|C_i \cap K_j|}{N} \left( \log |C_i \cap K_j| - \log |C_i| - \log|K_j| + \log N \right)
$

where :
- $C=\{C_i\}$ = the reference labeling formatted as a list of sets of data with the same label.
- $K=\{K_j\}$ = the clustering labeling formatted as a list of sets of data with the same label.
- $N$ = nb of data.

Definition of `V-Measure` :
    
$
Homogeneity(C,K) = \frac{MI(C,K)}{H(C)}
$
(or $1$ if $H(C) = 0$)

$
Completeness(C,K) = \frac{MI(C,K)}{H(K)}
$
(or $1$ if $H(K) = 0$)

$
VMeasure(C,K) = 2 \times \frac{Homogeneity(C,K) \times Completeness(C,K)}{Homogeneity(C,K) + Completeness(C,K)}
$

where :
- $C=\{C_i\}$ = the reference labeling formatted as a list of sets of data with the same label.
- $K=\{K_j\}$ = the clustering labeling formatted as a list of sets of data with the same label.

In [None]:
from sklearn.metrics.cluster import homogeneity_completeness_v_measure

In [None]:
def from_fmc_to_clustering(fmc_computer: "FeaturesMaximizationMetric") -> List[Any]:
    """
    Turns a FMC modelization into a labeling of its features, usable as a clustering result.

    Each feature is labelled with the class that activates it the most when
    `get_most_activated_classes_by_a_feature` returns exactly one class,
    and with `-1` otherwise (no class, or several classes).

    Args:
        fmc_computer (FeaturesMaximizationMetric): Features Maximization modelization to convert.

    Returns:
        List[Any]: One label per feature, in the order of `list_of_possible_features`.
    """

    def _label_of(feature: str) -> Any:
        most_activated_classes = fmc_computer.get_most_activated_classes_by_a_feature(
            feature=feature
        )
        # NOTE(review): every non-exclusive feature shares the single label `-1`, so they are
        # seen as one cluster by clustering metrics. The original draft incremented an unused
        # `outliers_index` counter here, which suggests one distinct label per outlier may have
        # been intended - confirm.
        return most_activated_classes[0] if len(most_activated_classes) == 1 else -1

    return [_label_of(feature) for feature in fmc_computer.list_of_possible_features]

In [None]:
# Homogeneity, completeness and V-measure between the feature labelings of both modelizations.
homogeneity_completeness_v_measure(
    labels_true=from_fmc_to_clustering(fmc_computer=fmc_computer_1),
    labels_pred=from_fmc_to_clustering(fmc_computer=fmc_computer_2),
)

$
C = \{
    F_{c}
    | c \in classes
\}
$

$
F_{c} = \{
    f_c | f_c \in features, selected(f_c), active(f_c, c), exclusive(f_c, c)
\}
$

$
P_{C}(c) =
$

- $
    \frac{
        |F_{c}|
    }{
        \sum_{x \in C}  |F_{x}|
    }
$

- $ \frac{
        \sum_{f_c \in F_{c}} 1 + FM(f_c)(c)
    }{
        \sum_{x \in C}  \sum_{f \in F_{x}} 1 + FM(f)(x)
    }
$

$
P_{C,K}(c,k) =
$

- $
    \frac{
        |F_{c} \cap F_{k}|
    }{
        |\cup _{x \in C \cup K} F_{x} |
    }
$
- $ \frac{
        \sum_{f_{ck} \in F_{c}, f_{ck} \in F_{k}} 1 + FM(f_{ck})(c) + FM(f_{ck})(k)
    }{
        \sum_{x \subset C}  \sum_{f \in F_{x}} ...
    }
$

$
H(C) = - \sum _{C_i \subset C} P_{C}(C_i) \log P_{C}(C_i)
$

$
MI(C,K) = \sum _{C_i \subset C} \sum _{K_j \subset K} P_{C,K}(C_i, K_j) \log \frac{ P_{C,K}(C_i, K_j) }{ P_{C}(C_i) P_{K}(K_j) }
$