# WNC Library Functionality Testing

In [4]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
examples = [
    "Sir Alex Ferguson is the greatest football manager of all time.",
    "Sir Alex Ferguson is a great football manager.",
    "Sir Alex Ferguson is a football manager.",
    "the news-press was sold by the new york times company in 2000, and is now independently owned by wendy p. mccaw, a local resident and a biased and eerily thick-headed woman .",
    "it also marked the last season in quarteback 's brett favre illustrious career as a packer .",
]

## `SubjectivityNeutralizer` Testing

In [20]:
from src.inference import SubjectivityNeutralizer

In [62]:
# Load the style-transfer model used for the `sn.transfer(...)` calls below.
# NOTE(review): this is an absolute, machine-specific path; the classifier cell
# further down uses a relative "../models/..." path instead and also rebinds
# MODEL_PATH, so after that cell runs this name no longer refers to this model.
MODEL_PATH = "/home/cdsw/models/bart-tst-full"
sn = SubjectivityNeutralizer(model_identifier=MODEL_PATH)

In [63]:
examples

['Sir Alex Ferguson is the greatest football manager of all time.',
 'Sir Alex Ferguson is a great football manager.',
 'Sir Alex Ferguson is a football manager.',
 'the news-press was sold by the new york times company in 2000, and is now independently owned by wendy p. mccaw, a local resident and a biased and eerily thick-headed woman .',
 "it also marked the last season in quarteback 's brett favre illustrious career as a packer ."]

In [64]:
sn.transfer(examples)

['Sir Alex Ferguson is one of the greatest football managers of all time.',
 'Sir Alex Ferguson is a football manager.',
 'Sir Alex Ferguson is a football manager.',
 'the news-press was sold by the new york times company in 2000, and is now independently owned by wendy p. mccaw, a local resident.',
 "it also marked the last season in quarteback 's career."]

## `StyleIntensityClassifier` Testing

In [23]:
from src.inference import StyleIntensityClassifier

In [24]:
MODEL_PATH = "../models/bert-cls-full3/checkpoint-96000"
sc = StyleIntensityClassifier(model_identifier=MODEL_PATH)

In [25]:
sc.score(examples)

[{'label': 'LABEL_0',
  'score': 0.9861143827438354,
  'distribution': [0.9861143827438354, 0.013885647989809513]},
 {'label': 'LABEL_0',
  'score': 0.9860873818397522,
  'distribution': [0.9860873818397522, 0.013912606984376907]},
 {'label': 'LABEL_1',
  'score': 0.9913819432258606,
  'distribution': [0.008618118241429329, 0.9913819432258606]},
 {'label': 'LABEL_0',
  'score': 0.9853444695472717,
  'distribution': [0.9853444695472717, 0.014655554667115211]},
 {'label': 'LABEL_0',
  'score': 0.9856260418891907,
  'distribution': [0.9856260418891907, 0.014373908750712872]}]

### Calculate Style Transfer Intensity

In [180]:
# transfer style from examples
input_text = examples[:2]
output_text = sn.transfer(examples[:2])

text_pairs = {"input_text": input_text, "output_text": output_text}
text_pairs

{'input_text': ['Sir Alex Ferguson is the greatest football manager of all time.',
  'Sir Alex Ferguson is a great football manager.'],
 'output_text': ['Sir Alex Ferguson is one of the greatest football managers of all time.',
  'Sir Alex Ferguson is a football manager.']}

In [182]:
sc.calculate_transfer_intensity(**text_pairs)



[0.0398, 0.9775]

#### Build Intensity Metric

In [73]:
input_text = examples[:2]
output_text = sn.transfer(examples[:2])

In [76]:
input_text

['Sir Alex Ferguson is the greatest football manager of all time.',
 'Sir Alex Ferguson is a great football manager.']

In [77]:
output_text

['Sir Alex Ferguson is one of the greatest football managers of all time.',
 'Sir Alex Ferguson is a football manager.']

In [85]:
text_pairs = {"input_text": input_text, "output_text": output_text}

In [176]:
from typing import List


def calculate_transfer_intensity(
    input_text: List[str], output_text: List[str], target_class_idx: int = 1
):
    """
    Score how strongly each text moved toward the target style class.

    Both lists are scored with the module-level classifier `sc`; the class
    distribution of every input is then compared with the distribution of the
    output at the same position via `calculate_emd`.

    Args:
        input_text (list) - input texts; item i is the counterpart of
            output_text[i]
        output_text (list) - output texts; item i is the counterpart of
            input_text[i]
        target_class_idx (int) - index of the target style class, forwarded
            unchanged to `calculate_emd`

    Returns:
        list - one `calculate_emd` result per (input, output) pair, in the
            same order as the inputs

    Raises:
        ValueError - if the two lists do not have the same length

    """

    if len(input_text) != len(output_text):
        raise ValueError(
            "input_text and output_text must be of same length with corresponding items"
        )

    before = [scored["distribution"] for scored in sc.score(input_text)]
    after = [scored["distribution"] for scored in sc.score(output_text)]

    intensities = []
    for position, before_dist in enumerate(before):
        # Pair by position so item i of the input is compared with item i of the output.
        intensities.append(
            calculate_emd(before_dist, after[position], target_class_idx)
        )
    return intensities

In [177]:
calculate_transfer_intensity(**text_pairs)

[0.0398, 0.9775]

In [133]:
input_text[1]

'Sir Alex Ferguson is a great football manager.'

In [134]:
output_text[1]

'Sir Alex Ferguson is a football manager.'

In [146]:
input_dist = sc.score(input_text[1])[0]["distribution"]

In [147]:
output_dist = sc.score(output_text[1])[0]["distribution"]

In [148]:
input_dist

[0.9860873818397522, 0.013912606984376907]

In [150]:
output_dist

[0.008618118241429329, 0.9913819432258606]

In [163]:
# NOTE(review): input_dist and output_dist are plain Python lists here, so
# `* 100` repeats each list 100 times (200 items) instead of multiplying the
# probabilities by 100. If scaling was intended, multiply element-wise.
calculate_emd(input_dist * 100, output_dist * 100, 1)

0.0053

In [162]:
calculate_emd(output_dist, input_dist, 1)

-0.0053

In [158]:
from pyemd import emd, emd_samples

In [160]:
emd_samples(input_dist, output_dist)

0.0

In [161]:
input_dist, output_dist

([0.9860873818397522, 0.013912606984376907],
 [0.008618118241429329, 0.9913819432258606])

In [155]:
# Build an N x N ground-distance matrix filled with ones and compute the EMD
# between the two class distributions treated as histograms.
# NOTE(review): `np` must already be in the kernel namespace; no cell above
# this one imports numpy.
N = len(input_dist)
distance_matrix = np.ones((N, N))
emd(np.array(input_dist), np.array(output_dist), distance_matrix)

In [157]:
emd(np.array(input_dist), np.array(output_dist), distance_matrix)

0.9774691327255313

In [151]:
wasserstein_distance(input_dist, output_dist)

0.0052945250645279884

In [170]:
import numpy as np
from pyemd import emd
from scipy.stats import wasserstein_distance


def calculate_emd(input_dist, output_dist, target_class_idx):
    """
    Calculate the direction-corrected Earth Mover's Distance (EMD) between two
    style-class distributions of equal length.

    The raw distance comes from `pyemd.emd`, using a ground-distance matrix
    filled with ones. The result is made negative when the probability of the
    target class is lower in the output than in the input, i.e. the output text
    moved further away from the target style.

    Note: `scipy.stats.wasserstein_distance(input_dist, output_dist)` is not a
    drop-in replacement. Called that way it treats the two lists as observed
    sample values rather than histogram weights, so it returns a different number.

    Ref: https://github.com/passeul/style-transfer-model-evaluation/blob/master/code/style_transfer_intensity.py

    Args:
        input_dist (list) - probabilities assigned to the style classes
            from the input text to the style transfer model
        output_dist (list) - probabilities assigned to the style classes
            from the output text of the style transfer model
        target_class_idx (int) - index, in both distributions, of the class
            that represents the target style

    Returns:
        emd (float) - EMD between the two distributions, rounded to 4 decimal
            places; negative when the target-class probability decreased

    Raises:
        ValueError - if the two distributions do not have the same length

    """

    # Fail fast with a clear message instead of relying on pyemd's own check.
    if len(input_dist) != len(output_dist):
        raise ValueError("input_dist and output_dist must be of same length")

    # pyemd only accepts float64 histograms; coerce so int or float32 inputs work too.
    input_hist = np.asarray(input_dist, dtype=np.float64)
    output_hist = np.asarray(output_dist, dtype=np.float64)

    n_classes = len(input_hist)
    distance_matrix = np.ones((n_classes, n_classes))
    dist = emd(input_hist, output_hist, distance_matrix)

    # +1 when the target class gained (or kept) probability mass, -1 when it lost mass.
    transfer_direction_correction = (
        1 if output_hist[target_class_idx] >= input_hist[target_class_idx] else -1
    )

    return round(dist * transfer_direction_correction, 4)

In [168]:
wasserstein_distance(input_dist, output_dist)

0.0052945250645279884

In [169]:
calculate_emd(input_dist, output_dist, target_class_idx=1)

0.9775