# Interactive Tool For ArguMentor


## Setup

*Note*: The model files are pretty large, so downloading and extracting them usually takes a few minutes.

In [None]:
!pip install transformers sentencepiece



In [None]:
# Download the model archive and save it as model.tar.xz (-O sets the output file name).
!wget -O model.tar.xz https://uofi.box.com/shared/static/r0uerujvsq2z25odnmtga5gcdlrs1pd4

--2023-12-15 18:56:17--  https://uofi.box.com/shared/static/r0uerujvsq2z25odnmtga5gcdlrs1pd4
Resolving uofi.box.com (uofi.box.com)... 74.112.186.144
Connecting to uofi.box.com (uofi.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/r0uerujvsq2z25odnmtga5gcdlrs1pd4 [following]
--2023-12-15 18:56:17--  https://uofi.box.com/public/static/r0uerujvsq2z25odnmtga5gcdlrs1pd4
Reusing existing connection to uofi.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://uofi.app.box.com/public/static/r0uerujvsq2z25odnmtga5gcdlrs1pd4 [following]
--2023-12-15 18:56:17--  https://uofi.app.box.com/public/static/r0uerujvsq2z25odnmtga5gcdlrs1pd4
Resolving uofi.app.box.com (uofi.app.box.com)... 74.112.186.144
Connecting to uofi.app.box.com (uofi.app.box.com)|74.112.186.144|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://public.boxcloud.com/d/1/b1!O9LrI

In [None]:
# Unpack the downloaded archive into the current directory (x = extract, f = archive file).
# The model is later loaded from ./model/, so the archive presumably contains that folder — TODO confirm.
!tar xf model.tar.xz

## Import and Model Loading

In [None]:
from IPython.display import HTML
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

In [None]:
# Runtime settings shared by the cells below.
config = {
    'model_name': 'google/bigbird-roberta-base',  # Tokenizer checkpoint, from Hugging Face's Model Hub.
    'model_save_path': './model/',                # Local directory the token-classification model is loaded from.
    'max_length': 1024,                           # Token budget per essay (padding/truncation length in infer()).
}

# Prefer CUDA, then Apple's MPS backend, then fall back to the CPU.
# `torch.backends.mps` is missing on older PyTorch builds, so it is looked up
# defensively instead of letting the attribute access raise AttributeError.
if torch.cuda.is_available():
    config['device'] = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    config['device'] = 'mps'
else:
    config['device'] = 'cpu'
print(f"Inferring on {config['device']}")

# The tokenizer comes from the base checkpoint; the model weights come from the local directory.
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
model = AutoModelForTokenClassification.from_pretrained(config['model_save_path']).to(config['device'])

Infering on cpu


In [None]:
def infer(essay):
    """Predict one label id per whitespace-separated word of ``essay``.

    The essay is split on whitespace, tokenized as pre-split words, padded or
    truncated to ``config['max_length']`` tokens and run through the global
    ``model``. Each word receives the prediction of its first sub-token.

    Args:
        essay (str): Raw essay text.

    Returns:
        list[int]: Predicted label id for each word, in order. Words whose tokens
        were cut off by truncation get no entry, so the list can be shorter than
        ``essay.split()``. An essay without any words yields an empty list.
    """
    words = essay.split()
    if not words:
        # Nothing to label; skip the (expensive) forward pass entirely.
        return []

    # Makes a dict with keys: input_ids, attention_mask.
    encoding = tokenizer(words,
                         is_split_into_words=True,  # Necessary to keep the correspondence between words and tokens.
                         padding='max_length',
                         truncation=True,
                         max_length=config['max_length'])

    # From the tokenizer docs about word_ids():
    #     A list indicating the word corresponding to each token. Special tokens added by
    #     the tokenizer are mapped to None and other tokens are mapped to the index of their
    #     corresponding word (several tokens share a word index if they are parts of that word).
    # This is what lets token-level predictions be mapped back onto the original words,
    # which may not have a 1:1 correspondence with the tokens.
    word_ids = encoding.word_ids()

    # Add a leading batch dimension of size 1 and move the inputs to the target device.
    input_ids = torch.as_tensor(encoding['input_ids']).unsqueeze(0).to(config['device'])
    attention_mask = torch.as_tensor(encoding['attention_mask']).unsqueeze(0).to(config['device'])

    model.eval()
    # Inference only: disabling autograd avoids building a graph for the forward pass.
    with torch.no_grad():
        output_dict = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            return_dict=True)

    # One predicted label id per token, pulled back to plain Python ints in a single transfer.
    token_predictions = torch.argmax(output_dict['logits'].view(-1, model.num_labels), dim=-1).tolist()

    words_predictions = []
    prev_word_idx = None
    for token_idx, word_idx in enumerate(word_ids):
        # Skip special/padding tokens (None) and the non-first sub-tokens of a word.
        if word_idx is None or word_idx == prev_word_idx:
            continue
        prev_word_idx = word_idx
        words_predictions.append(token_predictions[token_idx])

    return words_predictions

## Helper Functions

In [None]:
# Highlight colour (CSS rgba) for each segment-type id.
# Id 0 is fully transparent (alpha 0.0), so words of that type are left unhighlighted.
segment_colors = {
  0: 'rgba(0,0,0, 0.0)',
  1: 'rgba(206,95,20, 0.8)',
  2: 'rgba(114,174,146, 0.8)',
  3: 'rgba(251,174,28, 0.8)',
  4: 'rgba(81,53,51, 0.8)',
  5: 'rgba(43,112,133, 0.8)',
  6: 'rgba(200,109,142, 0.8)',
  7: 'rgba(243,218,179, 0.7)',
}

# Human-readable name for each segment-type id; shown in the legend and hover labels.
# Must use the same keys as segment_colors.
segment_names = {
  0: 'Unannotated',
  1: 'Lead',
  2: 'Position',
  3: 'Evidence',
  4: 'Claim',
  5: 'Concluding Statement',
  6: 'Counterclaim',
  7: 'Rebuttal'
}

In [None]:
def generate_html_with_highlight(original_text, segment_types):
    """Render an essay as HTML with each run of same-type words highlighted.

    The output contains a small stylesheet for hover labels, a colour legend built
    from the global ``segment_names`` / ``segment_colors`` tables, and the essay
    itself with consecutive words of the same segment type wrapped in one
    ``<span class="segment-highlight">``.

    Args:
        original_text (str): Essay text; it is split on whitespace into words.
        segment_types (Iterable[int]): Segment-type id for each word, in order.
            Every id must be a key of ``segment_colors`` and ``segment_names``.

    Returns:
        str: The HTML markup. Words and ids are paired with ``zip``, so any words
        beyond the number of ids provided are not rendered.
    """
    # Local import keeps this cell self-contained. The essay is free-form user text,
    # so it must be escaped before being embedded in markup.
    from html import escape

    parts = []

    # Hover stuff
    parts.append('<style>')
    parts.append('.segment-highlight:hover:before { content: attr(data-label); background: #111; color: #fff; padding: 4px 8px; border-radius: 4px; z-index: 2; font-size: 14px}')
    parts.append('.segment-highlight:hover:after { content: attr(data-label); background: #111; color: #fff; padding: 4px 8px; border-radius: 4px; z-index: 2; font-size: 14px}')
    parts.append('</style>')

    # Generate a legend showing what segment type each color represents.
    # Join the two tables on their key rather than relying on matching insertion order.
    parts.append('<p style="font-size: 18px; line-height: 1.6;"><b>Legend:</b><br/>')
    for segment_id, segment_name in segment_names.items():
        parts.append(f'<span style="background-color: {segment_colors[segment_id]};">{segment_name}</span><br/>')
    parts.append('<br/></p>')

    parts.append('<p style="font-size: 18px; line-height: 1.6;">You can also hover on top of each segment to see their type<br/><br/></p>')

    # Highlight original text
    parts.append('<p style="font-size: 18px; line-height: 1.6; background-color: white; color: black;"><b>Segmented Essay:</b><br/>')
    current_segment_type = None
    for word, segment_type in zip(original_text.split(), segment_types):
        if segment_type != current_segment_type:
            # The type changed: close the previous span (if any) and open a new one.
            if current_segment_type is not None:
                parts.append('</span>')
            parts.append(f'<span class="segment-highlight" style="background-color: {segment_colors[segment_type]};" data-label="{segment_names[segment_type]}">')
            current_segment_type = segment_type

        # quote=False: the word is a text node, so only &, < and > need escaping.
        parts.append(f'{escape(word, quote=False)} ')

    if current_segment_type is not None:
        parts.append('</span>')

    parts.append('</p>')

    return ''.join(parts)

## Interactive Tool

In [None]:
# @title Segment Your Essay
# Colab form cell: the `# @param` annotation below exposes `essay` as an editable text field.
from math import ceil

essay = "To overcome Hume\u2019s problem of induction and derive a theorem that permits learning \u2013 and, thus, relies on induction working\u2013, Valiant claims it is necessary to make two assumptions about the world: The Invariance Assumption, and the Learnable Regularity Assumption.  The first \u2013 the Invariance Assumption \u2013 assumes that the context in which a certain generalization is used to make predictions cannot be different that in which this generalization was drawn. This should make intuitive sense and, in a similar way to Hume\u2019s Uniformity Principle, assumes that the universe is consistent and uniform \u2013 so much so that if the context (or conditions) is the same, it is possible to use previously noticed patters to make predictions about the world with relative confidence. The second \u2013 the Learnable Regularity Assumption \u2013 assumes that objects/things of a same class/category have a few regularities which, when observed over a big-enough sample of such objects, allows one to differentiate this object from others, allowing for categorization.  The combination of both these assumptions, then, allows induction to be probably correct and learning to be achievable. From there, then, Valiant derives his theorem that there is a finite number of samples needed to extract such identifying characteristics from a certain category of objects such that the prediction will remain under a certain error margin \u2013 and thus, probably correct." # @param {type:"string"}

# One predicted label id per whitespace-separated word of the essay.
preds = infer(essay)
# ceil(pred/2) keeps 0 as 0 and folds each consecutive pair of label ids (1,2), (3,4), ...
# onto 1, 2, ... — presumably the B-/I- tags of one segment type; TODO confirm against the training label map.
segments = [ceil(pred/2) for pred in preds] # Adjust for B/I tags.


# Build the highlighted markup and display it as the cell's rich output.
html_code = generate_html_with_highlight(essay, segments)
HTML(html_code)

224 224
