# Visualizing Binoculars Score

In [1]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import os
import boto3
import ast
import nltk

from IPython.display import HTML
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from tqdm import tqdm
from dotenv import load_dotenv
from io import StringIO

In [2]:
nltk.download("popular", quiet=True)

True

In [3]:
load_dotenv(f"../credentials.env")
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('aws_access_key_id'),
    aws_secret_access_key=os.getenv('aws_secret_access_key')
)

In [4]:
DEVICE_1 = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE_2 = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [5]:
torch.set_grad_enabled(False)

observer_name = "tiiuae/falcon-7b-instruct"
performer_name = "tiiuae/falcon-7b"

In [6]:
identical_tokens = (AutoTokenizer.from_pretrained(observer_name).vocab ==
                    AutoTokenizer.from_pretrained(performer_name).vocab)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
observer_model = AutoModelForCausalLM.from_pretrained(
    observer_name,
    device_map={"": DEVICE_1},
    trust_remote_code=True,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)
performer_model = AutoModelForCausalLM.from_pretrained(
    performer_name,
    device_map={"": DEVICE_2},
    trust_remote_code=True,                                                            
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)

observer_model.eval()
performer_model.eval()

tokenizer = AutoTokenizer.from_pretrained(observer_name)
tokenizer.pad_token = tokenizer.eos_token





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# redefine to handle batch of strings
def tokenize(batch):
    encodings = tokenizer(batch, return_tensors="pt", 
    padding="longest" if len(batch) > 1 else False, truncation=True,
    max_length=512, return_token_type_ids=False).to(DEVICE_1)
    return encodings

# redefinition with cuda sync
@torch.inference_mode()
def get_logits(encodings):
    observer_logits = observer_model(**encodings.to(DEVICE_1)).logits
    performer_logits = performer_model(**encodings.to(DEVICE_2)).logits
    torch.cuda.synchronize()

    return observer_logits, performer_logits

loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
softmax_fn = torch.nn.Softmax(dim=-1)

def perplexity(encoding, logits):
    shifted_logits = logits[..., :-1, :].contiguous()
    shifted_labels = encoding.input_ids[..., 1:].contiguous()
    shifted_attention_mask = encoding.attention_mask[..., 1:].contiguous()

    ppl = loss_fn(shifted_logits.transpose(1, 2).to('cuda:1'), shifted_labels) * shifted_attention_mask
    ppl = ppl.sum(1) / shifted_attention_mask.sum(1)
    
    return ppl.to("cpu").float().numpy()

def cross_perplexity(observer_logits, performer_logits, encoding):
    V = observer_logits.shape[-1]
    S = observer_logits.shape[-2]

    performer_probs = softmax_fn(performer_logits).view(-1, V).to("cuda:1")
    observer_scores = observer_logits.view(-1, V).to("cuda:1")
    
    xppl = loss_fn(observer_scores, performer_probs).view(-1, S)
    padding_mask = (encoding.input_ids != tokenizer.pad_token_id).type(torch.uint8)
    
    xppl = (xppl * padding_mask).sum(1) / padding_mask.sum(1)
    
    return xppl.to("cpu").float().numpy()

def binocular_score(text):
    batch = [text] if isinstance(text, str) else text
    encodings = tokenize(batch)
    observer_logits, performer_logits = get_logits(encodings)
    ppl = perplexity(encodings, observer_logits)
    xppl = cross_perplexity(observer_logits, performer_logits, encodings)

    return (ppl / xppl).tolist()

@torch.inference_mode()
def get_logits(encodings):
    observer_logits = observer_model(**encodings.to(DEVICE_1)).logits
    performer_logits = performer_model(**encodings.to(DEVICE_2)).logits
    return observer_logits, performer_logits



def generate_essay_structure(essay: str):
  """
  Generate a list of paragraphs where each paragraph is a list of sentences.
  """
  essay_structure = [[sentence for sentence in nltk.sent_tokenize(paragraph)] for paragraph in essay.split('\n\n')]
  return essay_structure

def flag_masked_tokens(essay_structure, tokens, masked_sentences):
    if masked_sentences:
        # Create a mapping of (paragraph_number, sentence_number) to sentence
        sentence_map = {}
        for p_idx, paragraph in enumerate(essay_structure):
            for s_idx, sentence in enumerate(paragraph):
                sentence_map[(p_idx, s_idx)] = sentence

        # Flatten the essay structure into a list of (paragraph_number, sentence_number, sentence) tuples
        flattened_sentences = [
            (p_idx, s_idx, sentence)
            for p_idx, paragraph in enumerate(essay_structure)
            for s_idx, sentence in enumerate(paragraph)
        ]
        # Create a mapping from sentence to tokens
        sentence_to_tokens = {}

        token_counter = 0
        # print({i: token for i, token in enumerate(tokens)})
        for p_idx, s_idx, sentence in flattened_sentences:
            # display(sentence)
            sentence_to_build = ''
            sentence_tokens = []
            while sentence_to_build in sentence and token_counter != len(tokens):
                token = tokens[token_counter]
                # print(token)
                cleaned_token = token.strip() if sentence_to_build == '' else token
                
                if not token in ['\n']:
                    sentence_to_build += cleaned_token
                sentence_tokens.append(token)
                token_counter += 1
            # print(sentence_tokens)
            token_counter -= 1
            sentence_to_tokens[(p_idx, s_idx)] = sentence_tokens[:-1]

        
        # Create a set of masked sentences for quick lookup
        masked_set = set(masked_sentences)
        
        # Flag tokens
        flagged_tokens = []
        for p_idx, paragraph in enumerate(essay_structure):
            for s_idx, sentence in enumerate(paragraph):
                is_masked = (p_idx, s_idx) in masked_set
                tokens_for_sentence = sentence_to_tokens.get((p_idx, s_idx), [])
                flagged_tokens.extend([(token, is_masked) for token in tokens_for_sentence])
        # print(flagged_tokens)
        return flagged_tokens
    return None

def generate_html(tokens, scores, flagged_tokens):
    html = "<p>"
    # assert len(tokens) == len(flagged_tokens)
    counter = 0
    for index, (token, score) in enumerate(zip(tokens, scores.squeeze().tolist())):
        # Determine if this sentence is in the sampled sentences list
        # print([token, tokens[counter][0]])
        if flagged_tokens and flagged_tokens[counter][0] == token:
            is_sampled = flagged_tokens[counter][1]
        else:
            is_sampled = False
        # Apply background color based on score
        color_value = 255 * score 
        span_style = f"background-color: rgb({255-color_value}, 255, {255-color_value}); color: black;"
        # span_style = f"background-color: rgb({color_value}, 255, {color_value}); color: black;"
        
        # If the sentence is sampled, make it bold
        if is_sampled:
            html += f"<strong><span style='{span_style}'>{token}</span></strong>"
        else:
            html += f"<span style='{span_style}'>{token}</span>"
        counter += 1
    
    html += "</p>"
    return html

def read_csv_from_s3(
    bucket_name: str,
    file_key: str,
):
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    body = obj['Body']
    data = body.read().decode('utf-8')
    
    # Use StringIO to convert the string data to a pandas-readable buffer
    data_buffer = StringIO(data)

    # Read the data into a pandas DataFrame
    df = pd.read_csv(data_buffer)
    return df

In [8]:
def highlight_text(text, method='b', masked_indices=None, ret = False, display_html=True):
    encoding = tokenize(text)
    essay_structure = generate_essay_structure(text)
    
    observer_logits, performer_logits = get_logits(encoding)
    
    S = observer_logits.shape[-2]
    V = observer_logits.shape[-1]

    shifted_logits = observer_logits[..., :-1, :].contiguous()
    shifted_labels = encoding.input_ids[..., 1:].contiguous()
    
    tokens = [tokenizer.decode([tok], clean_up_tokenization_spaces=False) for tok in encoding.input_ids.squeeze().tolist()]

    performer_probs = softmax_fn(performer_logits).view(-1, V).to("cuda:1")
    observer_scores = observer_logits.view(-1, V).to("cuda:1")

    # Compute perplexity and x-plerplexity
    ppl = loss_fn(shifted_logits.transpose(1, 2).to("cuda:1"), shifted_labels).float()
    xppl = loss_fn(observer_scores[:-1], performer_probs[:-1]).view(-1, S - 1).to("cuda:1").float()
    
    normalized_ppl = ppl / torch.max(ppl)    
    normalized_xppl = xppl / torch.max(xppl)
    binocular_score = normalized_ppl / normalized_xppl

    flagged_tokens = flag_masked_tokens(essay_structure, tokens, masked_indices)
    # Perplexity
    if method.startswith('p'):
        html_output = generate_html(tokens, normalized_ppl, masked_indices)        
    # Cross - Perplexity  
    elif method.startswith('c'):
        html_output = generate_html(tokens, normalized_xppl, masked_indices)
    
    if display_html:
        display(HTML(html_output))
    return (ppl, xppl) if ret else None


In [35]:
def generate_summary_statistics(tp, fp, tn, fn, masked_indices_dict):
    token_list = []
    for df_type, df in zip(['tp', 'fp', 'tn','fn'], [tp, fp, tn, fn]):
        for i, row in tqdm(df.iterrows(), total = len(df)):
            if df_type in ['tp', 'fn']:
                normalized_binocular_score, flagged_tokens, tokens = highlight_text(row['text'], masked_indices=masked_indices_dict[row['essay_id']], display_html=False)
            else:
                normalized_binocular_score, flagged_tokens, tokens = highlight_text(row['text'], display_html=False)
            
            counter = 0
        
            for bin_score, token in zip(normalized_binocular_score[0], tokens):
                try:
                    if flagged_tokens and flagged_tokens[counter][0] == token:
                        flagged_token = flagged_tokens[counter][1]
                        counter += 1
                    else:
                        flagged_token = False
                    res = {
                        'essay_id': row['essay_id'],
                        'token': token,
                        'binocular_score': bin_score.item(),
                        'flagged': flagged_token,
                        'confusion_matrix': df_type,
                    }
                    token_list.append(res)
                except IndexError as e:
                    break
                    
    print(token_list)
    return pd.DataFrame(token_list)
    

In [19]:
filenames = []
bucket_name = 'training-essays'

## Dataset 1: Rephrase 25%

In [11]:
masked_indices_df_1 = read_csv_from_s3(bucket_name, "masked_indices/dataset1_partial_machine_generated.csv")
masked_indices_df_1['mask_indices'] = masked_indices_df_1['mask_indices'].apply(ast.literal_eval)
masked_indices_dict_1 = masked_indices_df_1.set_index('essay_id')['mask_indices'].to_dict()


In [12]:
df_1 = read_csv_from_s3(bucket_name, "predictions/dataset1_predictions.csv")
df_1_tp = df_1[(df_1['label'] == 1) & (df_1['prediction_label'] == 1)]
df_1_fp = df_1[(df_1['label'] == 0) & (df_1['prediction_label'] == 1)]
df_1_tn = df_1[(df_1['label'] == 0) & (df_1['prediction_label'] == 0)]
df_1_fn = df_1[(df_1['label'] == 1) & (df_1['prediction_label'] == 0)]

In [36]:
df_1_summary = generate_summary_statistics(df_1_tp, df_1_fp, df_1_tn, df_1_fn, masked_indices_dict_1)
df_1_summary.to_csv('temp_summary.csv', index=False)
s3.upload_file('temp_summary.csv', bucket_name, f'summary_statistics/dataset1_summary.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 13/13 [00:05<00:00,  2.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.56it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 493/493 [03:36<00:00,  2.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 487/487 [03:35<00:00,  2.26it/s]
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



### True Positive - Actual Machine Predicted Machine

In [13]:
print(f"True Positive Count: {len(df_1_tp)}")

True Positive Count: 13


In [25]:
sample = df_1_tp.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict_1[essay_id]
sample_text = sample['text']

# print(masked_indices)
normalized_binocular_score, flagged_tokens = highlight_text(sample_text, masked_indices=masked_indices)

### False Positive - Actual Human Predicted Machine

In [55]:
print(f"False Positive Count: {len(df_1_fp)}")

False Positive Count: 7


In [56]:
sample = df_1_fp.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### True Negative - Actual Human Predicted Human

In [57]:
print(f"True Negative Count: {len(df_1_tn)}")

True Negative Count: 493


In [58]:
sample = df_1_tn.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### False Negative - Actual Machine Predicted Human

In [59]:
print(f"False Negative Count: {len(df_1_fn)}")

False Negative Count: 487


In [43]:
sample = df_1_tp.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict_1[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

## Dataset 2: Rephrase 75%

In [43]:
masked_indices_df_2 = read_csv_from_s3(bucket_name, "masked_indices/dataset2_partial_machine_generated.csv")
masked_indices_df_2['mask_indices'] = masked_indices_df_2['mask_indices'].apply(ast.literal_eval)
masked_indices_dict_2 = masked_indices_df_2.set_index('essay_id')['mask_indices'].to_dict()

# masked_indices_dict

In [None]:
df_2 = read_csv_from_s3(bucket_name, "predictions/dataset2_predictions.csv")
df_2_tp = df_2[(df_2['label'] == 1) & (df_2['prediction_label'] == 1)]
df_2_fp = df_2[(df_2['label'] == 0) & (df_2['prediction_label'] == 1)]
df_2_tn = df_2[(df_2['label'] == 0) & (df_2['prediction_label'] == 0)]
df_2_fn = df_2[(df_2['label'] == 1) & (df_2['prediction_label'] == 0)]

In [None]:
df_2_summary = generate_summary_statistics(df_2_tp, df_2_fp, df_2_tn, df_2_fn, masked_indices_dict_2)
df_2_summary.to_csv('temp_summary.csv', index=False)
s3.upload_file('temp_summary.csv', bucket_name, f'summary_statistics/dataset2_summary.csv')

### True Positive - Actual Machine Predicted Machine

In [49]:
print(f"True Positive Count: {len(df_2_tp)}")

True Positive Count: 159


In [50]:
sample = df_2_tp.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict_2[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

### False Positive - Actual Human Predicted Machine

In [67]:
print(f"False Positive Count: {len(df_2_fp)}")

False Positive Count: 7


In [51]:
sample = df_2_fp.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### True Negative - Actual Human Predicted Human

In [69]:
print(f"True Negative Count: {len(df_2_tn)}")

True Negative Count: 493


In [52]:
sample = df_2_tn.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### False Negative - Actual Machine Predicted Human

In [71]:
print(f"False Negative Count: {len(df_2_fn)}")

False Negative Count: 341


In [53]:
sample = df_2_fn.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict_2[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

## Dataset 3: Rephrase 50%

In [None]:
masked_indices_df_3 = read_csv_from_s3(bucket_name, "masked_indices/dataset3_partial_machine_generated.csv")
masked_indices_df_3['mask_indices'] = masked_indices_df_3['mask_indices'].apply(ast.literal_eval)
masked_indices_dict_3 = masked_indices_df_1.set_index('essay_id')['mask_indices'].to_dict()

# masked_indices_dict

In [None]:
df_3 = read_csv_from_s3(bucket_name, "predictions/dataset3_predictions.csv")
df_3_tp = df_3[(df_3['label'] == 1) & (df_3['prediction_label'] == 1)]
df_3_fp = df_3[(df_3['label'] == 0) & (df_3['prediction_label'] == 1)]
df_3_tn = df_3[(df_3['label'] == 0) & (df_3['prediction_label'] == 0)]
df_3_fn = df_3[(df_3['label'] == 1) & (df_3['prediction_label'] == 0)]

In [None]:
df_3_summary = generate_summary_statistics(df_3_tp, df_3_fp, df_3_tn, df_3_fn, masked_indices_dict_3)
df_3_summary.to_csv('temp_summary.csv', index=False)
s3.upload_file('temp_summary.csv', bucket_name, f'summary_statistics/dataset3_summary.csv')

### True Positive - Actual Machine Predicted Machine

In [141]:
print(f"True Positive Count: {len(df_3_tp)}")

True Positive Count: 159


In [142]:
sample = df_3_tp.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

### False Positive - Actual Human Predicted Machine

In [143]:
print(f"False Positive Count: {len(df_3_fp)}")

False Positive Count: 7


In [144]:
sample = df_3_fp.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### True Negative - Actual Human Predicted Human

In [145]:
print(f"True Negative Count: {len(df_3_tn)}")

True Negative Count: 493


In [146]:
sample = df_3_tn.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### False Negative - Actual Machine Predicted Human

In [147]:
print(f"False Negative Count: {len(df_3_fn)}")

False Negative Count: 341


In [148]:
sample = df_3_fn.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

## Dataset 4: Fill-in-the-Blank 25%

In [None]:
masked_indices_df_4 = read_csv_from_s4(bucket_name, "masked_indices/dataset4_partial_machine_generated.csv")
masked_indices_df_4['mask_indices'] = masked_indices_df_4['mask_indices'].apply(ast.literal_eval)
masked_indices_dict_4 = masked_indices_df_4.set_index('essay_id')['mask_indices'].to_dict()

# masked_indices_dict

In [None]:
df_4 = read_csv_from_s3(bucket_name, "predictions/dataset4_predictions.csv")
df_4_tp = df_4[(df_4['label'] == 1) & (df_4['prediction_label'] == 1)]
df_4_fp = df_4[(df_4['label'] == 0) & (df_4['prediction_label'] == 1)]
df_4_tn = df_4[(df_4['label'] == 0) & (df_4['prediction_label'] == 0)]
df_4_fn = df_4[(df_4['label'] == 1) & (df_4['prediction_label'] == 0)]

In [None]:
df_4_summary = generate_summary_statistics(df_4_tp, df_4_fp, df_4_tn, df_4_fn, masked_indices_dict_4)
df_4_summary.to_csv('temp_summary.csv', index=False)
s3.upload_file('temp_summary.csv', bucket_name, f'summary_statistics/dataset4_summary.csv')

### True Positive - Actual Machine Predicted Machine

In [151]:
print(f"True Positive Count: {len(df_4_tp)}")

True Positive Count: 159


In [152]:
sample = df_4_tp.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

### False Positive - Actual Human Predicted Machine

In [153]:
print(f"False Positive Count: {len(df_4_fp)}")

False Positive Count: 7


In [154]:
sample = df_4_fp.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### True Negative - Actual Human Predicted Human

In [155]:
print(f"True Negative Count: {len(df_4_tn)}")

True Negative Count: 493


In [156]:
sample = df_4_tn.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### False Negative - Actual Machine Predicted Human

In [157]:
print(f"False Negative Count: {len(df_4_fn)}")

False Negative Count: 341


In [158]:
sample = df_4_fn.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

## Dataset 5: Fill-in-the-Blank 75%

In [60]:
masked_indices_df_5 = read_csv_from_s3(bucket_name, "masked_indices/dataset5_partial_machine_generated.csv")
masked_indices_df_5['mask_indices'] = masked_indices_df_5['mask_indices'].apply(ast.literal_eval)
masked_indices_dict_5 = masked_indices_df_5.set_index('essay_id')['mask_indices'].to_dict()

# masked_indices_dict

In [54]:
df_5 = read_csv_from_s3(bucket_name, "predictions/dataset2_predictions.csv")
df_5_tp = df_5[(df_5['label'] == 1) & (df_5['prediction_label'] == 1)]
df_5_fp = df_5[(df_5['label'] == 0) & (df_5['prediction_label'] == 1)]
df_5_tn = df_5[(df_5['label'] == 0) & (df_5['prediction_label'] == 0)]
df_5_fn = df_5[(df_5['label'] == 1) & (df_5['prediction_label'] == 0)]

In [55]:
df_5_fp

Unnamed: 0,essay_id,text,label,binocular_score,prediction_label
263,22465,"Dear Principal,\n\nI think you should require ...",0,0.853147,1
381,30782,Having multiple opinions\n\nHow come when you ...,0,0.860759,1
459,19722,"Dear principal,\n\nThe change in the sports po...",0,0.861538,1
473,3756,"Dear Principal,\n\nI have all of my averages i...",0,0.859649,1
545,15021,"Dear Principal,\n\nThe argument of not allowin...",0,0.839196,1
604,11991,I think that we can help people in america by ...,0,0.84186,1
777,9548,There is a lot of discussion whether or not to...,0,0.861111,1


### True Positive - Actual Machine Predicted Machine

In [56]:
print(f"True Positive Count: {len(df_5_tp)}")

True Positive Count: 159


In [61]:
sample = df_5_tp.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict_5[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

### False Positive - Actual Human Predicted Machine

In [62]:
print(f"False Positive Count: {len(df_5_fp)}")

False Positive Count: 7


In [63]:
sample = df_5_fp.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### True Negative - Actual Human Predicted Human

In [64]:
print(f"True Negative Count: {len(df_5_tn)}")

True Negative Count: 493


In [65]:
sample = df_5_tn.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### False Negative - Actual Machine Predicted Human

In [167]:
print(f"False Negative Count: {len(df_5_fn)}")

False Negative Count: 341


In [168]:
sample = df_5_fn.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

## Dataset 6: Fill-in-the-Blank 50%

In [169]:
df_6 = read_csv_from_s3(bucket_name, "predictions/dataset2_predictions.csv")
df_6_tp = df_6[(df_6['label'] == 1) & (df_6['prediction_label'] == 1)]
df_6_fp = df_6[(df_6['label'] == 0) & (df_6['prediction_label'] == 1)]
df_6_tn = df_6[(df_6['label'] == 0) & (df_6['prediction_label'] == 0)]
df_6_fn = df_6[(df_6['label'] == 1) & (df_6['prediction_label'] == 0)]

In [170]:
df_6_fp

Unnamed: 0,essay_id,text,label,binocular_score,prediction_label
263,22465,"Dear Principal,\n\nI think you should require ...",0,0.853147,1
381,30782,Having multiple opinions\n\nHow come when you ...,0,0.860759,1
459,19722,"Dear principal,\n\nThe change in the sports po...",0,0.861538,1
473,3756,"Dear Principal,\n\nI have all of my averages i...",0,0.859649,1
545,15021,"Dear Principal,\n\nThe argument of not allowin...",0,0.839196,1
604,11991,I think that we can help people in america by ...,0,0.84186,1
777,9548,There is a lot of discussion whether or not to...,0,0.861111,1


### True Positive - Actual Machine Predicted Machine

In [171]:
print(f"True Positive Count: {len(df_6_tp)}")

True Positive Count: 159


In [172]:
sample = df_6_tp.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)

### False Positive - Actual Human Predicted Machine

In [173]:
print(f"False Positive Count: {len(df_6_fp)}")

False Positive Count: 7


In [174]:
sample = df_6_fp.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### True Negative - Actual Human Predicted Human

In [175]:
print(f"True Negative Count: {len(df_6_tn)}")

True Negative Count: 493


In [176]:
sample = df_6_tn.sample(1).iloc[0]
essay_id = sample['essay_id']
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text)

### False Negative - Actual Machine Predicted Human

In [177]:
print(f"False Negative Count: {len(df_6_fn)}")

False Negative Count: 341


In [178]:
sample = df_6_fn.sample(1).iloc[0]
essay_id = sample['essay_id']
masked_indices = masked_indices_dict[essay_id]
sample_text = sample['text']

# print(masked_indices)
highlight_text(sample_text, masked_indices=masked_indices)