In [4]:
import re
import sys

# Add the 'utils' directory to the system path
utils_path = '/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/utils'
sys.path.insert(0, utils_path)

from text_cleaner import clean_text


ex = 'I decided to enroll into the physics supplemental course because I’ve had a history of struggling in my math classes. Since physics is basically another math class, I knew I would need to take extra steps to ensure my success. In the past I’ve had a packed schedule with classes and work, but this semester I allowed myself more room for school by cutting back my work hours. I originally planned on letting myself have time to attend office hours, but I realized that wouldn’t be enough.\nWhen I heard the option to enroll in a supplemental course for physics, I knew I had to enroll for the extra help. Not only would I have more practice with physics problems, but I would be able to connect more with my fellow classmates. I figured that if I felt like I created a little community with the supplemental course, I would feel more comfortable with physics problems. With a community I would feel comfortable enough to ask questions without having to worry about sounding unintelligent.\nAnother big reason for enrolling was when I heard I wouldn’t have homework. I will have the opportunity to have the extra practice with physics problems without the stress of having to worry about my grade. I will be able to do the problems at my own pace and truly understand the steps to get the solution'
annotation = ' I’ve had a history of struggling in my math classes. Since physics is basically another math class /%/ I knew I would need to take extra steps to ensure my success. /-/ When I heard the option to enroll in a supplemental course for physics. /-/ I knew I had to enroll for the extra help.'

annotation_list = re.split(r'/%/', annotation)
annotation_list = [clean_text(text) for text in annotation_list]
annotation_list

['Ive had a history of struggling in my math classes. Since physics is basically another math class',
 'I knew I would need to take extra steps to ensure my success. When I heard the option to enroll in a supplemental course for physics. I knew I had to enroll for the extra help.']

In [18]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load your pre-trained sequence classification model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def classify_and_extract_snippets(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    
    # Get model outputs
    outputs = model(**inputs)
    logits = outputs.logits
    
    # Determine the label
    predicted_class = torch.argmax(logits, dim=1).item()
    label = 'Yes' if predicted_class == 1 else 'No'
    
    # Calculate token importance using softmax on logits
    token_importance = F.softmax(logits, dim=1).squeeze()
    
    # Convert input IDs back to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Determine threshold to select important tokens
    threshold = 0.5
    important_tokens = [token for token, importance in zip(tokens, token_importance) if importance > threshold]
    
    # Clean the tokens and join them to form snippets
    clean_snippets = " ".join(important_tokens).replace(" ##", "")
    
    return label, clean_snippets

# Example usage
ex = """I decided to enroll into the physics supplemental course because I’ve had a history of struggling in my math classes. Since physics is basically another math class, I knew I would need to take extra steps to ensure my success. In the past I’ve had a packed schedule with classes and work, but this semester I allowed myself more room for school by cutting back my work hours. I originally planned on letting myself have time to attend office hours, but I realized that wouldn’t be enough.
When I heard the option to enroll in a supplemental course for physics, I knew I had to enroll for the extra help. Not only would I have more practice with physics problems, but I would be able to connect more with my fellow classmates. I figured that if I felt like I created a little community with the supplemental course, I would feel more comfortable with physics problems. With a community I would feel comfortable enough to ask questions without having to worry about sounding unintelligent.
Another big reason for enrolling was when I heard I wouldn’t have homework. I will have the opportunity to have the extra practice with physics problems without the stress of having to worry about my grade. I will be able to do the problems at my own pace and truly understand the steps to get the solution."""

label, snippets = classify_and_extract_snippets(ex)
print(f'Label: {label}')
print(f'Snippets: {snippets}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label: No
Snippets: [CLS]


In [7]:
from transformers import BertTokenizer, BertForTokenClassification

# Load pre-trained BERT tokenizer and model for token classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3 labels: B, I, O

# Tokenize and encode the entire essay
inputs = tokenizer(ex, return_tensors='pt', max_length=512, truncation=True, padding=True)
outputs = model(**inputs)

# Get the token-level predictions
logits = outputs.logits
predicted_tokens = torch.argmax(logits, dim=2)

# Decode the tokens and identify the snippets
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
labels = ['B', 'I', 'O']  # Assuming the model is trained with BIO scheme

snippets = []
current_snippet = []

for token, label_id in zip(tokens, predicted_tokens[0].tolist()):
    label = labels[label_id]
    if label == 'B':
        if current_snippet:
            snippets.append(" ".join(current_snippet))
            current_snippet = []
        current_snippet.append(token)
    elif label == 'I' and current_snippet:
        current_snippet.append(token)
    elif label == 'O' and current_snippet:
        snippets.append(" ".join(current_snippet))
        current_snippet = []

# Add the last snippet if any
if current_snippet:
    snippets.append(" ".join(current_snippet))

# Clean up the snippets by removing special tokens like [CLS] and [SEP]
clean_snippets = [" ".join(snippet.replace("##", "") for snippet in snippet.split()) for snippet in snippets]

print(f'Snippets: {clean_snippets}')


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Snippets: ['decided', 'physics', 'supplemental', 'history', 'math', '.', 'is', 'basically', ',', 'knew', 'would', '.', 'semester', 'allowed', '.', 'i', 'realized', 'wouldn', 't', '.', 'heard', 'the', 'option', 'supplemental', 'physics', 'knew', '.', 'physics', 'problems', ',', 'but', 'would', '.', 'figured', 'that', 'created', 'little', 'community', 'would', 'physics', 'problems', '.', 'community', 'would', '.', 'big', 'was', 'wouldn', 't', '.', 'physics', 'problems', '.', 'problems', 'solution', '[SEP]']


In [10]:
from transformers import BertTokenizer, BertForSequenceClassification, BertForTokenClassification
import torch

# Initialize the tokenizer and the sequence classification model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sequence_classification_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Initialize the token classification model (fine-tune it for your specific token classification task)
token_classification_model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3)

def classify_and_extract_snippets(text):
    # Step 1: Classify the essay
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    classification_outputs = sequence_classification_model(**inputs)
    logits = classification_outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    label = 'Yes' if predicted_class == 1 else 'No'

    # Step 2: Extract snippets
    token_outputs = token_classification_model(**inputs)
    token_logits = token_outputs.logits
    predicted_tokens = torch.argmax(token_logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    labels = ['B', 'I', 'O']  # Assuming the model is trained with BIO scheme

    snippets = []
    current_snippet = []

    for token, label_id in zip(tokens, predicted_tokens[0].tolist()):
        label = labels[label_id]
        if label == 'B':
            if current_snippet:
                snippets.append(" ".join(current_snippet))
                current_snippet = []
            current_snippet.append(token)
        elif label == 'I' and current_snippet:
            current_snippet.append(token)
        elif label == 'O' and current_snippet:
            snippets.append(" ".join(current_snippet))
            current_snippet = []

    if current_snippet:
        snippets.append(" ".join(current_snippet))

    clean_snippets = [" ".join(snippet.replace("##", "") for snippet in snippet.split()) for snippet in snippets]

    return label, clean_snippets

# Example usage
ex = """I decided to enroll into the physics supplemental course because I’ve had a history of struggling in my math classes. Since physics is basically another math class, I knew I would need to take extra steps to ensure my success. In the past I’ve had a packed schedule with classes and work, but this semester I allowed myself more room for school by cutting back my work hours. I originally planned on letting myself have time to attend office hours, but I realized that wouldn’t be enough.
When I heard the option to enroll in a supplemental course for physics, I knew I had to enroll for the extra help. Not only would I have more practice with physics problems, but I would be able to connect more with my fellow classmates. I figured that if I felt like I created a little community with the supplemental course, I would feel more comfortable with physics problems. With a community I would feel comfortable enough to ask questions without having to worry about sounding unintelligent.
Another big reason for enrolling was when I heard I wouldn’t have homework. I will have the opportunity to have the extra practice with physics problems without the stress of having to worry about my grade. I will be able to do the problems at my own pace and truly understand the steps to get the solution."""

label, snippets = classify_and_extract_snippets(ex)
print(f'Label: {label}')
print(f'Snippets: {snippets}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Label: I
Snippets: ['[CLS] i', 'course', 'because i', '’', 've', 'history', 'in', 'my math', 'classes', '.', 'class', ', i', 'need', 'take', 'steps to', 'ensure my', 'success', '. in the past i ’', 've', 'had', 'a', 'packed', 'schedule', 'with classes', 'and', 'work', 'semester i', 'allowed', 'myself', 'more', 'room', 'for', 'school', 'by', 'cutting', 'my', 'work', 'hours', '. i', 'on', 'myself', 'have', 'time', 'attend', 'office', 'hours', 'that', 'wouldn ’', 'be', '.', 'course', 'for', 'physics', 'had', 'for the', 'help', '.', 'would', 'i', 'have', 'more', 'practice with physics', 'problems', ',', 'i', 'would', 'be', 'connect', 'more with', 'fellow', 'classmates', '.', 'course', 'i', 'would', 'feel', 'problems', '.', 'with', 'would', 'feel comfortable', 'enough', 'ask', 'questions', '.', 'another', 'big', 'reason', 'for enroll ing', 'i', 'wouldn ’', 'have', '.', 'i', 'will', 'have the opportunity', 'have', 'the', 'practice', 'with physics', 'problems', 'the', 'of', 'grade', '.', 'i',

In [12]:
snippets

['[CLS] i',
 'course',
 'because i',
 '’',
 've',
 'history',
 'in',
 'my math',
 'classes',
 '.',
 'class',
 ', i',
 'need',
 'take',
 'steps to',
 'ensure my',
 'success',
 '. in the past i ’',
 've',
 'had',
 'a',
 'packed',
 'schedule',
 'with classes',
 'and',
 'work',
 'semester i',
 'allowed',
 'myself',
 'more',
 'room',
 'for',
 'school',
 'by',
 'cutting',
 'my',
 'work',
 'hours',
 '. i',
 'on',
 'myself',
 'have',
 'time',
 'attend',
 'office',
 'hours',
 'that',
 'wouldn ’',
 'be',
 '.',
 'course',
 'for',
 'physics',
 'had',
 'for the',
 'help',
 '.',
 'would',
 'i',
 'have',
 'more',
 'practice with physics',
 'problems',
 ',',
 'i',
 'would',
 'be',
 'connect',
 'more with',
 'fellow',
 'classmates',
 '.',
 'course',
 'i',
 'would',
 'feel',
 'problems',
 '.',
 'with',
 'would',
 'feel comfortable',
 'enough',
 'ask',
 'questions',
 '.',
 'another',
 'big',
 'reason',
 'for enroll ing',
 'i',
 'wouldn ’',
 'have',
 '.',
 'i',
 'will',
 'have the opportunity',
 'have',
 

In [11]:
annotation_list

['Ive had a history of struggling in my math classes. Since physics is basically another math class',
 'I knew I would need to take extra steps to ensure my success. When I heard the option to enroll in a supplemental course for physics. I knew I had to enroll for the extra help.']