In [None]:
%matplotlib widget

from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    AutoConfig
)
import os
import torch
import requests
from bs4 import BeautifulSoup
import re

from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import evaluate
import glob

import matplotlib.pyplot as plt
import nltk
import time

# Download the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both to keep the notebook working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')


In [None]:
# Directory holding the fine-tuned sequence-classification checkpoint.
MODEL_DIR = "/home/andrej/Code/story-vibe/data/models/checkpoint-08_07_2024"
# Prefer the first GPU but fall back to the CPU, so the notebook also runs on
# machines without CUDA (moving the model to 'cuda:0' fails there).
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(device)
# The checkpoint's config is read only to report the maximum sequence length.
config = AutoConfig.from_pretrained(MODEL_DIR)
max_length = config.max_position_embeddings
print(f"Max length: {max_length}")

# Parsing a book into a format for inference

In [None]:
# Load and segment the book
BOOK_PATH = "/home/andrej/Code/story-vibe/data/texts"

# Granularity of the segments fed to the model: "paragraph" or "sentence".
SPLIT = "sentence"
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(f'{BOOK_PATH}/alice_in_wonderland.txt', 'r', encoding='utf-8') as file:
    book_text = file.read()

if SPLIT == "paragraph":
    # Paragraphs are separated by blank lines; runs of blank lines would
    # otherwise produce empty segments that get scored like real text.
    segments = [paragraph for paragraph in book_text.split('\n\n') if paragraph.strip()]
elif SPLIT == "sentence":
    segments = nltk.sent_tokenize(book_text)
else:
    # Fail loudly instead of leaving `segments` undefined (or stale from an
    # earlier run of the kernel).
    raise ValueError(f"Unknown SPLIT {SPLIT!r}; expected 'paragraph' or 'sentence'.")

def get_sentiment(text, argmax=False):
    """
    Run the classifier on a single text and return its sentiment.

    Parameters:
    text (str): The text to classify. It is tokenized with truncation and padding.
    argmax (bool): When exactly True, return the index of the highest-scoring
        class instead of the per-class scores.

    Returns:
    numpy array or numpy integer: The softmax scores of the first (only) row of
    the model output, or its arg-max class index when argmax is True.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # The model lives on `device`, so every input tensor has to follow it.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    token_count = len(encoded['input_ids'][0])
    if token_count > 350:
        print(f"Input length is larger than 350: {token_count}")

    # Inference only - no gradients needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    if argmax is True:
        # return only the max sentiment
        return torch.argmax(logits, dim=1).cpu().numpy()[0]
    return logits.softmax(dim=-1).cpu().numpy()[0]

# Score the segments one at a time and report the wall-clock time.
T_START = time.time()
sentiments = list(map(get_sentiment, segments))
print(f"Time taken for {SPLIT} split: {time.time() - T_START} [s].")
# Stack the per-segment score vectors into one array for easy plotting
sentiments = np.array(sentiments)


# Now run the same inference in parallel, in batches

In [None]:
# Load and segment the book
BOOK_PATH = "/home/andrej/Code/story-vibe/data/texts"
# Number of segments tokenized and scored together in one forward pass.
BATCH_SIZE = 2

# Granularity of the segments fed to the model: "paragraph" or "sentence".
SPLIT = "sentence"
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(f'{BOOK_PATH}/alice_in_wonderland.txt', 'r', encoding='utf-8') as file:
    book_text = file.read()

if SPLIT == "paragraph":
    # Paragraphs are separated by blank lines; runs of blank lines would
    # otherwise produce empty segments that get scored like real text.
    segments = [paragraph for paragraph in book_text.split('\n\n') if paragraph.strip()]
elif SPLIT == "sentence":
    segments = nltk.sent_tokenize(book_text)
else:
    # Fail loudly instead of leaving `segments` undefined (or stale from an
    # earlier run of the kernel).
    raise ValueError(f"Unknown SPLIT {SPLIT!r}; expected 'paragraph' or 'sentence'.")

def get_sentiment(text, argmax=False):
    """
    Score a batch of text segments with the sentiment classifier.

    Parameters:
    text (str or list of str): The segment(s) to classify. A list is tokenized
        as one padded batch.
    argmax (bool): If true, return the predicted class index of every segment
        instead of the class probabilities. Default is False.

    Returns:
    numpy array: One row of softmax probabilities per segment, or, when argmax
    is set, one predicted class index per segment.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # The model lives on `device`, so every input tensor has to follow it.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # padding=True pads all rows to a common length, so the first row gives
    # the padded length of the whole batch.
    padded_length = len(inputs['input_ids'][0])
    if padded_length > 350:
        print(f"Input length is larger than 350: {padded_length}")

    # Inference only - no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    if argmax:
        # Keep the prediction of every segment in the batch; indexing with [0]
        # here would silently drop all but the first one.
        return torch.argmax(logits, dim=1).cpu().numpy()
    return logits.softmax(dim=-1).cpu().numpy()

# Processing in batches
T_START = time.time()
all_sentiments = []

# Report only the count - printing the list itself dumps the entire book.
print(f"Number of segments: {len(segments)}")
for i in range(0, len(segments), BATCH_SIZE):
    batch = segments[i:i + BATCH_SIZE]
    # Use a dedicated name so the per-segment `sentiments` array produced by
    # the unbatched run is not overwritten with the scores of the last batch.
    batch_sentiments = get_sentiment(batch)
    # extend() appends one score row per segment of the batch.
    all_sentiments.extend(batch_sentiments)

print(f"Time taken for batched {SPLIT} split: {time.time() - T_START} [s].")
# Convert to numpy array for easy plotting
all_sentiments = np.array(all_sentiments)

# Model timing - CPU
- Getting sentiments per sentence: 71.24 seconds.  
  - with batchsize 16 - 63 seconds  
  - with batchsize 8  - 55 seconds  
  - with batchsize 4  - 50.5 seconds  
  - with batchsize 2  - 54.7 seconds  
- Getting sentiments per paragraph: 44.53 seconds.  
  - with batchsize 8 - longer   
  - with batchsize 4 - 42 seconds  
  - with batchsize 2 - 41 seconds  

# Model timing - GPU
- Getting sentiments per sentence: 9.43 seconds.  
  - with batchsize 32 - 6.49 seconds  
  - with batchsize 16 - 5.37 seconds  
  - with batchsize 8  - 4.97 seconds  
  - with batchsize 4  - 5.22 seconds  
  - with batchsize 2  - 7.18 seconds  
- Getting sentiments per paragraph: 6.00 seconds.  
  - with batchsize 16 - 4.91 seconds 
  - with batchsize 8 - 4.32 seconds   
  - with batchsize 4 - 4.00 seconds  
  - with batchsize 2 - 4.61 seconds  

In [None]:
# Inspect the batched scores, which are what every later cell plots. Only a
# few rows are shown; the full array has one row per segment.
print(f"Sentiments (first 5 rows): {all_sentiments[:5]}")
print(f"Shape of Sentiments: {all_sentiments.shape}")
print(f"Labels: {model.config.id2label}")

# Coarse sentiment groups, expressed with the model's label names.
minimal_labels = {
    "negative": ["sadness", "fear", "anger", "disgust"],
    "neutral": ["neutral", "surprise"],
    "positive": ["happiness"],
}

# The same groups keyed by polarity (-1, 0, 1) and listing model class indices.
minimal_number_labels = {
    -1: [model.config.label2id[label] for label in minimal_labels["negative"]],
    0: [model.config.label2id[label] for label in minimal_labels["neutral"]],
    1: [model.config.label2id[label] for label in minimal_labels["positive"]],
}

print(minimal_number_labels)

# Inverse lookup: position = model class index, value = polarity of its group.
maximal_number_labels = [0] * len(model.config.id2label)
for key, val in minimal_number_labels.items():
    for v in val:
        maximal_number_labels[v] = key

print(f"maximal_number_labels: {maximal_number_labels}")

# If class indices are available (get_sentiment(..., argmax=True)), they can be
# mapped to polarities with:
# remapped_sentiment = [maximal_number_labels[sent] for sent in sentiments]

# Plot one line per emotion class (7 lines) representing the sentiments

In [None]:
from scipy.signal import savgol_filter

def validate_data(data):
    """
    Ensure the data contains only finite numbers.

    Parameters:
    data (numpy array): The input data array.

    Returns:
    numpy array: A copy of the data in which NaN, +inf and -inf are replaced by 0.
    """
    # np.nan_to_num maps +/-inf to the largest finite floats by default, which
    # would wreck any later smoothing, so request zeros explicitly.
    return np.nan_to_num(data, nan=0.0, posinf=0.0, neginf=0.0)

# Apply Savitzky-Golay filter to smooth the running averages
def smooth_data(data, window_size, polyorder=2):
    """
    Smooth data using Savitzky-Golay filter.

    Non-finite values are first replaced via validate_data. The window is
    adjusted where necessary: an even size is bumped to the next odd number, a
    size not larger than polyorder is raised above it, and a size longer than
    the data is shrunk to the largest odd length that fits.

    Parameters:
    data (numpy array): The input data array (filtered along its last axis).
    window_size (int): The requested window size for the filter.
    polyorder (int): The order of the polynomial used to fit the samples. Default is 2.

    Returns:
    numpy array: The smoothed data. If the data is too short to fit a
    polynomial of the requested order, the validated data is returned unsmoothed.
    """
    data = np.asarray(validate_data(data))
    n_samples = data.shape[-1]

    if window_size % 2 == 0:
        window_size += 1
    if window_size <= polyorder:
        window_size = polyorder + 1
        if window_size % 2 == 0:
            window_size += 1

    # savgol_filter (default mode 'interp') rejects a window longer than the
    # data, so fall back to the largest odd window that still fits.
    if window_size > n_samples:
        window_size = n_samples if n_samples % 2 == 1 else n_samples - 1
        if window_size <= polyorder:
            # Not enough samples for the polynomial fit - nothing to smooth.
            return data.astype(np.float64)

    return savgol_filter(data, window_size, polyorder)

def running_average(data, window_size):
    """
    Compute the trailing moving average of a sequence.

    Parameters:
    data (list or numpy array): The values to average.
    window_size (int): Number of consecutive values in each average.

    Returns:
    numpy array: Float64 array with the same shape as the input. Entry k holds
    the mean of the window ending at k; the first window_size - 1 entries (or
    all entries, if the input is shorter than the window) are NaN.
    """
    values = np.array(data)
    averaged = np.full(values.shape, np.nan, dtype=np.float64)

    # Too few samples for even one complete window: everything stays NaN.
    if len(values) < window_size:
        return averaged

    # 'valid' mode yields one sum per complete window.
    window_sums = np.convolve(values, np.ones(window_size), 'valid')
    averaged[window_size - 1:] = window_sums / window_size
    return averaged

# Show only a small sample; the full array has one row per segment.
print(f"all_sentiments shape: {all_sentiments.shape}")
print(all_sentiments[:5])

window_size = 10
running_avg_sentiments = np.array([running_average(all_sentiments[:, i], window_size) for i in range(all_sentiments.shape[1])]).T

# The first window_size - 1 rows are NaN padding, so preview the rows after them.
print(running_avg_sentiments[window_size - 1:window_size + 4])

smoothed_sentiments = np.array([smooth_data(running_avg_sentiments[:, i], 11, 4) for i in range(running_avg_sentiments.shape[1])]).T

# Take the label of each score column from the model itself, so the legend
# cannot drift out of sync with the checkpoint's class order.
emotion_labels = [model.config.id2label[i] for i in range(all_sentiments.shape[1])]

# Plot sentiment timeline with running average: raw scores as faint lines,
# the smoothed running average on top in the same colour.
plt.figure(figsize=(15, 5))
for i, emotion in enumerate(emotion_labels):
    raw_line, = plt.plot(all_sentiments[:, i], lw=1, alpha=0.25)
    plt.plot(smoothed_sentiments[:, i], label=emotion, color=raw_line.get_color(), lw=3)
plt.xlabel('Segment')
plt.ylabel('Sentiment Score')
plt.title('Sentiment Timeline with Running Average')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()

plt.show()



# Now plot the negative, neutral and positive curves

In [None]:
# Show which model class indices feed each supergroup column.
for i, (key, val) in enumerate(minimal_number_labels.items()):
    print(f"i: {i}, key: {key}, val:{val}")

# One column per supergroup, in the insertion order of minimal_number_labels
# (negative, neutral, positive).
supergroups = np.zeros([all_sentiments.shape[0], len(minimal_number_labels)])

# A supergroup's score is the summed probability of its member classes.
for idx, val in enumerate(minimal_number_labels.values()):
    supergroups[:, idx] = all_sentiments[:, val].sum(axis=1)


window_size = 10
running_avg_supergroups = np.array([running_average(supergroups[:, i], window_size) for i in range(supergroups.shape[1])]).T

smoothed_supergroups = np.array([smooth_data(running_avg_supergroups[:, i], 11, 4) for i in range(running_avg_supergroups.shape[1])]).T
# The polynomial fit can overshoot, so clip back into the valid score range.
smoothed_supergroups = smoothed_supergroups.clip(min=0, max=1)
print(f"Shape of smoothed supergroups: {smoothed_supergroups.shape}")


plt.figure(figsize=(15, 5))
colors = ["tab:red", "tab:purple", "tab:blue"]
for i, emotion in enumerate(['Negative', 'Neutral', 'Positive']):
    plt.plot(smoothed_supergroups[:, i], label=emotion, color=colors[i], lw=3)
plt.xlabel('Segment')
plt.ylabel('Sentiment Score')
plt.title('Reduced Sentiment score')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()

plt.show()

# Spot-check one segment. Guard the index: the number of segments depends on
# the book and on SPLIT, so a fixed row may not exist.
probe_idx = 531
if probe_idx < smoothed_supergroups.shape[0]:
    print(f"smoothed_supergroups at {probe_idx}: {smoothed_supergroups[probe_idx, :]}")
else:
    print(f"Segment {probe_idx} is out of range; only {smoothed_supergroups.shape[0]} segments available.")

In [None]:
# Collapse the three supergroup curves into a single overall score:
# each probability is weighted by its polarity (-1, 0, 1) and the
# weighted columns are summed per segment.

overall_scores = np.ones_like(smoothed_supergroups)
overall_scores[:, :3] = (-1, 0, 1)
print(overall_scores[:5, :])
print(smoothed_supergroups[:5, :])
overall_scores = overall_scores * smoothed_supergroups

print(overall_scores.shape)


plt.figure(figsize=(15, 5))
plt.plot(np.sum(overall_scores, axis=1), label="Overall Sentiment", color="tab:blue", lw=3)
plt.gca().set(
    xlabel='Segment',
    ylabel='Reduced Sentiment Score',
    title='Reduced Sentiment score',
)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.ylim(-1, 1)
plt.tight_layout()

plt.show()



In [None]:
# Close the current matplotlib figure.
# NOTE(review): the cells above create three figures, and plt.close() without
# an argument closes only the current one - confirm whether plt.close('all')
# was intended.
plt.close()