In [1]:
RANDOM_SEED = 42
LLM_MODEL = "llama3.1:8b"
MIN_TWEET_TOKENS = 15
LLM_SAMPLE_SIZE = 6000
GENERATE_RAW_TRAINING_DATA = False
GENERATE_LLM_TRAINING_DATA = False
TRAIN_MODEL = True

In [3]:
import csv
import langdetect
import numpy as np
import pandas as pd
import random
import re
import scipy
import sqlite3
import time
import torch
import torchview

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from datasets import Dataset
from IPython.display import display, HTML, clear_output
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from torch import nn
from tqdm import tqdm
from transformers import BertModel, BertTokenizer


class MSEPlot:
    """Ugly code! Dynamically updates plots of MSE so that one can visualize 
    error in real-time as a model trains.
    """
    def __init__(self):
        plt.ion()
        self.fig, self.ax = plt.subplots(1, 1)
        self.is_flushed = False

    def update(self, train_mse, validation_mse, title="", include_mse=True, epoch=None, batch=None, max_batches=None):
        if len(title) > 0:
            title = f"{title}\n"

        if epoch is not None and batch is not None:
            if max_batches is not None:
                title = f"{title}\nEpoch: {epoch}, {batch/max_batches*100:.0f}%"
            else:
                title = f"{title}\nEpoch: {epoch}, Batch: {batch}"
        elif epoch is not None:
            title = f"{title}\nEpoch: {epoch}"

        if include_mse and len(train_mse) > 0:
            title = f"{title}\nTrain: {train_mse[-1]:.3f}, Validation: {validation_mse[-1]:.3f}"
        
        self.ax.clear()
        self.ax.plot(train_mse)
        self.ax.plot(validation_mse)
        self.ax.legend(["Training", "Validation"], loc="upper right")
        if len(title) > 0:
            self.ax.set_title(title, fontsize=10)
        display(self.fig)
        clear_output(wait=True)
        self.is_flushed = False

    def flush(self):
        plt.ioff()
        plt.show()
        self.is_flushed = True
    
    def __del__(self):
        if not self.is_flushed:
            self.flush()

In [4]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

device = "cuda"
llm = OllamaLLM(model=LLM_MODEL)

In [5]:
if GENERATE_RAW_TRAINING_DATA:
    records = []
    
    # dataset: https://www.kaggle.com/datasets/kazanova/sentiment140
    with open("/home/mallinger/Code/data/tone-classifier/training.1600000.processed.noemoticon.csv", mode="r") as fh:
        csv_reader = csv.reader(fh)
        while True:
            try:
                row = next(csv_reader)
                text = row[5]
                if langdetect.detect(text) == 'en':
                    records += [("sentiment140", text)]
            except (langdetect.LangDetectException, UnicodeDecodeError, IndexError) as e:
                pass
            except StopIteration:
                break
    
    raw_text_df = pd.DataFrame(records, columns=["source", "text"])
    raw_text_df.to_pickle("/home/mallinger/Code/data/tone-classifier/raw_text_df")

In [6]:
TONES = {"formal": ("informal and friendly, characterized by bubbly and outgoing language", "formal and distanced, characterized by flat and unemotional language"),
         "optimistic": ("pessimistic, characterized by a negative outlook", "optimistic, characterized by positive, forward-looking language"),
         "apologetic": ("defensive, characterized by denial and pushback", "apologetic, characterized by regret and a focus on restitution"),
         "compassionate": ("insensitive, characterized by a disregard for the feelings of others", "warm, gentle, and empathetic, characterized by a deep concern for another's well-being or struggles"),
        }
template = ChatPromptTemplate.from_messages([("system", 
                                              """You are a helpful chatbot that is trained to rewrite text in different tones.
                                              For example, you can rewrite a text message to be more happy and cheerful.

                                              Rewrite the text submitted by the user to be very {description}.  
                                              Do not print any other text or characters.
                                              """),
                                             ("human", 
                                              "{text}"
                                             )])

def get_tone_examples(text):
    records = []
    for tone, (negative_description, positive_description) in TONES.items():
        negative_example_text = llm.invoke(template.format_messages(description=negative_description, text=text), keep_alive=0)
        positive_example_text = llm.invoke(template.format_messages(description=positive_description, text=text), keep_alive=0)
        records += [(tone, text, positive_example_text, negative_example_text)]
    return records


if GENERATE_LLM_TRAINING_DATA:
    records = []
    raw_text_df = pd.read_pickle("/home/mallinger/Code/data/tone-classifier/raw_text_df")
    corporus = random.sample([text for text in raw_text_df["text"] if len(re.split(r"\s+", text)) >= MIN_TWEET_TOKENS], LLM_SAMPLE_SIZE)
    with ProcessPoolExecutor(max_workers=32) as executor:
        futures = [executor.submit(get_tone_examples, text) for text in corporus]
        for future in tqdm(as_completed(futures), total=len(corporus)):
            records += [future.result()]
        records = sum(records, [])  # flatten
    text_df = pd.DataFrame(records, columns=["topic", "original", "positive", "negative"])
    text_df.to_pickle("/home/mallinger/Code/data/tone-classifier/text_df")

In [10]:
torch.cuda.device_count()



0

In [8]:
def get_model_df(text_df, tone, bert_model="bert-base-cased"):
    df = text_df[text_df["topic"] == tone]
    tokenizer = BertTokenizer.from_pretrained(bert_model)

    positives = [tokenizer(text, padding="max_length", max_length=512, truncation=True, return_tensors="pt") for text in df["positive"]]
    negatives = [tokenizer(text, padding="max_length", max_length=512, truncation=True, return_tensors="pt") for text in df["negative"]]

    return pd.concat([pd.DataFrame({"score": np.ones(len(df)), "text": df["positive"], "bert_input": positives}),
                      pd.DataFrame({"score": np.zeros(len(df)), "text": df["negative"], "bert_input": negatives})
                     ]).reset_index()


def get_model(hidden_layer_width, bert_model="bert-base-cased", device="cpu"):
    bert_model = BertModel.from_pretrained(bert_model)
    bert_final_layer_size = len(list(bert_model.parameters())[-1])
    return nn.Sequential(
                         bert_model,
                         nn.Linear(in_features=bert_final_layer_size, out_features=hidden_layer_width),
                         nn.BatchNorm1d(hidden_layer_width),
                         nn.ReLU(),
                         nn.Linear(hidden_layer_width, 1)
                     ).to(device=device)


if TRAIN_MODEL:
    text_df = pd.read_pickle("/home/mallinger/Code/data/tone-classifier/text_df")
    formal_df = get_model_df(text_df, "formal")
    formal_model = get_model(128, device=device)
    
    sample_mask = pd.qcut(np.random.rand(len(formal_df)), 10, labels=False)
    X_train = formal_df.iloc[sample_mask >= 2, "text"]
    X_train

RuntimeError: No CUDA GPUs are available

In [16]:




formal_df = get_model_df(text_df, "formal")

sample_mask = pd.qcut(np.random.rand(len(interest_model_df)), 7, labels=False)
interest_model_train_df = interest_model_df.iloc[sample_mask >= 2]
interest_model_validation_df = interest_model_df.iloc[sample_mask == 0]
interest_model_test_df = interest_model_df.iloc[sample_mask == 1]

# CUDA test and train datasets
X_train = torch.tensor(interest_model_train_df[list(valid_book_categories)].to_numpy(), device=device, dtype=torch.float32)
y_train = torch.tensor(interest_model_train_df["delta_zscore"].to_numpy(), device=device, dtype=torch.float32)
X_validation = torch.tensor(interest_model_validation_df[list(valid_book_categories)].to_numpy(), device=device, dtype=torch.float32)
y_validation = torch.tensor(interest_model_validation_df["delta_zscore"].to_numpy(), device=device, dtype=torch.float32)
X_test = torch.tensor(interest_model_test_df[list(valid_book_categories)].to_numpy(), device=device, dtype=torch.float32)
y_test = torch.tensor(interest_model_test_df["delta_zscore"].to_numpy(), device=device, dtype=torch.float32)


# building a neural network to predict algorithm error.
hidden_layer_width = 12
interest_model = nn.Sequential(
                     nn.Linear(in_features=X_train.shape[1], out_features=hidden_layer_width),
                     nn.BatchNorm1d(hidden_layer_width),
                     nn.ReLU(),
                     nn.Dropout(p=0.3),
                     nn.Linear(in_features=hidden_layer_width, out_features=hidden_layer_width),
                     nn.BatchNorm1d(hidden_layer_width),
                     nn.ReLU(),
                     nn.Dropout(p=0.1),
                     nn.Linear(hidden_layer_width, 1)
                 ).to(device=device)

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(interest_model.parameters(), lr=0.00001, weight_decay=0.0075)

n_epochs = 150
batch_size = 12
# store (and plot) the MSE by epoch for train and test
mse_history_train, mse_history_validation = [], []
# plot the change in MSE over time
mse_plot = MSEPlot()

for epoch in range(n_epochs):
    interest_model.train()
    indices = torch.randperm(len(X_train))
    batches = len(X_train) // batch_size
    for batch in range(batches):
        # random batch.  todo: use a dataloader here
        slice_ = indices[batch*batch_size:batch*batch_size+batch_size]
        # occassional updates about batch progress
        if batch % 2500 == 0:
            mse_plot.update(mse_history_train, mse_history_validation, epoch=epoch, batch=batch, max_batches=batches)
        
        X_batch = X_train[slice_]
        y_batch = y_train[slice_]
        y_pred = interest_model(X_batch).squeeze()
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # calculate MSE per epoch
    interest_model.eval()
    with torch.inference_mode():
        y_pred = interest_model(X_train).squeeze()
        train_mse = float(loss_fn(y_pred, y_train))
        mse_history_train.append(train_mse)
        y_pred = interest_model(X_validation).squeeze()
        validation_mse = float(loss_fn(y_pred, y_validation))
        mse_history_validation.append(validation_mse)

    # update graph
    mse_plot.update(mse_history_train, mse_history_validation, epoch=epoch, batch=batch, max_batches=batches)

# ensure graph renders above additional information
mse_plot.flush()

# prediction datasets
interest_model.eval()
with torch.inference_mode():
    y_train_pred = interest_model(X_train).squeeze()
    y_validation_pred = interest_model(X_validation).squeeze()
    y_test_pred = interest_model(X_test).squeeze()


# we expect the overall correlation to be weak- after all, if we can predict ŷ - y easily, then the model
# is missing something super obvious in the data.
print("Overall y~y_pred correlation")
print("Train:")
print(scipy.stats.spearmanr(y_train.cpu().numpy(), y_train_pred.cpu().numpy()))
print("Validation:")
print(scipy.stats.spearmanr(y_validation.cpu().numpy(), y_validation_pred.cpu().numpy()))

# render model architecture
interest_model_graph = torchview.draw_graph(interest_model, input_size=(batch_size, X_validation.shape[1]))
display(interest_model_graph.visual_graph)

Unnamed: 0,score,text
0,1.0,@jennheartsdavid: It is acknowledged that Davi...
4,1.0,It is being suggested that Joan Rivers conside...
8,1.0,Notation of Appreciation for @gloriavelez's Pr...
12,1.0,It is proposed that arrangements be made for a...
16,1.0,"@skullnik, a greeting has been extended to you..."
...,...,...
10996,0.0,omg i just ordered the cutest birthday dress E...
11000,0.0,OMG exam week is FINALLY over!!! I'm soooo don...
11004,0.0,Haha omg yaaas Razzle u crack me up!!! dont be...
11008,0.0,"Don't stress, we dodged a bullet this time! On..."


topic                                                                                                                                                                             formal
original                                        @jennheartsdavid hahaha. i know right!! i was like, omgsh david. he makes me laugh lol.  but yeah. he always has great song suggestions!
positive    @jennheartsdavid: It is acknowledged that David possesses a certain capacity for inducing laughter. Additionally, his song recommendations are consistently of high quality.
negative                                                            omigosh david is the MAN!!! ikr?! he's literally the best thing since sliced bread lol his music taste is on POINT!!
Name: 0, dtype: object