In [1]:
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import json
import os

In [2]:
# --- Tunable settings ---
# Mini-batch size; used as the default `batch_size` of full_pipeline.
NUM_SEQUENCES_PER_BATCH = 128

# Training CSVs; both are read for their "tone" and "prev_line" columns.
TRAIN_FILE_1, TRAIN_FILE_2 = (
    "data/jchat_paired.csv",
    "data/chigiri_train_w_tone.csv",
)

In [3]:
# Union of every distinct "tone" value across both training CSVs.
# Sorting pins the tone -> integer-id order so it is identical on every run.
labels = sorted(
    set().union(
        *(pd.read_csv(csv_path)["tone"] for csv_path in (TRAIN_FILE_1, TRAIN_FILE_2))
    )
)

In [4]:
def readin_data(file_path, labels):
    """
    Read a CSV and convert it into a list of training records.

    Each row's "tone" value is replaced by its position in `labels`, and the
    row is emitted as ``{"prev_line": <text>, "label": <int>}``.

    Args:
        file_path: path of a CSV file with "prev_line" and "tone" columns
        labels: full sorted list of tone labels; a tone's position in this
            list is its integer class id

    Return:
        formatted_data: a list of dictionaries of prev_line and integer label of tone

    Raises:
        ValueError: if the file contains a tone that is not in `labels`
    """
    df_data = pd.read_csv(file_path)

    # Build the tone -> id table once instead of calling labels.index() for
    # every row (a linear scan per row). Walking the list in reverse makes the
    # FIRST position win for a duplicated label, exactly like list.index().
    label_to_index = {tone: idx for idx, tone in reversed(list(enumerate(labels)))}

    tones = df_data["tone"]
    prev_lines = df_data["prev_line"]

    formatted_data = []
    for prev, tone in zip(prev_lines, tones):
        try:
            label = label_to_index[tone]
        except KeyError:
            # Same exception type list.index() raised, but with enough context
            # to find the offending row's file.
            raise ValueError(
                f"tone {tone!r} in {file_path!r} is not in the list of labels"
            ) from None
        formatted_data.append({"prev_line": prev, "label": label})

    return formatted_data

In [5]:
# Hugging Face checkpoint id shared by the tokenizer and the encoder; kept in
# one place so the two can never be loaded from different checkpoints.
BERT_MODEL_NAME = "cl-tohoku/bert-base-japanese"

# load the tohoku tokenizer and bert model
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
bert_model = AutoModel.from_pretrained(BERT_MODEL_NAME)

def get_embeddings_batch(texts: list[str], tokenizer, model, batch_size=16):
    """
    Embed texts in mini-batches and return one vector per text.

    Each batch is tokenized (padded and truncated), run through `model` with
    gradient tracking disabled, and the hidden state at sequence position 0
    of every text is kept (the [CLS] position for BERT-style tokenizers).

    Args:
        texts: a list of text to be embedded.
        tokenizer: tokenizer; called with return_tensors="pt" and expected to
            return a mapping of tensors
        model: embedding model whose output exposes `last_hidden_state`;
            it is switched to eval mode
        batch_size: number of texts per forward pass

    Return:
        embedded_torch: CPU tensor with one row per input text. For empty
            `texts` a tensor with zero rows is returned.
    """
    model.eval()

    # torch.cat() raises on an empty list, so answer the empty case directly.
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    # Run on whatever device the model lives on; the tokenizer always builds
    # its tensors on CPU, so they have to be moved before the forward pass.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_embeddings = []

    # loop through sentences in batch_size and do embedding
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Collect on CPU so the result can be cached/split regardless of
        # where the model ran.
        all_embeddings.append(cls_embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)

In [6]:
# Embedding is slow, so the embedded inputs and their labels are cached in
# these files and reloaded on later runs instead of being recomputed.
DATA1_EMBED_PATH, DATA1_LABEL_PATH = "Jchat_X_embeddings.pt", "Jchat_y_labels.pt"
DATA2_EMBED_PATH, DATA2_LABEL_PATH = "Chigiri_X_embeddings.pt", "Chigiri_y_labels.pt"

In [7]:
def create_dataloaders(X: list[str], y: list[int], embed_path, label_path, batch_size: int,
                       test_pct: float = 0.2, shuffle: bool = True,
                       tokenizer=None, model=None, random_state=None):
    """
    Do embedding (or load it from cache) and create dataloaders for training.

    If both cache files exist they are loaded and `X`/`y` are NOT re-embedded;
    a warning is printed when the cache does not match the data passed in.
    Otherwise the texts are embedded and both tensors are saved to the cache
    paths.

    Args:
        X: text of previous lines
        y: tone labels in integer
        embed_path: embedding path for pre-done embedding or to save the current embedding
        label_path: label path for pre-done labels or to save the current labels
        batch_size: batch size of the returned DataLoaders
        test_pct: fraction of the data held out as the test set
        shuffle (bool): whether the training DataLoader reshuffles every epoch
        tokenizer: pre-trained Tohoku tokenizer (only used when there is no cache)
        model: pre-trained Tohoku bert model (only used when there is no cache)
        random_state: seed forwarded to train_test_split; None (the default)
            keeps the split random on every call

    Return:
        train_loader, test_loader: tensor data loader for FFNN training
    """
    # use the embedded file if existed
    if os.path.exists(embed_path) and os.path.exists(label_path):
        print("Found cached embeddings")
        X_tensor = torch.load(embed_path)
        y_tensor = torch.load(label_path)

        # The cache is keyed only by file name, so it goes stale when the CSV
        # or the label ordering changes. Make that visible instead of silently
        # training on outdated tensors.
        if X is not None and y is not None and (
            len(X_tensor) != len(X) or y_tensor.tolist() != list(y)
        ):
            print(f"WARNING: cached tensors do not match the data passed in; "
                  f"delete {embed_path} and {label_path} to re-embed.")
    else:
        print("No cached embeddings found.")
        y_tensor = torch.tensor(y)
        X_tensor = get_embeddings_batch(X, tokenizer, model)

        # save the embedding for future use
        torch.save(X_tensor, embed_path)
        torch.save(y_tensor, label_path)
        print("Saved embeddings to disk.")

    # Train/test split, stratified so both parts keep the label proportions
    X_train, X_test, y_train, y_test = train_test_split(
        X_tensor, y_tensor, test_size=test_pct, stratify=y_tensor,
        random_state=random_state,
    )

    # create train and test tensordataset
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)

    # create data loader; the held-out set is never shuffled so evaluation
    # batches come out in a stable order
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader


In [15]:
# FFNN setting mostly from hw4
class FFNN(nn.Module):
    """
    Feed-forward classifier: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: size of each input vector
        hidden_units: width of the hidden layer
        num_classes: number of output logits; when None it is taken from the
            length of the notebook-level `labels` list at construction time
        dropout: dropout probability applied after the hidden activation
    """

    def __init__(self, input_dim=768, hidden_units=128, num_classes=None, dropout=0.3):
        super().__init__()
        if num_classes is None:
            # Looked up when the model is built rather than when the class is
            # defined, so re-running the labels cell cannot leave a stale size.
            num_classes = len(labels)
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_units, num_classes)

    def forward(self, X):
        """Return the raw class logits for a batch of input vectors."""
        X = self.fc1(X)
        X = self.relu(X)
        X = self.dropout(X)
        return self.fc2(X)

def train(dataloader, model, epochs=3, lr: float = 1e-4) -> None:
    """
    Train the FFNN in place with Adam and cross-entropy loss.

    Prints the mean batch loss after every epoch. If the dataloader yields no
    batches, a message is printed and training stops instead of dividing by
    zero.

    Args:
        dataloader: dataloader yielding (inputs, integer labels) batches
        model: FFNN model to be trained
        epochs: # of iteration to train
        lr: learning rate
    """
    # initialize Adam optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    # Batches are moved to wherever the model's weights live, so the loop
    # works unchanged for a model that was placed on an accelerator.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        # change to training mode
        model.train()

        total_loss = 0
        batches_seen = 0

        # loop through each batch
        for X, y in tqdm(dataloader, desc=f"EPOCH {epoch+1}: Training Progress"):
            X, y = X.to(device), y.to(device)

            # Zero the gradients for every batch
            optimizer.zero_grad()

            # Make predictions for this batch
            outputs = model(X)

            # Compute the loss and its gradients
            loss = loss_fn(outputs, y)
            total_loss += loss.item()
            loss.backward()

            # update weights
            optimizer.step()
            batches_seen += 1

        if batches_seen == 0:
            # Nothing to average; further epochs would be empty as well.
            print("No batches in dataloader; nothing to train on.")
            return

        print(f"Average Loss = {(total_loss / batches_seen):.4f}")


In [16]:
def full_pipeline(file_path, labels, embed_path, label_path, batch_size: int = NUM_SEQUENCES_PER_BATCH,
                  hidden_units=128, epochs=3, lr=0.001,
                  test_pct=0.2, trained_model=None):
    """
    Full pipeline to run the training process: read the CSV, build the
    dataloaders (embedding or loading cached embeddings), then train.

    Args:
        file_path: file path of the paired csv
        labels: collection of tone labels from all datasets; its length is the
            number of classes of a newly created model
        embed_path, label_path: path for pre-done embedding or to save the current embedding
        batch_size: batch_size to train the model
        hidden_units: hidden units of FFNN (ignored when `trained_model` is given)
        epochs: # of iteration to train
        lr: learning rate
        test_pct: % of test set
        trained_model: previous trained FFNN model to keep training; a new
            FFNN is created when this is None

    Returns:
        model: trained FFNN model
    """
    # read in data and change format
    data = readin_data(file_path, labels)

    # separate X, y
    X = [d["prev_line"] for d in data]
    y = [d["label"] for d in data]

    # dataloader (the held-out loader is built but not used by this function)
    train_loader, test_loader = create_dataloaders(X, y, embed_path, label_path,
                                                   batch_size, test_pct, tokenizer=tokenizer, model=bert_model)

    print("Finish processing data")

    if trained_model is None:
        # initialize model; the output size comes from the `labels` argument so
        # it always matches the ids produced by readin_data above
        model = FFNN(input_dim=768, hidden_units=hidden_units, num_classes=len(labels))
    else:
        model = trained_model

    print("Finish initializing model, start training")

    # train
    train(train_loader, model, epochs, lr)

    return model


In [12]:
# Stage 1: train a fresh classifier on the J-Chat data.
half_model = full_pipeline(
    file_path=TRAIN_FILE_1,
    labels=labels,
    embed_path=DATA1_EMBED_PATH,
    label_path=DATA1_LABEL_PATH,
    epochs=6,
)

✅ Found cached embeddings. Loading from disk...
Finish processing data
Finish initializing model, start training


EPOCH 1: Training Progress: 100%|██████████████| 52/52 [00:00<00:00, 371.29it/s]


Average Loss = 1.7554


EPOCH 2: Training Progress: 100%|██████████████| 52/52 [00:00<00:00, 477.86it/s]


Average Loss = 1.5274


EPOCH 3: Training Progress: 100%|██████████████| 52/52 [00:00<00:00, 534.27it/s]


Average Loss = 1.4982


EPOCH 4: Training Progress: 100%|██████████████| 52/52 [00:00<00:00, 513.87it/s]


Average Loss = 1.4900


EPOCH 5: Training Progress: 100%|██████████████| 52/52 [00:00<00:00, 454.35it/s]


Average Loss = 1.4732


EPOCH 6: Training Progress: 100%|██████████████| 52/52 [00:00<00:00, 312.36it/s]

Average Loss = 1.4844





# Continue training on Dataset 2 (Chigiri)

In [20]:
# Stage 2: keep training the stage-1 model on the Chigiri-only data.
new_model = full_pipeline(
    file_path=TRAIN_FILE_2,
    labels=labels,
    embed_path=DATA2_EMBED_PATH,
    label_path=DATA2_LABEL_PATH,
    epochs=10,
    trained_model=half_model,
)

✅ Found cached embeddings. Loading from disk...
Finish processing data
Finish initializing model, start training


EPOCH 1: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 274.09it/s]


Average Loss = 2.6132


EPOCH 2: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 271.77it/s]


Average Loss = 2.5785


EPOCH 3: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 206.01it/s]


Average Loss = 2.6363


EPOCH 4: Training Progress: 100%|█████████████████| 2/2 [00:00<00:00, 56.99it/s]


Average Loss = 2.5428


EPOCH 5: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 283.44it/s]


Average Loss = 2.4756


EPOCH 6: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 286.37it/s]


Average Loss = 2.4563


EPOCH 7: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 234.86it/s]


Average Loss = 2.4761


EPOCH 8: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 242.68it/s]


Average Loss = 2.5026


EPOCH 9: Training Progress: 100%|████████████████| 2/2 [00:00<00:00, 323.83it/s]


Average Loss = 2.5616


EPOCH 10: Training Progress: 100%|███████████████| 2/2 [00:00<00:00, 322.85it/s]

Average Loss = 2.5560





In [21]:
# save the model (weights only, as a state_dict)
MODEL_PATH = "models/tone_classifier.pt"

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

torch.save(new_model.state_dict(), MODEL_PATH)