## Setup

In [None]:
import glob

import heapq

import re

from sklearn.model_selection import KFold

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import mode

from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm

from sklearn.metrics import classification_report, confusion_matrix

from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

import torch
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import LabelEncoder

import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
# Register tqdm's pandas integration so the `progress_apply` calls used by the
# helpers below are available on Series/DataFrame objects.
tqdm.pandas()

## Helper Functions

In [None]:
def remove_author(string: str, two_layer=False):
    """Strip the leading ``author:`` prefix from a chat line.

    Everything up to and including the first colon is dropped; a string that
    contains no colon is returned unchanged. With ``two_layer`` set, the same
    stripping is applied a second time to the result, removing two nested
    prefixes.
    """
    passes = 2 if two_layer else 1
    for _ in range(passes):
        _, colon, remainder = string.partition(':')
        if colon:
            string = remainder
    return string

def process_chat_message(string: str, two_layer=False, process_msg=True):
    """Return ``model.encode`` of a chat message.

    When ``process_msg`` is true the author prefix is first removed with
    ``remove_author`` (honouring ``two_layer``); otherwise the string is
    encoded as given.

    ``model`` is a module-level name that must provide ``.encode``.
    NOTE(review): a later notebook cell binds ``model`` to a
    ``BERTClassifier``, which defines no ``encode`` method -- presumably a
    SentenceTransformer instance was intended here; confirm before use.
    """
    text = remove_author(string, two_layer=two_layer) if process_msg else string
    return model.encode(text)

def add_training_embeddings(df):
    """Add an ``embedding`` column computed from ``df['Input']``.

    Each input is passed through ``process_chat_message`` with
    ``two_layer=True`` (with a tqdm progress bar). ``df`` is modified in
    place and also returned.
    """
    df['embedding'] = df['Input'].progress_apply(process_chat_message, two_layer=True)
    return df

def get_trained_centroids(df, cats):
    """Return ``{category: mean embedding}`` for the given category columns.

    For every name in ``cats`` the rows where that column equals 1 are
    selected and the ``.mean()`` of their ``embedding`` values is stored.
    (Presumably an element-wise mean when the embeddings are arrays -- TODO
    confirm against the embedding type actually stored.)
    """
    return {cat: df.loc[df[cat] == 1, 'embedding'].mean() for cat in cats}

def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors as a scalar.

    Both inputs are reshaped to a single row because
    ``cosine_similarity`` works on 2-D arrays; the only entry of the
    resulting 1x1 matrix is returned.
    """
    row_a = np.array(vector1).reshape(1, -1)
    row_b = np.array(vector2).reshape(1, -1)
    return cosine_similarity(row_a, row_b)[0, 0]

def get_most_similar_category(emb: np.array, d_avg: dict):
    """Return the key of ``d_avg`` whose vector is most cosine-similar to ``emb``.

    On ties the first key in ``d_avg`` iteration order wins.
    """
    scores = {}
    for category, centroid in d_avg.items():
        scores[category] = calculate_cosine_similarity(emb, centroid)
    return max(scores, key=scores.get)

def eval_classification(df):
    """Score the ``pred`` column of ``df`` against its ``label`` column.

    Returns a tuple ``(report, matrix, acc)``: the classification report as a
    dict, the confusion matrix, and the accuracy.
    """
    y_true, y_pred = df['label'], df['pred']
    return (
        classification_report(y_true, y_pred, output_dict=True),
        confusion_matrix(y_true, y_pred),
        accuracy_score(y_true, y_pred),
    )

def get_test_folds(df, n_splits=10, random_state=None):
    """Split ``df`` into shuffled K-fold train/test pairs.

    Parameters
    ----------
    df : DataFrame to split row-wise.
    n_splits : number of folds.
    random_state : seed forwarded to ``KFold``. The default ``None`` keeps the
        previous behaviour (a different shuffle on every call); pass an int to
        make the folds reproducible.

    Returns
    -------
    list of ``(df_train, df_test)`` tuples, one per fold. Both frames are
    copies, so callers may add columns to them without touching ``df``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return [
        (df.iloc[train_index].copy(), df.iloc[test_index].copy())
        for train_index, test_index in kf.split(df)
    ]

def run_cv(df_train, df_test, cats):
    """Evaluate nearest-centroid classification on one train/test fold.

    Centroids are computed from ``df_train``; every ``embedding`` in
    ``df_test`` is assigned the most similar category, which is written to a
    new ``pred`` column of ``df_test`` (modified in place). Returns the tuple
    produced by ``eval_classification``.
    """
    centroids = get_trained_centroids(df_train, cats)

    def nearest_category(emb):
        return get_most_similar_category(emb, centroids)

    df_test['pred'] = df_test['embedding'].map(nearest_category)
    return eval_classification(df_test)

def get_avg_dict(d_dict):
    """Average a list of dicts key by key.

    The keys are taken from the first dict; each value in the result is the
    ``np.mean`` of that key across all dicts in ``d_dict``.
    """
    averaged = {}
    for key in d_dict[0].keys():
        averaged[key] = np.mean([entry[key] for entry in d_dict])
    return averaged

def get_all_avg_dicts(reports, cats):
    """Average the per-category entries of several report dicts.

    For every category in ``cats`` the entry ``report[cat]`` is collected from
    each report and averaged with ``get_avg_dict``.
    """
    averaged = {}
    for cat in cats:
        per_report = [report[cat] for report in reports]
        averaged[cat] = get_avg_dict(per_report)
    return averaged

def str_to_array(s):
    """Split a comma-separated string into a NumPy array of its string pieces."""
    pieces = s.split(',')
    return np.array(pieces)

def create_chats(d_tx, two_layer=False):
    """Extract the chat messages from a transaction log.

    The log is ordered by session and time, then reduced to rows whose
    ``Action`` is 'sendMessage', whose ``Student Response Subtype`` is not
    'tutor-performed', and whose ``Input`` is present and does not end in
    '_computer'. The result keeps the session, transaction, student and input
    columns, gets a fresh 0..n-1 index, and gains a ``chat_msg`` column holding
    the input with its author prefix removed (``two_layer`` is forwarded to
    ``remove_author``).
    """
    def _is_user_message(value):
        # Missing inputs are dropped; a '_computer' suffix marks system-performed messages
        if pd.isna(value):
            return False
        return value[-9:] != '_computer'

    ordered = d_tx.sort_values(by=['Session Id', 'Time'])
    keep = (
        (ordered['Action'] == 'sendMessage')
        & (ordered['Student Response Subtype'] != 'tutor-performed')
        & ordered['Input'].map(_is_user_message)
    )
    columns = ['Session Id', 'Transaction Id', 'Anon Student Id', 'Input']
    d_chats = ordered[keep][columns].copy().reset_index().drop(columns='index')
    d_chats['chat_msg'] = d_chats['Input'].map(lambda s: remove_author(s, two_layer=two_layer))
    return d_chats

def proc_s(input_string):
    """Normalise a string for matching.

    Steps, in order: lowercase, trim leading/trailing whitespace, delete every
    character that is neither a word character nor whitespace, and collapse
    each whitespace run to a single space. Because trimming happens before
    punctuation is removed, a space in front of trailing punctuation survives.
    """
    text = input_string.lower().strip()
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text)

# Fetch context window of k=2, i.e., up to 1,2,INPUT,4,5
def add_joined_input(df):
    """
    Creates a context window of chats for prediction.

    For every message in ``df['Input']`` the first row of the module-level
    ``d_chats`` whose ``Input_clean`` contains the cleaned message is located.
    The message is then replaced by up to two preceding and two following
    ``chat_msg`` values from the same session (plus the matched message
    itself), formatted as ``[CLS] m1 [SEP] m2 [SEP] ...``.

    Parameters
    ----------
    df : DataFrame with an 'Input' column; the column is overwritten in place.

    Returns
    -------
    The same ``df``.

    Raises
    ------
    IndexError
        If no row of ``d_chats`` contains one of the messages.

    Notes
    -----
    Relies on ``d_chats`` having a unique index so that a matched label maps
    to exactly one position inside its session.
    """
    joined_input = []
    for string in df['Input']:
        search_s = proc_s(string)

        # proc_s leaves only word characters and whitespace, so a literal
        # substring search gives the same matches as the regex search.
        hits = d_chats.index[d_chats['Input_clean'].str.contains(search_s, na=False, regex=False)]
        if len(hits) == 0:
            raise IndexError(f'No chat message in d_chats contains {search_s!r}')
        search_index = hits[0]

        # All rows belonging to the session of the matched message
        session_id = d_chats.at[search_index, 'Session Id']
        session_rows = d_chats[d_chats['Session Id'] == session_id]

        # `search_index` is an index *label* of d_chats, whereas the window is
        # cut by *position* within the session. Using the label directly with
        # iloc picked the wrong rows for every session not starting at row 0.
        position = session_rows.index.get_loc(search_index)
        window = session_rows['chat_msg'].iloc[max(0, position - 2):position + 3].tolist()

        # Every message is followed by [SEP]; the whole window is prefixed with [CLS]
        final_string = ' '.join(f'{message} [SEP]' for message in window).strip()
        joined_input.append('[CLS] ' + final_string)

    df['Input'] = joined_input

    return df

def add_joined_input_full(df):
    """
    Generate analogous context windows on the full chat data set
    to which the trained model is applied.

    For every row, the ``chat_msg_clean`` values of up to two preceding and
    two following rows of the same ``Session Id`` (in row order) are joined
    with the row's own message as ``[CLS] m1 [SEP] m2 [SEP] ...`` and stored
    in a new ``Input_scale`` column.

    Parameters
    ----------
    df : DataFrame with 'Session Id' and 'chat_msg_clean' columns; modified
        in place.

    Returns
    -------
    The same ``df`` with the added 'Input_scale' column.
    """
    session_ids = df['Session Id'].tolist()
    messages = df['chat_msg_clean'].tolist()

    # Row positions of every session, in row order. Windows are computed from
    # the position *within the session*; indexing the session slice with the
    # global row index produced wrong windows for every session that does not
    # start at row 0.
    rows_by_session = {}
    for row, session_id in enumerate(session_ids):
        if pd.isna(session_id):
            continue  # no session to draw context from; the row stays on its own
        rows_by_session.setdefault(session_id, []).append(row)

    window_rows = [[row] for row in range(len(messages))]
    for rows in rows_by_session.values():
        for position, row in enumerate(rows):
            window_rows[row] = rows[max(0, position - 2):position + 3]

    # Every message is followed by [SEP]; the whole window is prefixed with [CLS]
    df['Input_scale'] = [
        '[CLS] ' + ' '.join(f'{messages[row]} [SEP]' for row in rows).strip()
        for rows in window_rows
    ]

    return df

class TextClassificationDataset(Dataset):
    """Dataset pairing texts with labels, tokenised on access.

    Each item is a dict with the flattened ``input_ids`` and
    ``attention_mask`` produced by ``tokenizer`` (padded/truncated to
    ``max_length``) and the label as a tensor under ``label``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text, label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'label': torch.tensor(label),
        }

class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier."""

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # One output logit per class, fed from BERT's hidden size
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` in place.

    Each batch must be a dict with ``input_ids``, ``attention_mask`` and
    ``label`` tensors. For every batch the cross-entropy loss of the model
    output is back-propagated, then the optimizer and the scheduler are each
    stepped once. Returns nothing.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        loss = criterion(logits, batch['label'].to(device))
        loss.backward()
        optimizer.step()
        scheduler.step()

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without computing gradients.

    The predicted class of each sample is the index of its largest output.
    Returns ``(accuracy, report)`` where ``report`` is the text classification
    report.

    NOTE: a later notebook cell defines another ``evaluate`` that returns four
    values; once that cell has run it replaces this function.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            labels = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            predicted.extend(torch.max(logits, dim=1).indices.cpu().tolist())
            expected.extend(labels.cpu().tolist())
    accuracy = accuracy_score(expected, predicted)
    return accuracy, classification_report(expected, predicted)

def train_bert_classifier(model=None):
    """
    Build the train/validation loaders from ``df_bert`` and optionally train.

    The module-level ``df_bert`` ('Input' texts, 'label' names) is split 80/20
    with a fixed seed and wrapped in ``TextClassificationDataset`` loaders
    using the 'bert-base-uncased' tokenizer.

    If no model is passed, a new ``BERTClassifier`` is trained on the CPU for
    four epochs, evaluated on the validation loader after each epoch (results
    are printed), and its weights are saved to "bert_classifier-2.pth".
    If a model is passed into this function, no training takes place.

    Returns ``(val_dataloader, device)`` in both cases.
    """
    bert_model_name = 'bert-base-uncased'
    num_classes, max_length = 3, 128*2
    batch_size, num_epochs, learning_rate = 16, 4, 5e-5

    # Integer id of every high-level label
    d_labels = {
        'Minimal Participation': 0,
        'Facilitative participation': 1,
        'Constructive Participation': 2,
    }

    texts = df_bert['Input'].to_list()
    labels = df_bert['label'].map(d_labels).to_list()
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    device = torch.device("cpu")

    # A supplied model is not trained; only the validation data is handed back
    if model is not None:
        return val_dataloader, device

    model = BERTClassifier(bert_model_name, num_classes).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # Linear decay over all optimisation steps, without warm-up
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

    torch.save(model.state_dict(), "bert_classifier-2.pth")
    return val_dataloader, device

def predict_label(text, model, tokenizer, device, max_length=128*2):
    """Predict the class index of ``text``.

    The text is tokenised (padded/truncated to ``max_length``), run through
    ``model`` in eval mode without gradients, and the index of the largest
    output along dimension 1 is returned as a tensor (not a Python int).
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        return torch.max(logits, dim=1).indices

## Reading in Tutor Log Data


In [None]:
# Lift pandas' limit on the number of columns shown when displaying a DataFrame.
pd.set_option('display.max_columns', None)

In [None]:
# Raw transaction logs, one tab-separated export per school.
d_tx_1 = pd.read_csv('data/logdataschool1.txt', sep='\t', low_memory=False)
d_tx_2 = pd.read_csv('data/logdataschool2.txt', sep='\t', low_memory=False)
# NOTE(review): unlike the two files above, this path has no 'data/' prefix -- confirm it is intended.
d_tx_3 = pd.read_csv('logdataschool3.txt', sep='\t', low_memory=False)

# Chat messages per school; only school 1 has two author prefixes stripped.
d_chats_1 = create_chats(d_tx_1, two_layer=True)
d_chats_2 = create_chats(d_tx_2, two_layer=False)
d_chats_3 = create_chats(d_tx_3, two_layer=False)

d_tx = pd.concat([d_tx_1, d_tx_2, d_tx_3])
d_chats = pd.concat([d_chats_1, d_chats_2, d_chats_3])

# Drop messages that are empty once the author prefix is removed.
d_chats = d_chats.loc[d_chats['chat_msg'] != ''].copy()

# Normalised versions of the message text and of the raw input, used for matching.
d_chats['chat_msg_clean'] = d_chats['chat_msg'].map(proc_s)
d_chats['Input_clean'] = d_chats['Input'].map(proc_s)

# The concatenated frames repeat index values; renumber rows 0..n-1.
d_chats = d_chats.reset_index().drop(columns='index')

## Train Model

In [None]:
# Human-coded messages: one row per message, one column per code.
df = pd.read_csv('data/cleaned-apta-codings.csv')
df = df.dropna(subset=['Input'])  # Remove empty messages
# A missing code means "not assigned"; store every code column as an integer.
df = df.fillna(0)
df = df.astype({c: int for c in df.columns if c != 'Input'})

In [None]:
# Replace each coded message in df['Input'] with its context window built from d_chats.
df = add_joined_input(df)

In [None]:
# The three high-level participation labels the classifier predicts.
high_level_categories = [
    'Minimal Participation',
    'Facilitative participation',
    'Constructive Participation'
]
# Collapse the code columns into one 0/1 flag per high-level category: the flag
# is 1 when the numeric columns whose name contains the category name sum to
# at least 1 for that row.
for cat in high_level_categories:
    tmp = df[[c for c in df.columns if cat in c]].select_dtypes(include='number').copy()
    l = tmp.sum(axis=1).to_list()
    df[cat] = [1 if x>=1 else 0 for x in l]
    
# Remove ambiguous and unclassified labels
# (keep only rows where exactly one of the three flags is set)
df = df[df[high_level_categories].sum(axis=1) == 1].copy()

# Add label
# (the name of the single flag column that equals 1)
df['label'] = df[high_level_categories].idxmax(axis=1)

In [None]:
# BERT Classifier
# Training table read by train_bert_classifier: context-window text and label name.
df_bert = df[['Input', 'label']].copy()

In [None]:
# Train the classifier (called without a model) and save its weights.
# NOTE(review): train_bert_classifier returns (val_dataloader, device), so
# despite their names `accuracy` receives the validation DataLoader and
# `report` the torch device.
accuracy, report = train_bert_classifier()

## Evaluate Model

In [None]:
# Rebuild the classifier and load the weights saved by train_bert_classifier.
# Note: this binds the module-level name `model`, which process_chat_message
# also reads (there it is expected to provide `.encode`).
model = BERTClassifier('bert-base-uncased', 3)
model.load_state_dict(torch.load("bert_classifier-2.pth"))

In [None]:
# Passing a model skips training; this only rebuilds the validation loader
# (same split, fixed seed) and returns it with the device.
val_dataloader, device = train_bert_classifier(model)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, cohen_kappa_score

def evaluate(model, data_loader, device):
    """Score ``model`` on ``data_loader`` with several metrics.

    Predictions are the index of the largest model output; class
    probabilities are the softmax of the outputs.

    Returns ``(accuracy, auc, kappa, class_report)``: accuracy, the weighted
    one-vs-rest ROC AUC computed from the probabilities, Cohen's kappa, and the
    text classification report.

    NOTE: this redefinition replaces the earlier two-value ``evaluate``; the
    training loop in ``train_bert_classifier`` unpacks two values from
    ``evaluate`` and would fail if training is run after this cell.
    """
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for batch in data_loader:
            labels = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            y_prob.extend(torch.softmax(logits, dim=1).cpu().tolist())
            y_pred.extend(torch.max(logits, dim=1).indices.cpu().tolist())
            y_true.extend(labels.cpu().tolist())

    # Metrics are evaluated left to right: accuracy, AUC, kappa, report
    return (
        accuracy_score(y_true, y_pred),
        roc_auc_score(y_true, np.array(y_prob), multi_class='ovr', average='weighted'),
        cohen_kappa_score(y_true, y_pred),
        classification_report(y_true, y_pred),
    )

In [None]:
# Score the loaded model on the validation split and print every metric.
accuracy, auc, kappa, class_report = evaluate(model, val_dataloader, device)
print("Accuracy:", accuracy)
print("AUC:", auc)
print("Cohen's Kappa:", kappa)
print("Classification Report:\n", class_report)

In [None]:
# Display the accuracy (as last assigned by the evaluate() call when cells run in order).
accuracy

In [None]:
# NOTE(review): when the cells run in order, `report` still holds the second
# value returned by train_bert_classifier() (the torch device), not a
# classification report; the report from evaluate() is in `class_report`.
print(report)

## Apply Model to all Chat Messages in the Data Set and Save Output

In [None]:
# Load the saved weights into a fresh classifier for inference on the CPU.
m2 = BERTClassifier('bert-base-uncased', 3)
m2.load_state_dict(torch.load("bert_classifier-2.pth"))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cpu")
# Single-string trial call; its result is not stored.
predict_label('who are you?', m2, tokenizer, device, max_length=128*2)

# Add the 'Input_scale' context-window column for every chat message.
d_chats = add_joined_input_full(d_chats)

In [None]:
# Predict a label for every context window (with a progress bar). Each value
# is the tensor returned by predict_label, not yet a plain integer.
d_chats['predicted_label'] =\
    d_chats.Input_scale.progress_apply(lambda s: predict_label(s, m2, tokenizer, device, max_length=128*2))

In [None]:
# Key and prediction columns to join back onto the transaction log.
# Take an explicit copy: the next cell assigns to a column of `join_this`, and
# assigning into a column slice of d_chats can trigger pandas'
# SettingWithCopyWarning.
join_this = d_chats[['Transaction Id', 'predicted_label']].copy()

In [None]:
# Convert each prediction from a one-element tensor to a plain Python number.
join_this['predicted_label'] = join_this.predicted_label.map(lambda x: x.item())

In [None]:
# Left join: every transaction row is kept; rows without a chat prediction get
# a missing predicted_label. This rebinds `df`, replacing the training table.
df = d_tx.merge(join_this, how='left', on='Transaction Id')

In [None]:
# Write the merged transactions and predictions to disk, without the index column.
df.to_csv('analysis-set.csv', index=False)