In [1]:
import html
import itertools
import nltk
import pickle

import numpy as np
import pandas as pd

from nltk.corpus import stopwords, wordnet
from sklearn.linear_model import LogisticRegression
from spellchecker import SpellChecker
from unidecode import unidecode

In [2]:
def unescape_response(response):
    """Run a response through `unidecode`, then strip every layer of HTML escaping.

    `unidecode` is applied once up front. `html.unescape` is then applied
    repeatedly until the text stops changing, so multiply-escaped input such
    as ``&amp;amp;lt;`` collapses all the way down to ``<``.

    Args:
        response: The raw response text.

    Returns:
        The text after `unidecode` and after unescaping has reached a fixed point.
    """
    current = unidecode(response)
    while True:
        decoded = html.unescape(current)
        if decoded == current:
            # Fixed point reached: one more pass would change nothing.
            return decoded
        current = decoded

def _get_spell_checker():
    """Return a SpellChecker with edit distance 1, building it only on first use.

    The instance is stored on this function so repeated tokenize() calls share
    it instead of constructing a new checker for every response.
    """
    checker = getattr(_get_spell_checker, 'instance', None)
    if checker is None:
        checker = SpellChecker()
        checker.distance = 1
        _get_spell_checker.instance = checker
    return checker


def tokenize(response):
    """Split a response into sentences of lower-cased, spell-corrected tokens.

    Steps, in order:
      1. Drop quote characters (' and ") unless they sit strictly inside the
         text with no character from `punctuations` on either side.
      2. Pad every character in `operators` with spaces so it becomes its own
         token.
      3. Sentence- and word-tokenize with NLTK, gluing a token that starts with
         an apostrophe back onto the token before it.
      4. Replace each token with the spell checker's correction, except for
         tokens in the skip list or tokens the checker has no correction for.

    Args:
        response: The response to tokenize; it is converted with str() first.

    Returns:
        A list with one entry per sentence; each entry is a list of token
        strings.
    """
    response = str(response)
    spell_checker = _get_spell_checker()

    # Step 1: keep a quote only when it is interior and not next to a separator.
    punctuations = ' ,.:;?'
    quotes = ("'", '"')
    last = len(response) - 1
    kept = []
    for i, char in enumerate(response):
        if char in quotes:
            if 0 < i < last and response[i - 1] not in punctuations and response[i + 1] not in punctuations:
                kept.append(char)
        else:
            kept.append(char)
    response = ''.join(kept)

    # Step 2: surround operators with spaces (neighbours are read from the
    # unpadded text, so adjacent operators end up separated by two spaces).
    operators = '+-*/%=<>()[]{}#'
    last = len(response) - 1
    padded = []
    for i, char in enumerate(response):
        if char not in operators:
            padded.append(char)
            continue
        if i != 0 and response[i - 1] != ' ':
            padded.append(' ')
        padded.append(char)
        if i != last and response[i + 1] != ' ':
            padded.append(' ')
    response = ''.join(padded)

    spell_checker_skip = ["it's"]
    tokens_list = []
    for sentence in nltk.sent_tokenize(response):
        nltk_tokens = nltk.word_tokenize(sentence)

        # Step 3: merge "<word>" + "'<suffix>" pairs and drop separator tokens.
        tokens = []
        i = 0
        while i < len(nltk_tokens):
            if i < len(nltk_tokens) - 1 and nltk_tokens[i + 1][0] == "'":
                tokens.append(nltk_tokens[i].lower() + nltk_tokens[i + 1].lower())
                i += 1  # the apostrophe token has been consumed as well
            elif nltk_tokens[i] not in punctuations:
                # Substring test against the separator string.
                tokens.append(nltk_tokens[i].lower())
            i += 1

        # Step 4: look each correction up once and fall back to the raw token
        # when the checker returns None.
        corrected = []
        for token in tokens:
            if token in spell_checker_skip:
                corrected.append(token)
                continue
            suggestion = spell_checker.correction(token)
            corrected.append(token if suggestion is None else suggestion)
        tokens_list.append(corrected)

    return tokens_list

def initialize_bow(tokens_lists, percentage_threshold=0.6):
    """Build the bag-of-words vocabulary from tokenized responses.

    Each token is counted once per response (document frequency). The counts
    are sorted ascending, the count found at position
    ``int(len(counts) * percentage_threshold)`` becomes the cut-off, and every
    token whose count is at least that cut-off receives a column index.

    Args:
        tokens_lists: Iterable of responses; each response is a list of
            sentences and each sentence is a list of tokens.
        percentage_threshold: Relative position in the sorted counts used to
            pick the cut-off. A position past the end (e.g. 1.0) is clamped to
            the largest count instead of raising.

    Returns:
        A tuple ``(token_to_index, vocabulary_size)``. With no tokens at all
        the result is ``({}, 0)``.
    """
    token_to_count = {}
    for tokens_list in tokens_lists:
        # set(): a token repeated inside one response still counts once.
        for token in set(itertools.chain.from_iterable(tokens_list)):
            token_to_count[token] = token_to_count.get(token, 0) + 1

    if not token_to_count:
        # Nothing to index; avoids indexing into an empty list below.
        return {}, 0

    sorted_counts = sorted(token_to_count.values())
    cutoff_position = min(int(len(sorted_counts) * percentage_threshold), len(sorted_counts) - 1)
    threshold = sorted_counts[cutoff_position]

    token_to_index = {}
    for token, count in token_to_count.items():
        if count >= threshold:
            token_to_index[token] = len(token_to_index)

    return token_to_index, len(token_to_index)

def bowize(token_to_index, tokens_list):
    """Encode one tokenized response as a binary bag-of-words row.

    Args:
        token_to_index: Mapping from token to its column index.
        tokens_list: The response as a list of sentences, each a list of tokens.

    Returns:
        A ``(1, len(token_to_index))`` array holding 1 in the column of every
        known token that occurs in the response and 0 elsewhere.
    """
    features = np.zeros((1, len(token_to_index)))
    for sentence in tokens_list:
        for token in sentence:
            column = token_to_index.get(token)
            if column is not None:
                features[0, column] = 1

    return features

def initialize_bigram(tokens_lists, percentage_threshold=0.8):
    """Build the unigram + bigram feature maps from tokenized responses.

    Bigrams are formed from adjacent tokens inside a sentence (never across
    sentences), joined with a single space, and counted once per response.
    The counts are sorted ascending, the count at position
    ``int(len(counts) * percentage_threshold)`` becomes the cut-off, and every
    bigram whose count is at least that cut-off receives an index. The unigram
    map comes from ``initialize_bow`` with its default threshold.

    Args:
        tokens_lists: Iterable of responses; each response is a list of
            sentences and each sentence is a list of tokens. It is iterated
            more than once.
        percentage_threshold: Relative position in the sorted bigram counts
            used to pick the cut-off. A position past the end (e.g. 1.0) is
            clamped to the largest count instead of raising.

    Returns:
        ``((token_to_index, bigram_to_index), feature_size)`` where
        ``feature_size`` is the combined number of unigram and bigram columns.
        When no sentence has two tokens, ``bigram_to_index`` is empty.
    """
    bigram_to_count = {}
    for tokens_list in tokens_lists:
        # A set, so a bigram repeated inside one response still counts once.
        response_bigrams = set()
        for tokens in tokens_list:
            response_bigrams.update(
                first + ' ' + second for first, second in zip(tokens[0:-1], tokens[1:])
            )
        for bigram in response_bigrams:
            bigram_to_count[bigram] = bigram_to_count.get(bigram, 0) + 1

    bigram_to_index = {}
    if bigram_to_count:  # guard: an empty count list cannot be indexed
        sorted_counts = sorted(bigram_to_count.values())
        cutoff_position = min(int(len(sorted_counts) * percentage_threshold), len(sorted_counts) - 1)
        threshold = sorted_counts[cutoff_position]
        for bigram, count in bigram_to_count.items():
            if count >= threshold:
                bigram_to_index[bigram] = len(bigram_to_index)

    token_to_index, _ = initialize_bow(tokens_lists)

    return (token_to_index, bigram_to_index), len(token_to_index) + len(bigram_to_index)
    
def bigramize(to_index_tuple, tokens_list):
    """Encode one tokenized response as a binary unigram + bigram row.

    Args:
        to_index_tuple: ``(token_to_index, bigram_to_index)`` as returned by
            ``initialize_bigram``.
        tokens_list: The response as a list of sentences, each a list of tokens.

    Returns:
        A ``(1, n_tokens + n_bigrams)`` array. Unigram columns come first;
        bigram columns follow, shifted by the size of the unigram map.
    """
    token_to_index, bigram_to_index = to_index_tuple
    bigram_offset = len(token_to_index)

    features = np.zeros((1, bigram_offset + len(bigram_to_index)))

    for sentence in tokens_list:
        for token in sentence:
            column = token_to_index.get(token)
            if column is not None:
                features[0, column] = 1

        # Adjacent pairs within the sentence only; never across sentences.
        for pair in zip(sentence[0:-1], sentence[1:]):
            column = bigram_to_index.get(' '.join(pair))
            if column is not None:
                features[0, bigram_offset + column] = 1

    return features

def initialize_trigram(tokens_lists, percentage_threshold=0.9):
    """Build the unigram + bigram + trigram feature maps from tokenized responses.

    Trigrams are formed from three consecutive tokens inside a sentence (never
    across sentences), joined with single spaces, and counted once per
    response. The counts are sorted ascending, the count at position
    ``int(len(counts) * percentage_threshold)`` becomes the cut-off, and every
    trigram whose count is at least that cut-off receives an index. The
    unigram and bigram maps come from ``initialize_bigram`` with its defaults.

    Args:
        tokens_lists: Iterable of responses; each response is a list of
            sentences and each sentence is a list of tokens. It is iterated
            more than once.
        percentage_threshold: Relative position in the sorted trigram counts
            used to pick the cut-off. A position past the end (e.g. 1.0) is
            clamped to the largest count instead of raising.

    Returns:
        ``((token_to_index, bigram_to_index, trigram_to_index), feature_size)``
        where ``feature_size`` is the combined number of columns. When no
        sentence has three tokens, ``trigram_to_index`` is empty.
    """
    trigram_to_count = {}
    for tokens_list in tokens_lists:
        # A set, so a trigram repeated inside one response still counts once.
        response_trigrams = set()
        for tokens in tokens_list:
            response_trigrams.update(
                first + ' ' + second + ' ' + third
                for first, second, third in zip(tokens[0:-2], tokens[1:-1], tokens[2:])
            )
        for trigram in response_trigrams:
            trigram_to_count[trigram] = trigram_to_count.get(trigram, 0) + 1

    trigram_to_index = {}
    if trigram_to_count:  # guard: an empty count list cannot be indexed
        sorted_counts = sorted(trigram_to_count.values())
        cutoff_position = min(int(len(sorted_counts) * percentage_threshold), len(sorted_counts) - 1)
        threshold = sorted_counts[cutoff_position]
        for trigram, count in trigram_to_count.items():
            if count >= threshold:
                trigram_to_index[trigram] = len(trigram_to_index)

    (token_to_index, bigram_to_index), _ = initialize_bigram(tokens_lists)

    return (token_to_index, bigram_to_index, trigram_to_index), len(token_to_index) + len(bigram_to_index) + len(trigram_to_index)

def trigramize(to_index_tuple, tokens_list):
    """Encode one tokenized response as a binary unigram + bigram + trigram row.

    Args:
        to_index_tuple: ``(token_to_index, bigram_to_index, trigram_to_index)``
            as returned by ``initialize_trigram``.
        tokens_list: The response as a list of sentences, each a list of tokens.

    Returns:
        A ``(1, n_tokens + n_bigrams + n_trigrams)`` array. Columns are laid
        out as unigrams, then bigrams, then trigrams.
    """
    token_to_index, bigram_to_index, trigram_to_index = to_index_tuple
    bigram_offset = len(token_to_index)
    trigram_offset = bigram_offset + len(bigram_to_index)

    features = np.zeros((1, trigram_offset + len(trigram_to_index)))

    for sentence in tokens_list:
        for token in sentence:
            column = token_to_index.get(token)
            if column is not None:
                features[0, column] = 1

        # n-grams are taken within the sentence only; never across sentences.
        for pair in zip(sentence[0:-1], sentence[1:]):
            column = bigram_to_index.get(' '.join(pair))
            if column is not None:
                features[0, bigram_offset + column] = 1

        for triple in zip(sentence[0:-2], sentence[1:-1], sentence[2:]):
            column = trigram_to_index.get(' '.join(triple))
            if column is not None:
                features[0, trigram_offset + column] = 1

    return features

In [3]:
df = pd.read_csv('sigcse_2024.csv')

# Split on the `subset` column; .copy() makes each split an independent frame
# so the column assignments below do not write into a view of `df`.
train_df, validate_df, test_df = (
    df[df.subset == subset_name].copy()
    for subset_name in ('train', 'validate', 'test')
)

# Tokenize every response once up front; later cells read `tokens_list`.
for subset_df in (train_df, validate_df, test_df):
    subset_df['tokens_list'] = subset_df.response.apply(tokenize)

In [4]:
def _bigram_features(feature_map, feature_size, frame):
    """Stack one bigramize() row per response of `frame` into a 2-D matrix."""
    features = np.zeros((len(frame), feature_size))
    for position, tokens_list in enumerate(frame.tokens_list):
        features[position, :] = bigramize(feature_map, tokens_list)
    return features


# One logistic-regression model per (label column, question id), trained on
# that question's training responses and applied to its validate/test rows.
for suffix in ['una', 'c', 'hl']:
    prediction_column = f'bigram_{suffix}'

    # Rows whose qid never shows up in training keep this default. Both frames
    # get it, so later int(...) conversions never meet a NaN prediction.
    validate_df[prediction_column] = 0
    test_df[prediction_column] = 0

    for qid, sub_train_df in train_df.groupby('qid'):
        sub_validate_df = validate_df[validate_df.qid == qid]
        sub_test_df = test_df[test_df.qid == qid]

        observed_labels = sub_train_df[suffix].unique()
        if len(observed_labels) == 1:
            # Only one class in training: nothing to fit, predict that class.
            validate_df.loc[sub_validate_df.index, prediction_column] = observed_labels[0]
            test_df.loc[sub_test_df.index, prediction_column] = observed_labels[0]
            continue

        feature_map, feature_size = initialize_bigram(sub_train_df.tokens_list)

        lr = LogisticRegression()
        lr.fit(
            _bigram_features(feature_map, feature_size, sub_train_df),
            sub_train_df[suffix].astype(int).to_numpy(),
        )

        for target_df, sub_df in ((validate_df, sub_validate_df), (test_df, sub_test_df)):
            if len(sub_df) == 0:
                # predict() rejects an empty matrix; nothing to label anyway.
                continue
            target_df.loc[sub_df.index, prediction_column] = lr.predict(
                _bigram_features(feature_map, feature_size, sub_df)
            )

In [5]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score

In [6]:
# Accuracy, Cohen's kappa and F1 on the test split, one label column at a time.
for suffix in ['c', 'una', 'hl']:
    label_truth = test_df[suffix]
    label_predicted = test_df[f'bigram_{suffix}']

    print(suffix)
    for metric in (accuracy_score, cohen_kappa_score, f1_score):
        print(metric(label_truth, label_predicted))

c
0.8612903225806452
0.7186190591786548
0.8760806916426513
una
0.7306451612903225
0.33423353909465014
0.8125701459034792
hl
0.8951612903225806
0.5917747163695299
0.9382716049382717


In [7]:
# Joint label strings of the form "<c>_<una>_<hl>", built from the ground-truth
# columns and from the bigram_* prediction columns (predictions are cast to int).
validate_df['c_una_hl'] = validate_df.apply(
    lambda row: '_'.join(f'{row[name]}' for name in ('c', 'una', 'hl')),
    axis=1,
)
validate_df['c_una_hl_predicted'] = validate_df.apply(
    lambda row: '_'.join(f'{int(row[name])}' for name in ('bigram_c', 'bigram_una', 'bigram_hl')),
    axis=1,
)

# Collapse to a single binary target: 1 only when all three labels are 1.
validate_df['converted_binary_ground_truth'] = validate_df['c_una_hl'].eq('1_1_1').astype('int64')
validate_df['converted_binary_predicted'] = validate_df['c_una_hl_predicted'].eq('1_1_1').astype('int64')

In [8]:
# Joint label strings of the form "<c>_<una>_<hl>", built from the ground-truth
# columns and from the bigram_* prediction columns (predictions are cast to int).
test_df['c_una_hl'] = test_df.apply(
    lambda row: '_'.join(f'{row[name]}' for name in ('c', 'una', 'hl')),
    axis=1,
)
test_df['c_una_hl_predicted'] = test_df.apply(
    lambda row: '_'.join(f'{int(row[name])}' for name in ('bigram_c', 'bigram_una', 'bigram_hl')),
    axis=1,
)

# Collapse to a single binary target: 1 only when all three labels are 1.
test_df['converted_binary_ground_truth'] = test_df['c_una_hl'].eq('1_1_1').astype('int64')
test_df['converted_binary_predicted'] = test_df['c_una_hl_predicted'].eq('1_1_1').astype('int64')

In [9]:
# Accuracy, Cohen's kappa and F1 for the collapsed all-three-labels-are-1 target.
binary_truth = test_df['converted_binary_ground_truth']
binary_predicted = test_df['converted_binary_predicted']
for metric in (accuracy_score, cohen_kappa_score, f1_score):
    print(metric(binary_truth, binary_predicted))

0.8354838709677419
0.6595423956931359
0.7968127490039841


In [10]:
def order_predict(row, order):
    """Score one row by walking its label predictions in the given order.

    Args:
        row: A row exposing ``converted_binary_ground_truth``,
            ``converted_binary_predicted`` and, for every suffix in `order`,
            the columns ``<suffix>`` and ``bigram_<suffix>``.
        order: Sequence of label suffixes, checked front to back.

    Returns:
        * ``row.converted_binary_predicted`` when the binary ground truth is 1.
        * Otherwise the walk stops at the first label whose prediction is not 1
          and returns -1 if that label's ground truth is 1, else 0.
        * 1 when every label in `order` is predicted 1.
    """
    if row.converted_binary_ground_truth == 1:
        return row.converted_binary_predicted

    for suffix in order:
        if row[f'bigram_{suffix}'] == 1:
            # Predicted 1: move on to the next label in the order.
            continue
        return -1 if row[suffix] == 1 else 0

    return 1

In [11]:
# Every ordering of the three label columns: all 3! = 6 permutations, produced
# in lexicographic order starting from ['c', 'hl', 'una'].
orders = [list(permutation) for permutation in itertools.permutations(['c', 'hl', 'una'])]

In [12]:
# Score every label ordering on the validation split.
for order in orders:
    order_string = '_'.join(order)
    truth_column = f'{order_string}_ground_truth'

    validate_df[order_string] = validate_df.apply(order_predict, axis=1, args=(order,))
    validate_df[truth_column] = validate_df['converted_binary_ground_truth']

    # Rows where order_predict returned the -1 sentinel are rescored as
    # ground truth 1 / prediction 0.
    sentinel_rows = validate_df[order_string] == -1
    validate_df.loc[sentinel_rows, truth_column] = 1
    validate_df.loc[sentinel_rows, order_string] = 0

    print(order)
    for metric in (accuracy_score, cohen_kappa_score, f1_score):
        print(metric(validate_df[truth_column], validate_df[order_string]))

['c', 'hl', 'una']
0.7973856209150327
0.5878384983869391
0.7669172932330828
['c', 'una', 'hl']
0.7973856209150327
0.5878384983869391
0.7669172932330828
['hl', 'c', 'una']
0.7892156862745098
0.5720326119953163
0.7597765363128491
['hl', 'una', 'c']
0.7549019607843137
0.5062976627120285
0.7311827956989249
['una', 'c', 'hl']
0.7467320261437909
0.49079939020462504
0.7246891651865008
['una', 'hl', 'c']
0.75
0.49699170570286644
0.7272727272727272


In [13]:
# Multi-class metrics over the joint "<c>_<una>_<hl>" label strings on the test
# split; F1 is weighted across the joint classes.
joint_truth = test_df['c_una_hl']
joint_predicted = test_df['c_una_hl_predicted']

print(accuracy_score(joint_truth, joint_predicted))
print(cohen_kappa_score(joint_truth, joint_predicted))
print(f1_score(joint_truth, joint_predicted, average='weighted'))

0.5854838709677419
0.4500491828740055
0.5733543887217953


In [14]:
# Score a single label ordering on the test split.
order = ['c', 'una', 'hl']
order_string = '_'.join(order)
truth_column = f'{order_string}_ground_truth'

test_df[order_string] = test_df.apply(order_predict, axis=1, args=(order,))
test_df[truth_column] = test_df['converted_binary_ground_truth']

# Rows where order_predict returned the -1 sentinel are rescored as
# ground truth 1 / prediction 0.
sentinel_rows = test_df[order_string] == -1
test_df.loc[sentinel_rows, truth_column] = 1
test_df.loc[sentinel_rows, order_string] = 0

print(order)
for metric in (accuracy_score, cohen_kappa_score, f1_score):
    print(metric(test_df[truth_column], test_df[order_string]))

['c', 'una', 'hl']
0.7677419354838709
0.5285220899652517
0.7352941176470588
