In [1]:
import html
import itertools
import nltk
import pickle

import numpy as np
import pandas as pd

from nltk.corpus import stopwords, wordnet
from sklearn.linear_model import LogisticRegression
from spellchecker import SpellChecker
from unidecode import unidecode

In [2]:
def unescape_response(response):
    """Run ``response`` through ``unidecode`` and remove every layer of HTML escaping.

    ``unidecode`` is applied once, up front. ``html.unescape`` is then applied
    repeatedly until the text stops changing, so multiply-escaped input such
    as ``&amp;lt;`` is fully decoded (``&amp;lt;`` -> ``&lt;`` -> ``<``).

    Note that ``unidecode`` runs before any unescaping, so characters that
    only appear as a result of entity decoding are not passed through it.

    Args:
        response: Text to clean; handed directly to ``unidecode``.

    Returns:
        The text after ``unidecode`` and after ``html.unescape`` has reached a
        fixed point.
    """
    current = unidecode(response)
    while True:
        decoded = html.unescape(current)
        if decoded == current:
            # Fixed point reached: another pass would change nothing.
            return decoded
        current = decoded

# Shared spell checker, created lazily by _get_spell_checker().
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared ``SpellChecker`` (distance 1), building it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        checker = SpellChecker()
        checker.distance = 1
        _SPELL_CHECKER = checker
    return _SPELL_CHECKER


def _strip_boundary_quotes(text, punctuations):
    """Drop ' and " unless the character on each side exists and is not in ``punctuations``."""
    kept = []
    last = len(text) - 1
    for i, char in enumerate(text):
        if char in ("'", '"'):
            # Quotes at either end of the text, or touching punctuation/space,
            # are removed; quotes inside a word (e.g. "it's") survive.
            if 0 < i < last and text[i - 1] not in punctuations and text[i + 1] not in punctuations:
                kept.append(char)
        else:
            kept.append(char)
    return ''.join(kept)


def _pad_operators(text, operators):
    """Insert a space on each side of every operator character that is not already next to one."""
    padded = []
    last = len(text) - 1
    for i, char in enumerate(text):
        if char in operators:
            # Neighbours are looked up in the unpadded text, so two adjacent
            # operators each add their own space between them.
            if i != 0 and text[i - 1] != ' ':
                padded.append(' ')
            padded.append(char)
            if i != last and text[i + 1] != ' ':
                padded.append(' ')
        else:
            padded.append(char)
    return ''.join(padded)


def tokenize(response):
    """Split a response into sentences of lower-cased, spell-corrected tokens.

    Steps, in order:
      1. ``response`` is converted with ``str``.
      2. Quote characters are removed unless they sit between two
         non-punctuation characters.
      3. Operator/bracket characters (``+-*/%=<>()[]{}#``) are padded with
         spaces so they become separate tokens.
      4. The text is split with ``nltk.sent_tokenize`` / ``nltk.word_tokenize``.
         A token is glued to the following token when that one starts with an
         apostrophe (e.g. ``it`` + ``'s``); tokens that are a substring of
         ``' ,.:;?'`` are dropped; everything is lower-cased.
      5. Each token except those in the skip list (``"it's"``) is replaced by
         ``SpellChecker.correction(token)`` when that returns a value.

    The spell checker is created once and reused across calls, and
    ``correction`` is invoked a single time per token.

    Args:
        response: The raw response; any object accepted by ``str``.

    Returns:
        A list with one entry per sentence, each entry being a list of token
        strings.
    """
    response = str(response)

    spell_checker = _get_spell_checker()

    punctuations = ' ,.:;?'
    operators = '+-*/%=<>()[]{}#'
    spell_checker_skip = ["it's"]

    response = _strip_boundary_quotes(response, punctuations)
    response = _pad_operators(response, operators)

    def correct(token):
        # Tokens on the skip list bypass the spell checker entirely.
        if token in spell_checker_skip:
            return token
        suggestion = spell_checker.correction(token)
        # correction() may return None; fall back to the original token then.
        return token if suggestion is None else suggestion

    tokens_list = []
    for sentence in nltk.sent_tokenize(response):
        nltk_tokens = nltk.word_tokenize(sentence)
        tokens = []
        i = 0
        while i < len(nltk_tokens):
            if i < len(nltk_tokens) - 1 and nltk_tokens[i + 1][0] == "'":
                # Re-join a word with the apostrophe-led token that follows it,
                # then skip that following token.
                tokens.append(nltk_tokens[i].lower() + nltk_tokens[i + 1].lower())
                i += 1
            elif nltk_tokens[i] not in punctuations:
                tokens.append(nltk_tokens[i].lower())
            i += 1

        tokens_list.append([correct(token) for token in tokens])

    return tokens_list

def initialize_bow(tokens_lists, percentage_threshold=0.6):
    """Build a bag-of-words vocabulary from tokenized responses.

    Each token is counted once per response (document frequency). The counts
    are sorted ascending and the value at position
    ``int(len(counts) * percentage_threshold)`` becomes the cut-off: every
    token whose document frequency is at least that value is kept.

    Indices are assigned in first-seen order, so the mapping is the same on
    every run. The cut-off position is clamped to the last element, so
    ``percentage_threshold >= 1.0`` keeps only the most frequent tokens
    instead of raising ``IndexError``. An empty vocabulary yields ``({}, 0)``.

    Args:
        tokens_lists: Iterable of responses; each response is an iterable of
            sentences, each sentence an iterable of token strings.
        percentage_threshold: Fraction of the sorted count list used to pick
            the cut-off frequency.

    Returns:
        ``(token_to_index, vocabulary_size)`` where ``token_to_index`` maps
        each kept token to a column index in ``0..vocabulary_size-1``.
    """
    token_to_count = {}
    for token_list in tokens_lists:
        # dict.fromkeys de-duplicates within one response while preserving
        # the order of first appearance (a set would not).
        for token in dict.fromkeys(itertools.chain.from_iterable(token_list)):
            token_to_count[token] = token_to_count.get(token, 0) + 1

    if not token_to_count:
        return {}, 0

    sorted_counts = sorted(token_to_count.values())
    cutoff = min(int(len(sorted_counts) * percentage_threshold), len(sorted_counts) - 1)
    threshold = sorted_counts[cutoff]

    token_to_index = {}
    for token, count in token_to_count.items():
        if count >= threshold:
            token_to_index[token] = len(token_to_index)

    return token_to_index, len(token_to_index)

def bowize(token_to_index, tokens_list):
    """Encode one tokenized response as a binary bag-of-words row vector.

    Args:
        token_to_index: Mapping from vocabulary token to column index.
        tokens_list: The response as a list of sentences, each a list of
            token strings.

    Returns:
        A ``(1, len(token_to_index))`` float array holding 1 in the column of
        every vocabulary token that occurs in the response and 0 elsewhere.
    """
    vector = np.zeros((1, len(token_to_index)))

    # Presence only: repeated tokens just set the same cell again.
    for sentence in tokens_list:
        for token in sentence:
            if token in token_to_index:
                vector[0, token_to_index[token]] = 1

    return vector

def initialize_bigram(tokens_lists, percentage_threshold=0.8):
    """Build the unigram and bigram vocabularies used by ``bigramize``.

    Bigrams are pairs of adjacent tokens within one sentence, joined by a
    single space. Each bigram is counted once per response. The counts are
    sorted ascending and the value at position
    ``int(len(counts) * percentage_threshold)`` becomes the cut-off: bigrams
    whose count is at least that value are kept.

    Bigram indices are assigned in first-seen order, so the mapping is the
    same on every run. The cut-off position is clamped to the last element,
    and when no bigram exists at all (e.g. every sentence has a single token)
    the bigram vocabulary is simply empty instead of raising ``IndexError``.

    The unigram vocabulary comes from ``initialize_bow`` with its default
    threshold.

    Args:
        tokens_lists: Iterable of responses; each response is a list of
            sentences, each sentence a list of token strings. It is iterated
            more than once.
        percentage_threshold: Fraction of the sorted bigram-count list used to
            pick the cut-off count.

    Returns:
        ``((token_to_index, bigram_to_index), feature_size)`` where
        ``feature_size`` is the combined size of both vocabularies.
    """
    bigram_to_count = {}
    for tokens_list in tokens_lists:
        # dict.fromkeys de-duplicates within one response while keeping the
        # order of first appearance.
        response_bigrams = dict.fromkeys(
            left + ' ' + right
            for tokens in tokens_list
            for left, right in zip(tokens, tokens[1:])
        )
        for bigram in response_bigrams:
            bigram_to_count[bigram] = bigram_to_count.get(bigram, 0) + 1

    bigram_to_index = {}
    if bigram_to_count:
        sorted_counts = sorted(bigram_to_count.values())
        cutoff = min(int(len(sorted_counts) * percentage_threshold), len(sorted_counts) - 1)
        threshold = sorted_counts[cutoff]

        for bigram, count in bigram_to_count.items():
            if count >= threshold:
                bigram_to_index[bigram] = len(bigram_to_index)

    token_to_index, _ = initialize_bow(tokens_lists)

    return (token_to_index, bigram_to_index), len(token_to_index) + len(bigram_to_index)
    
def bigramize(to_index_tuple, tokens_list):
    """Encode one tokenized response as a binary unigram + bigram row vector.

    The unigram columns come first; bigram columns follow, offset by the size
    of the unigram vocabulary.

    Args:
        to_index_tuple: ``(token_to_index, bigram_to_index)`` pair, as
            returned in the first element of ``initialize_bigram``'s result.
        tokens_list: The response as a list of sentences, each a list of
            token strings.

    Returns:
        A ``(1, len(token_to_index) + len(bigram_to_index))`` float array with
        1 in the column of every known token / bigram found in the response.
    """
    unigram_index, pair_index = to_index_tuple
    offset = len(unigram_index)

    features = np.zeros((1, offset + len(pair_index)))

    # Unigram presence.
    for sentence in tokens_list:
        for word in sentence:
            if word in unigram_index:
                features[0, unigram_index[word]] = 1

    # Bigram presence: adjacent tokens within a sentence, space-joined.
    for sentence in tokens_list:
        for left, right in zip(sentence[0:-1], sentence[1:]):
            pair = left + ' ' + right
            if pair in pair_index:
                features[0, offset + pair_index[pair]] = 1

    return features

In [3]:
# Load the responses and split them using the `subset` column
# ('train' / 'test').
df = pd.read_csv('sigcse_2024.csv')

# Each split is an independent frame that gains a `tokens_list` column holding
# the output of `tokenize` for that row's `response`. Train is tokenized
# first, then test.
train_df = df.loc[df['subset'] == 'train'].assign(
    tokens_list=lambda frame: frame['response'].apply(tokenize)
)
test_df = df.loc[df['subset'] == 'test'].assign(
    tokens_list=lambda frame: frame['response'].apply(tokenize)
)

In [4]:
# Train one logistic regression per question (`qid`) on unigram + bigram
# presence features and store its predictions for that question's test
# responses in `test_df['bigram_lr']`.


def _build_feature_matrix(feature_map, feature_size, tokens_lists):
    """Stack one `bigramize` row per tokenized response into a 2-D array."""
    X = np.zeros((len(tokens_lists), feature_size))
    for row_number, tokens_list in enumerate(tokens_lists):
        X[row_number, :] = bigramize(feature_map, tokens_list)
    return X


# Test rows whose qid has no training data never get a model and keep this 0.
test_df['bigram_lr'] = 0
for qid, sub_train_df in train_df.groupby('qid'):
    sub_test_df = test_df[test_df.qid == qid]
    if sub_test_df.empty:
        # Nothing to predict for this question; scikit-learn rejects a
        # zero-row matrix in predict(), so skip before doing any work.
        continue

    # astype(int) keeps the original behaviour of failing on missing labels.
    train_y = sub_train_df.binary_ground_truth.astype(int).to_numpy()

    observed_labels = np.unique(train_y)
    if len(observed_labels) == 1:
        # LogisticRegression.fit() raises when the targets contain a single
        # class; the only label seen in training is the only possible guess.
        test_df.loc[sub_test_df.index, 'bigram_lr'] = observed_labels[0]
        continue

    feature_map, feature_size = initialize_bigram(sub_train_df.tokens_list)

    train_X = _build_feature_matrix(feature_map, feature_size, sub_train_df.tokens_list)

    lr = LogisticRegression()
    lr.fit(train_X, train_y)

    # The test rows are encoded with the vocabulary learned from the training
    # rows of the same question.
    test_X = _build_feature_matrix(feature_map, feature_size, sub_test_df.tokens_list)
    predicted = lr.predict(test_X)

    test_df.loc[sub_test_df.index, 'bigram_lr'] = predicted

In [5]:
# Persist the predictions as a plain list, in `test_df` row order. The `with`
# block guarantees the file is flushed and closed even if pickling fails.
with open('bigram_lr.pkl', 'wb') as predictions_file:
    pickle.dump(list(test_df.bigram_lr), predictions_file)