# Install sparse_dot_topn for fast sparse vector matching

https://github.com/ing-bank/sparse_dot_topn

In [None]:
# Install the sparse_dot_topn wheel from a local Kaggle input dataset path.
# NOTE(review): the dataset directory is named "...-033" while the wheel is version 0.3.6 - confirm this is the intended build.
!pip install /kaggle/input/sparse-dot-topn-033/sparse_dot_topn-0.3.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# Define TextMatcher

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn


class TextMatcher:
    """Find, for each input text, its most similar texts in a ground-truth set.

    Texts are vectorized as binary word uni-/bi-gram indicators and compared
    with ``awesome_cossim_topn``, which keeps at most ``topk`` ground-truth
    candidates per input text.
    """

    def __init__(self, ground_truth, col, topk=5, lower_bound=-1):
        """Store the reference corpus and configure the vectorizer.

        Args:
            ground_truth: DataFrame whose column ``col`` holds the reference texts.
            col: Name of the text column (used for the inputs and the output).
            topk: Maximum number of ground-truth matches kept per input text.
            lower_bound: Similarity threshold forwarded to ``awesome_cossim_topn``.
        """
        self.ground_truth = ground_truth
        # Presence/absence of word 1- and 2-grams, no IDF weighting. The token
        # pattern also keeps ".", "," and "!" as tokens; terms must occur in at
        # least two of the fitted documents (min_df=2).
        self.vec = TfidfVectorizer(ngram_range=(1, 2), analyzer="word", token_pattern=r"(?u)(\b\w\w+\b|[\.,!])",
                                   use_idf=False, min_df=2, binary=True)
        self.topk = topk
        self.lower_bound = lower_bound
        self.col = col

    def get_matches_df(self, sparse_matrix, texts):
        """Expand a sparse similarity matrix into a long-format match table.

        Args:
            sparse_matrix: SciPy sparse matrix in which row ``i`` refers to
                ``texts`` position ``i`` and column ``j`` to ground-truth
                position ``j``.
            texts: Series of the matched texts; its name must equal ``self.col``
                because the result is merged back on that column.

        Returns:
            DataFrame with columns ``self.col``, ``'Ground Truth'`` and
            ``'match_score'``: one row per non-zero similarity, plus a row with
            NaN match columns for every text that has no match (left merge).
        """
        # Take rows, columns and scores from the same COO view and filter them
        # with the same mask. Pairing `nonzero()` indices with positional reads
        # of `.data` goes out of step as soon as the matrix stores an explicit
        # zero, because `nonzero()` skips it while `.data` still contains it.
        coo = sparse_matrix.tocoo()
        keep = coo.data != 0
        text_indices = coo.row[keep]
        gt_indices = coo.col[keep]
        match_score = coo.data[keep].astype(float)

        gt_texts = self.ground_truth[self.col].to_numpy()
        res_df = pd.DataFrame({self.col: texts.to_numpy()[text_indices],
                               'Ground Truth': gt_texts[gt_indices],
                               'match_score': match_score})

        # Left merge so that texts without any match are still present.
        res_df = pd.DataFrame(texts).merge(res_df, on=self.col, how="left")
        return res_df

    def match(self, texts_to_match, n_threads=16):
        """Match every text in ``texts_to_match`` against the ground truth.

        The vectorizer vocabulary is fitted on ``texts_to_match`` only; the
        ground-truth texts are transformed with that fitted vocabulary.

        Args:
            texts_to_match: DataFrame whose column ``self.col`` holds the texts.
            n_threads: Number of jobs passed to ``awesome_cossim_topn``.

        Returns:
            The match table produced by ``get_matches_df``.
        """
        print(f"Matching {texts_to_match.shape[0]} texts to {self.ground_truth.shape[0]} texts...")

        X = self.vec.fit_transform(texts_to_match[self.col])
        X_gt = self.vec.transform(self.ground_truth[self.col])

        sparse_sim = awesome_cossim_topn(X, X_gt.T, self.topk, self.lower_bound, use_threads=True, n_jobs=n_threads)

        return self.get_matches_df(sparse_sim, texts_to_match[self.col])

# Load data

In [None]:
# Start from the competition's test essays.
df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")

# Debug mode: when the test file holds exactly three rows, work on the training
# essays instead, with their "generated" label column removed.
if len(df) == 3:
    df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
    df = df.drop(columns="generated")

df.head()

# Count keywords in essays

If both "because" and "thing" appear in an essay (case-insensitive substring match, so e.g. "something" also counts), it was likely written by a student. If neither appears, it was likely generated by an LLM.

In [None]:
KEYWORDS = ["because","thing"]


def count_keywords(text):
    """Return how many entries of KEYWORDS occur in ``text``.

    Matching is a case-insensitive substring test; each keyword counts at
    most once, however often it appears.
    """
    lowered = text.lower()
    return sum(keyword in lowered for keyword in KEYWORDS)


# Number of distinct keywords found in each essay (0, 1 or 2).
df["keyword_count"] = df["text"].apply(count_keywords)

# Both keywords present -> likely student; none present -> likely LLM.
df["likely_student"] = df["keyword_count"].eq(2)
df["likely_llm"] = df["keyword_count"].eq(0)

# Share of essays for each keyword count.
df["keyword_count"].value_counts() / len(df)

# Get kth similarity score

For each essay, the 30th-best match score against the likely-student essays and against the likely-LLM essays of the same prompt is calculated.

In [None]:
TOPK = 30


def get_match_score(df, gt_filter_col):
    """Attach to each essay its lowest score among its top-TOPK matches.

    The rows of ``df`` where the boolean column ``gt_filter_col`` is true form
    the ground truth; every row of ``df`` is then matched against it.

    Returns:
        ``df`` merged on "text" with a ``match_score`` column holding, per
        text, the minimum match score returned by the matcher.
    """
    is_reference = df[gt_filter_col]
    reference = df[is_reference].reset_index(drop=True)

    matcher = TextMatcher(reference, "text", topk=TOPK)
    matches = matcher.match(df, n_threads=4)

    # The smallest of the retained top-k scores is the k-th best match.
    kth_score = matches.groupby("text")["match_score"].min().reset_index()
    return kth_score.merge(df, on="text")

all_prompts = df["prompt_id"].unique()


def _per_prompt_scores(flag_col):
    """Score the essays of each prompt separately against the rows flagged by ``flag_col``."""
    frames = []
    for pid in all_prompts:
        scored = get_match_score(df[df["prompt_id"] == pid], flag_col)
        frames.append(scored.reset_index(drop=True)[["id", "match_score"]])
    return frames


# Similarity to likely-student essays of the same prompt.
sub_dfs = _per_prompt_scores("likely_student")
sub_df = pd.concat(sub_dfs).rename(columns={"match_score": "match_score_student"})

# Similarity to likely-LLM essays of the same prompt.
sub_dfs = _per_prompt_scores("likely_llm")
sub_df2 = pd.concat(sub_dfs).rename(columns={"match_score": "match_score_llm"})

# One row per essay id carrying both scores.
sub_df = sub_df.merge(sub_df2, on="id")
sub_df.shape

# Make submission

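The negated ratio of the student match score to the smoothed LLM match score determines the ranking of essays: the less an essay resembles the likely-student essays relative to the likely-LLM essays, the higher its `generated` score.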

In [None]:
SMOOTH = 0.15

# Negated student score divided by the LLM score, with SMOOTH added to the denominator.
sub_df["generated"] = (-sub_df["match_score_student"]).div(sub_df["match_score_llm"].add(SMOOTH))

# Write only the id and the score.
sub_df[["id", "generated"]].to_csv("submission.csv", index=False)