In [1]:
from collections import defaultdict

import nltk
import pandas as pd
from typing import List
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, OPTICS


# Make sure the NLTK stop-word corpus is available locally (downloads it if missing).
nltk.download("stopwords")

# Russian stop words; passed to the TF-IDF vectorizer as its stop_words filter.
RUSSIAN_STOPWORDS = set(stopwords.words("russian"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def process_text(str):
    """Split a whitespace-separated string of words into a list of tokens.

    The parameter name shadows the ``str`` builtin; it is kept because it is
    part of the function's signature.

    Args:
        str: text whose words are separated by whitespace (the caller passes
            pre-lemmatized paragraphs).

    Returns:
        The words of the text, in order, with whitespace-only pieces dropped.
    """
    tokens = []
    for piece in str.split():
        # Skip anything that consists solely of whitespace.
        if piece.isspace():
            continue
        tokens.append(piece)
    return tokens


class TextsPairClassifier(object):
    """Decide whether two texts belong together by clustering TF-IDF vectors.

    On construction every text is turned into a TF-IDF vector and the vectors
    are grouped with Ward-linkage agglomerative clustering. ``label`` then
    simply compares the cluster assignments of two texts.

    Attributes:
        pair_labels: the fitted clusterer's ``labels_`` -- one cluster index
            per input text, in the same order as ``data``.
    """

    def __init__(self, data: List[str], n_clusters: int = 6):
        """Vectorize ``data`` and cluster it.

        Args:
            data: texts to cluster. Each one is tokenized with
                ``process_text`` (a whitespace split).
            n_clusters: number of clusters to form. Defaults to 6, the value
                that used to be hard-coded.
        """
        vectorizer = TfidfVectorizer(
            # scikit-learn documents stop_words as 'english', a list or None;
            # recent releases validate the type and reject a set, so pass a
            # (deterministically ordered) list.
            stop_words=sorted(RUSSIAN_STOPWORDS),
            tokenizer=process_text,
            # token_pattern is ignored when a tokenizer is supplied; None
            # silences the warning scikit-learn emits about that.
            token_pattern=None,
            # Drop terms found in more than half of the texts ...
            max_df=.5,
            # ... and terms found in fewer than 5 texts.
            min_df=5,
        )

        term_doc_matrix = vectorizer.fit_transform(data)

        clusterizer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        # The sparse TF-IDF matrix is densified before fitting.
        clusterizer.fit(term_doc_matrix.toarray())

        self.pair_labels = clusterizer.labels_

    def _cluster_of(self, item_id: int):
        """Return the cluster index of the text with 1-based id ``item_id``.

        Raises:
            IndexError: if ``item_id`` is not in ``1..len(self.pair_labels)``.
                Without this check, id 0 or a negative id would silently wrap
                around to the end of the labels.
        """
        total = len(self.pair_labels)
        if not 1 <= item_id <= total:
            raise IndexError(
                "text id %s is out of range 1..%s" % (item_id, total)
            )
        return self.pair_labels[item_id - 1]

    def label(self, id1: int, id2: int):
        """Return 1 if both texts fall into the same cluster, otherwise 0.

        Args:
            id1: 1-based id of the first text (position in ``data`` plus one).
            id2: 1-based id of the second text.

        Returns:
            ``1`` when the two texts share a cluster, ``0`` otherwise.

        Raises:
            IndexError: if either id is outside ``1..len(self.pair_labels)``.
        """
        return 1 if self._cluster_of(id1) == self._cluster_of(id2) else 0

In [3]:
def generate_submission():
    """Cluster the texts, label every pair and write ``submission.csv``.

    Reads ``normalized_texts.csv`` (indexed by ``id``; the
    ``paragraph_lemmatized`` column is what gets clustered) and ``pairs.csv``
    (indexed by ``id``; each row must unpack into the pair id plus exactly
    two text ids). One ``id,gold`` line is written per pair.

    Returns:
        The fitted ``TextsPairClassifier``.
    """
    texts = pd.read_csv("normalized_texts.csv", index_col="id", encoding="utf-8")
    pairs = pd.read_csv("pairs.csv", index_col="id")

    paragraphs = texts["paragraph_lemmatized"].to_list()
    classifier = TextsPairClassifier(paragraphs)

    with open("submission.csv", "w", encoding="utf-8") as output:
        # Header row first, then one row per pair.
        output.write("id,gold\n")
        for row in pairs.itertuples():
            pair_id, first_id, second_id = row
            verdict = classifier.label(first_id, second_id)
            output.write(f"{pair_id!s},{verdict!s}\n")

    return classifier

In [4]:
# Entry point: build the submission file and keep the fitted classifier in a
# module-level name.
if __name__ == "__main__":
    classifier = generate_submission()