**TODO (kida):** add the "Open in Colab" badge.

In [None]:
# Colab-only setup: upload kaggle.json (the Kaggle API token) via the file picker.
from google.colab import files
files.upload()

# Copy the token into ~/.kaggle and make it readable by the owner only.
# NOTE: `mkdir` fails if ~/.kaggle already exists, so this cell is not re-runnable as is.
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Download the dataset archive, extract it into ./dataset, then delete the archive.
!kaggle datasets download -d bwandowando/ukraine-russian-crisis-twitter-dataset-1-2-m-rows
!unzip ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip -d dataset
!rm ukraine-russian-crisis-twitter-dataset-1-2-m-rows.zip

In [54]:
import csv
import hashlib
import os
import pathlib
import pickle
import random
import re
import string
import unicodedata
from collections import defaultdict
from itertools import combinations
from typing import Callable, Optional

import numpy as np
import pandas as pd
import spacy
from nltk.stem import PorterStemmer
from scipy.optimize import fsolve
from tqdm.auto import tqdm

In [38]:
# Ask spaCy to use a GPU when one is available, then load the small English
# pipeline with the parser and NER components excluded.
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm', exclude=['parser', 'ner'])

In [145]:
def is_prime(n: int):
    """Return True when ``n`` is a prime number (trial division).

    Args:
        n: Candidate number. Values below 2 are never prime.

    Returns:
        bool: True if ``n`` has no divisor in ``[2, isqrt(n)]``.
    """
    import math  # local import so the cell does not depend on the import cell

    if n < 2:
        return False
    if n < 4:
        return True  # 2 and 3
    if n % 2 == 0:
        return False
    # math.isqrt is an exact integer square root; a floating-point sqrt can
    # be off by one for large integers and miss the last divisor.
    limit = math.isqrt(int(n))
    # Even divisors were ruled out above, so only odd candidates are tried.
    for i in range(3, limit + 1, 2):
        if (n % i) == 0:
            return False
    return True

def find_closest_prime(n):
    """Return the smallest prime that is greater than or equal to ``n``.

    The search only moves upwards: ``n`` itself is returned when it is
    prime, otherwise successive integers are tested with ``is_prime``.
    """
    candidate = n
    while not is_prime(candidate):
        candidate += 1
    return candidate

def get_variable_length_hash(n: int):
    """Build a string hash that keeps the ``n`` lowest bits of a SHA-256 digest.

    The digest is read as a little-endian integer and masked, so the
    returned function maps a string to an int in ``[0, 2**n)``.

    Args:
        n: Number of bits to keep. Values of 256 or more keep the whole digest.

    Returns:
        Callable[[str], int]: the hash function.

    Raises:
        ValueError: if ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"Number of hash bits must be non-negative. Found {n}.")

    # Masking replaces the former `bin(x)[-n:]` slicing, which could include
    # the 'b' of the '0b' prefix (ValueError) and returned the full digest
    # for n == 0.
    mask = (1 << n) - 1

    def inner_f(s: str):
        digest = hashlib.sha256(s.encode()).digest()
        return int.from_bytes(digest, 'little') & mask
    return inner_f

class HashGenerator:
    """Factory of random affine hash functions ``h(row) = (a * row + b) % p``.

    ``p`` is the smallest prime greater than or equal to ``num_rows`` and the
    coefficients ``a`` and ``b`` are drawn from ``[1, num_rows]``. A
    coefficient value is never handed out twice for the same role until
    ``reset`` is called.
    """

    def __init__(
        self,
        num_rows: int = np.iinfo(np.uint32).max,
    ) -> None:
        """
        Args:
            num_rows: Upper bound (inclusive) for the random coefficients and
                lower bound for the modulus.
        """
        self.num_rows = num_rows
        self.prime = find_closest_prime(num_rows)
        self.a_set = set()
        self.b_set = set()

    def get_num_rows(self) -> int:
        """Return the ``num_rows`` value given at construction."""
        return self.num_rows

    def next(self) -> Callable[[np.uint32], np.uint32]:
        """Return a new hash function with fresh ``a`` and ``b`` coefficients."""
        a = self._generate_coeff(self.a_set, self.num_rows)
        b = self._generate_coeff(self.b_set, self.num_rows)
        prime = self.prime

        def hash_row(row) -> np.uint32:
            # int(row) forces arbitrary-precision arithmetic: multiplying a
            # NumPy fixed-width integer by `a` could silently wrap around.
            # NOTE(review): with the default num_rows (the uint32 maximum,
            # which is not prime) the modulus exceeds the uint32 range, so
            # the np.uint32 conversion below can overflow - confirm whether
            # the default is ever used to hash rows.
            return np.uint32((a * int(row) + b) % prime)

        return hash_row

    def reset(self) -> None:
        """Forget every coefficient handed out so far."""
        self.a_set = set()
        self.b_set = set()

    def _generate_coeff(
        self,
        coeff_set: set[int],
        max_val: int
    ) -> int:
        """Draw a value in ``[1, max_val]`` not yet in ``coeff_set`` and record it.

        Raises:
            RuntimeError: if all ``max_val`` values are already used (the
                rejection loop below would otherwise never terminate).
        """
        if len(coeff_set) >= max_val:
            raise RuntimeError(
                f"All {max_val} distinct coefficients have already been "
                f"generated; call reset() first."
            )
        while True:
            coeff = random.randint(1, max_val)
            if coeff not in coeff_set:
                coeff_set.add(coeff)
                return coeff
    

# Preprocessing

In [146]:
def normalize_white_space(doc: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends."""
    words = doc.split()
    return " ".join(words)

def remove_https(doc: str) -> str:
    """Delete every ``http://`` or ``https://`` URL from ``doc``.

    A URL ends at the first whitespace character. The previous pattern
    stopped only at a space, so a URL followed by a newline or tab also
    deleted the text after it.
    """
    return re.sub(r'https?://\S+', '', doc)

def replace_chars(doc: str) -> str:
    """Replace each HTML-escaped ampersand (``&amp;``) with `` and ``."""
    pieces = doc.split('&amp;')
    return ' and '.join(pieces)

def remove_non_ascii(doc: str) -> str:
    """Drop every character that is not printable ASCII, Russian Cyrillic,
    or whitespace.

    Cyrillic letters are kept because of the nature of the dataset.
    """
    cyr_chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"

    def is_allowed(c: str) -> bool:
        if c.isascii() and c.isprintable():
            return True
        return (c in cyr_chars) or c.isspace()

    return "".join(filter(is_allowed, doc))

def strip_accents(doc: str) -> str:
    """Replace accented characters with their unaccented counterpart.

    The text is NFKD-normalized, which splits an accented letter into a
    base letter plus combining marks and maps compatibility characters
    such as 𝕒, 𝕕, 𝕖, 𝙖, 𝙘, 𝙙 to their plain form. The combining marks
    are then dropped; normalization alone would leave them in the string.
    """
    decomposed = unicodedata.normalize('NFKD', doc)
    return ''.join(
        ch for ch in decomposed
        if not unicodedata.combining(ch)
    )

def strip_punctuation(doc: str) -> str:
    """Delete every character of ``string.punctuation`` from ``doc``."""
    deletion_table = str.maketrans('', '', string.punctuation)
    return doc.translate(deletion_table)
    
def get_lemmatizer(
    nlp: "spacy.language.Language",
    allow_stop_words: bool = False,
    allow_punct: bool = False,
    allow_numbers: bool = False
) -> Callable[[str], str]:
    """Build a function that lemmatizes a text and filters its tokens.

    Args:
        nlp: Callable turning a string into an iterable of tokens exposing
            ``lemma_``, ``is_stop``, ``is_punct`` and ``pos_`` (a loaded
            spaCy pipeline).
        allow_stop_words: Keep tokens flagged ``is_stop``.
        allow_punct: Keep tokens flagged ``is_punct``.
        allow_numbers: Keep tokens whose ``pos_`` is ``'NUM'``.

    Returns:
        A function mapping a document to the space-joined lemmas of the
        tokens that were kept. Tokens whose ``pos_`` is ``'X'`` are always
        dropped.
    """
    def keep(token) -> bool:
        if token.pos_ == 'X':
            return False
        if token.is_stop and not allow_stop_words:
            return False
        if token.is_punct and not allow_punct:
            return False
        if token.pos_ == 'NUM' and not allow_numbers:
            return False
        return True

    def inner_f(doc):
        return ' '.join(
            token.lemma_
            for token in nlp(doc)
            if keep(token)
        )
    return inner_f

# Model

In [147]:
def jaccard_similarity(
    x: np.ndarray,
    y: np.ndarray
) -> float:
    """Jaccard similarity ``|X ∩ Y| / |X ∪ Y|`` of the distinct items of ``x`` and ``y``.

    Two empty inputs are treated as identical and yield 1.0 instead of
    dividing by zero.
    """
    set_x, set_y = set(x), set(y)
    union_size = len(set_x | set_y)
    if union_size == 0:
        return 1.0
    return len(set_x & set_y) / union_size

def mean_absolute_error(
    x: list,
    y: list
) -> float:
    """Mean of ``|y_i - x_i|`` over the paired items of ``x`` and ``y``.

    Returns NaN when ``x`` is empty (the mean is undefined) instead of
    raising ``ZeroDivisionError``. The sum runs over ``zip(x, y)`` and is
    divided by ``len(x)``, so both inputs are expected to have equal length.
    """
    if len(x) == 0:
        return float('nan')
    total = sum(np.abs(val2 - val1) for val1, val2 in zip(x, y))
    return total / len(x)

def evaluate_on_cm(
    sig_dict: dict, 
    cm_dict: dict
) -> tuple[int, float]:
    """Compare two ``pair -> similarity`` mappings.

    Returns:
        A tuple ``(num_wrong, mae)``: the number of keys of ``sig_dict``
        that are missing from ``cm_dict``, and the mean absolute error
        between the two similarities over the keys they share.
    """
    common = set(sig_dict).intersection(set(cm_dict))

    # Both lists walk the same set object, so their items stay aligned.
    sig_values = [sig_dict[pair] for pair in common]
    cm_values = [cm_dict[pair] for pair in common]

    num_wrong = len(sig_dict) - len(common)
    mae = mean_absolute_error(sig_values, cm_values)
    return num_wrong, mae

class LSHModel:
    """Near-duplicate detector: k-shingling, MinHash signatures and banded LSH.

    Documents are added one by one with ``add_document`` (each becomes an
    array of hashed character shingles). ``get_similar_pairs`` then builds
    the signature matrix, bands it to find candidate pairs and keeps the
    candidates whose signature agreement reaches ``threshold``.
    """

    # Attributes persisted as ``<checkpoint_path>/<name>.npy``.
    _CHECKPOINT_ATTRS = (
        'docs_dict',
        'shingle_set',
        'char_set',
        'signature',
        'sig_idx',
        'candidate_pairs',
        'fp_pairs',
        'similar_pairs',
    )

    def __init__(
        self,
        k: int,
        threshold: float,
        num_hashes: int,
        shingle_hash_bits: int,
        track_shingles: bool = False,
        checkpoint_path: Optional[str] = None
    ) -> None:
        """
        Args:
            k: Shingle length in characters.
            threshold: Minimum similarity for a pair to be reported.
            num_hashes: Number of hash functions (rows of the signature).
            shingle_hash_bits: Shingles are hashed into ``2 ** bits`` buckets.
            track_shingles: Also record the raw shingles and characters seen.
            checkpoint_path: Directory used by the checkpoint methods.

        Raises:
            ValueError: if ``k`` is below 1 or ``num_hashes`` exceeds the
                number of shingle buckets.
        """
        if k < 1:
            raise ValueError(
                f"Shingle length k must be at least 1. Found {k}."
            )

        self.k = k
        self.threshold = threshold
        self.num_hashes = num_hashes
        self.shingle_set = set()
        self.char_set = set()
        self.shingle_hash_bits = shingle_hash_bits
        self.shingle_hash = get_variable_length_hash(
            shingle_hash_bits
        )
        self.num_shingles = 2 ** shingle_hash_bits
        self.track_shingles = track_shingles
        self.checkpoint_path = checkpoint_path
        self.num_docs = 0
        self.docs_dict = dict()
        self.signature = None
        self.candidate_pairs = set()
        self.fp_pairs = set()
        self.similar_pairs = set()
        self.b = -1
        self.r = -1
        self.sig_idx = -1

        if self.num_hashes > self.num_shingles:
            raise ValueError(
                f"Number of hash functions must be lower than "
                f"or equal to the number of shingles. Found "
                f"{self.num_hashes} hash functions and "
                f"{self.num_shingles} shingles."
            )

    def _checkpoint_file(self, name: str) -> str:
        """Path of the ``.npy`` file storing attribute ``name``."""
        return f'{self.checkpoint_path}/{name}.npy'

    def load_checkpoint(
        self,
        checkpoint_path: Optional[str] = None
    ) -> None:
        """Restore every attribute that has a file in the checkpoint directory.

        Args:
            checkpoint_path: Overrides (and replaces) ``self.checkpoint_path``.

        Raises:
            ValueError: if no checkpoint path is known.
        """
        if checkpoint_path is not None:
            self.checkpoint_path = checkpoint_path
        if self.checkpoint_path is None:
            raise ValueError(
                "Checkpoint path not found"
            )

        for attr in self._CHECKPOINT_ATTRS:
            file_path = self._checkpoint_file(attr)
            if not os.path.isfile(file_path):
                continue
            # NOTE: allow_pickle=True unpickles arbitrary objects; only load
            # checkpoints that this notebook wrote itself.
            loaded = np.load(file_path, allow_pickle=True)
            if attr == 'signature':
                # Older checkpoints may hold a pickled ``None`` (a 0-d
                # array) when they were saved before a signature existed.
                value = loaded if loaded.ndim == 2 else None
            else:
                # Sets, dicts and ints are stored as 0-d arrays.
                value = loaded.item()
            setattr(self, attr, value)

        # Keep the document counter in sync so that add_document appends
        # after the restored documents instead of overwriting document 0.
        self.num_docs = len(self.docs_dict)

    def save_checkpoint(
        self,
        checkpoint_path: Optional[str] = None
    ) -> None:
        """Write the persisted attributes to the checkpoint directory.

        Attributes that are still ``None`` (the signature before
        ``get_similar_pairs`` has run) are skipped, so that a later
        signature build does not try to resume from a ``None`` matrix.

        Raises:
            ValueError: if no checkpoint path is known.
        """
        if checkpoint_path is not None:
            self.checkpoint_path = checkpoint_path
        if self.checkpoint_path is None:
            raise ValueError(
                "Checkpoint path not found"
            )

        os.makedirs(self.checkpoint_path, exist_ok=True)
        for attr in self._CHECKPOINT_ATTRS:
            value = getattr(self, attr)
            if value is None:
                continue
            np.save(self._checkpoint_file(attr), value)

    def add_document(
        self,
        doc: str,
        preprocessing_pipeline: Optional[list[Callable[[str], str]]] = None
    ) -> None:
        """Preprocess ``doc``, shingle it and store it under the next document id."""
        if preprocessing_pipeline is not None:
            for f in preprocessing_pipeline:
                doc = f(doc)
        shingles = self._create_shingles(
            doc,
            self.k,
            self.track_shingles,
            self.shingle_hash
        )
        self.docs_dict[self.num_docs] = shingles
        self.num_docs += 1

    def get_similar_pairs(
        self,
        checkpoint_path: Optional[str] = None,
        checkpoint_freq: int = 10000
    ) -> set[tuple[int, int]]:
        """Run signature building, LSH banding and threshold filtering.

        Returns:
            The set of ``((doc_a, doc_b), similarity)`` tuples whose signature
            similarity is at least ``self.threshold``. Candidates below the
            threshold are stored in ``self.fp_pairs``.
        """
        # NOTE(review): the hash functions are drawn anew on every call and
        # are not checkpointed, so a signature resumed from disk was partly
        # built with different functions - confirm this is acceptable.
        hg = HashGenerator(self.num_shingles)
        hash_functions = [
            hg.next()
            for _ in range(self.num_hashes)
        ]
        self.signature = self._build_signature(
            self.docs_dict,
            self.num_shingles,
            hash_functions,
            checkpoint_path,
            checkpoint_freq
        )
        self.b, self.r = self._find_lsh_params(
            self.threshold,
            self.num_hashes
        )
        self.candidate_pairs = self._lsh(
            self.signature,
            self.b
        )
        self.similar_pairs, self.fp_pairs = \
            self._check_threshold_on_signature(
                self.candidate_pairs,
                self.signature,
                self.threshold
            )
        return self.similar_pairs

    def _create_shingles(
        self,
        doc: str,
        k: int,
        track_shingles: bool,
        hash_f: Callable[[str], int]
    ) -> np.ndarray:
        """Return the sorted unique hashes of all length-``k`` substrings of ``doc``.

        When ``track_shingles`` is set, the raw shingles and their characters
        are also added to ``self.shingle_set`` and ``self.char_set``.
        A document shorter than ``k`` yields an empty array.
        """
        hashed = []

        # len(doc) - k + 1 start positions; the former `doc[:-k+1]` slice
        # was empty for k == 1 and therefore produced no shingles at all.
        for i in range(len(doc) - k + 1):
            shingle = doc[i:i+k]
            if track_shingles:
                self.shingle_set.add(shingle)
                self.char_set.update(shingle)
            hashed.append(hash_f(shingle))

        return np.unique(hashed).astype(np.uint32)

    def _build_signature(
        self,
        docs_dict: dict[int, np.ndarray],
        num_rows: int,
        hash_functions: list[Callable[[np.uint32], np.uint32]],
        checkpoint_path: Optional[str] = None,
        checkpoint_freq: int = 10000
    ) -> np.ndarray:
        """Build the MinHash signature matrix (hash functions x documents).

        For every row (shingle id) ``r`` and every document containing ``r``,
        each signature entry is lowered to the hash of ``r`` when that hash
        is smaller. Progress is saved every ``checkpoint_freq`` rows when a
        checkpoint path is set, and resumed from disk when a signature of
        the expected shape is found there.

        Returns:
            The signature as a ``uint32`` array. Columns of documents with no
            shingles hold the ``uint32`` maximum.
        """
        if checkpoint_path is not None:
            self.checkpoint_path = checkpoint_path
        use_checkpoint = self.checkpoint_path is not None
        if use_checkpoint:
            os.makedirs(self.checkpoint_path, exist_ok=True)

        sig_path = self._checkpoint_file('signature')
        sig_idx_path = self._checkpoint_file('sig_idx')
        expected_shape = (len(hash_functions), len(docs_dict))

        signature = None
        if use_checkpoint and \
            os.path.isfile(sig_path) and \
            os.path.isfile(sig_idx_path):
            loaded = np.load(sig_path, allow_pickle=True)
            # Ignore checkpoints that cannot belong to this run (a pickled
            # ``None`` or a matrix built for other documents / hash counts).
            if loaded.shape == expected_shape:
                signature = loaded
                self.sig_idx = int(np.load(sig_idx_path, allow_pickle=True))
                print(f"Loaded signature from row {self.sig_idx}")

        if signature is None:
            signature = np.full(expected_shape, fill_value=np.inf)
            self.sig_idx = -1

        for r in tqdm(
            range(0, num_rows),
            total=num_rows,
            desc='[signature matrix] row number',
            leave=False
        ):
            # Rows before the checkpointed one are already folded in; the
            # checkpointed row itself is redone, which is harmless for a min.
            if r < self.sig_idx:
                continue

            hash_values = np.array([f(r) for f in hash_functions])
            for c, shingles in enumerate(docs_dict.values()):
                if r in shingles:
                    signature[:, c] = np.minimum(signature[:, c], hash_values)

            self.sig_idx = r
            if use_checkpoint and (self.sig_idx % checkpoint_freq == 0):
                np.save(sig_path, signature)
                np.save(sig_idx_path, self.sig_idx)

        if use_checkpoint:
            np.save(sig_path, signature)
            np.save(sig_idx_path, self.sig_idx)

        # Entries never lowered are still +inf (documents without shingles);
        # casting inf to an unsigned integer is undefined, so map them to a
        # fixed sentinel first.
        sentinel = np.iinfo(np.uint32).max
        signature = np.where(np.isfinite(signature), signature, sentinel)
        return signature.astype(np.uint32)

    def _find_lsh_params(self, t: float, n: int) -> tuple[int, int]:
        """Solve ``t = (1/b) ** (1/r)`` and ``n = b * r`` for the band count.

        A lower b means that two items must match a higher
        number of rows. By taking the floor of b, we favor
        more similar pairs.

        Returns:
            ``(b, r)`` with ``b`` clamped to ``[1, max(n, 1)]`` and
            ``r = n // b``.
        """
        def equations(vars):
            b, r = vars
            eq1 = t - (1 / b) ** (1 / r)
            eq2 = n - b * r
            return [eq1, eq2]

        solved_b, _ = fsolve(equations, (1, 1))
        b = np.floor(solved_b)
        # The solver may fail to converge and return NaN, zero or a negative
        # value; fall back to a single band rather than dividing by zero.
        if not np.isfinite(b) or b < 1:
            b = 1
        # More bands than signature rows would create empty bands.
        b = max(1, min(int(b), n))
        return int(b), int(n // b)

    def _lsh(
        self,
        signature: np.ndarray,
        b: int
    ) -> set[tuple[int, int]]:
        """Return the column pairs that are identical in at least one band.

        The signature rows are split into ``b`` bands with
        ``np.array_split``; within a band, columns with the same values
        fall into the same bucket and every pair inside a bucket becomes a
        candidate.
        """
        candidate_pairs = set()

        for band in np.array_split(signature, b):
            # An empty band would make every pair of columns look identical.
            if band.shape[0] == 0:
                continue

            # column tuple -> list of column indices having that tuple
            buckets = defaultdict(list)
            for c in range(band.shape[1]):
                buckets[tuple(band[:, c])].append(c)

            for columns in buckets.values():
                if len(columns) >= 2:
                    candidate_pairs.update(combinations(columns, 2))

        return candidate_pairs

    def _check_threshold_on_signature(
        self,
        candidate_pairs: set[tuple[int, int]],
        signature: np.ndarray,
        t: float
    ) -> tuple[set, set]:
        """Split candidates by the fraction of signature rows on which they agree.

        Returns:
            ``(similar_pairs, false_positive_pairs)``; both are sets of
            ``((x, y), similarity)`` tuples, the first holding the pairs
            with ``similarity >= t``.
        """
        similar_pairs = set()
        false_positive_pairs = set()
        num_rows = signature.shape[0]

        for (x, y) in candidate_pairs:
            matches = np.count_nonzero(signature[:, x] == signature[:, y])
            similarity = matches / num_rows
            tup = ((x, y), similarity)
            if similarity >= t:
                similar_pairs.add(tup)
            else:
                false_positive_pairs.add(tup)

        return similar_pairs, false_positive_pairs

    def check_threshold_on_cm(
        self,
        candidate_pairs: Optional[set[tuple[int, int]]] = None,
        docs_dict: Optional[dict[int, np.ndarray]] = None,
        t: Optional[float] = None
    ) -> tuple[set, set]:
        """Split candidates by the exact Jaccard similarity of their shingles.

        Every argument defaults to the model's own state
        (``self.candidate_pairs``, ``self.docs_dict``, ``self.threshold``).

        Returns:
            ``(similar_pairs, false_positive_pairs)``; both are sets of
            ``((x, y), similarity)`` tuples, the first holding the pairs
            with ``similarity >= t``.
        """
        if candidate_pairs is None:
            candidate_pairs = self.candidate_pairs
        if docs_dict is None:
            docs_dict = self.docs_dict
        if t is None:
            t = self.threshold

        similar_pairs = set()
        false_positive_pairs = set()

        for (x, y) in candidate_pairs:
            similarity = jaccard_similarity(docs_dict[x], docs_dict[y])
            tup = ((x, y), similarity)
            if similarity >= t:
                similar_pairs.add(tup)
            else:
                false_positive_pairs.add(tup)

        return similar_pairs, false_positive_pairs

    def get_shingle_set(self) -> set[str]:
        """Raw shingles recorded so far (filled only when ``track_shingles`` is set)."""
        return self.shingle_set

    def get_char_set(self) -> set[str]:
        """Characters recorded so far (filled only when ``track_shingles`` is set)."""
        return self.char_set

def train_model(
    model: LSHModel, 
    data_path: str, 
    num_docs: int,
    verbose: bool = False,
    preprocessing_pipeline: Optional[list[Callable[[str], str]]] = None  
) -> LSHModel:
    """Feed up to ``num_docs`` English tweets from ``data_path`` into ``model``.

    Every regular file in ``data_path`` is read as a gzip-compressed CSV.
    Rows whose ``language`` column is ``'en'`` are kept, exact duplicates of
    the ``text`` column are dropped (per file), and each remaining text is
    passed to ``model.add_document``.

    Args:
        model: Model receiving the documents (mutated in place).
        data_path: Directory holding the dataset files.
        num_docs: Number of documents to add; reading stops as soon as the
            count is reached. A negative value never reaches zero, so every
            document is read.
        verbose: Print the file names and the number of duplicates found.
        preprocessing_pipeline: Forwarded to ``model.add_document``.

    Returns:
        The same ``model`` instance.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # selected documents the same on every run and machine.
    files = sorted(
        full_path
        for full_path in (
            os.path.join(data_path, name)
            for name in os.listdir(data_path)
        )
        if os.path.isfile(full_path)
    )

    duplicates = 0
    count = num_docs

    with tqdm(
        total=num_docs,
        desc='Adding documents to model',
        leave=False
    ) as pbar:
        for file in files:
            if count == 0:
                break

            if verbose:
                print(f'Reading file {file}')
            df = pd.read_csv(
                file, 
                compression='gzip', 
                index_col=0,
                encoding='utf-8', 
                quoting=csv.QUOTE_ALL,
                low_memory=False
            )

            df = df[df['language'] == 'en']
            df_unique = df.drop_duplicates(subset=['text'])
            duplicates += len(df) - len(df_unique)

            # Only the text column is needed; iterating the Series avoids
            # building a row object per tweet. Missing texts are skipped
            # because the preprocessing functions expect strings.
            texts = df_unique['text'].dropna()

            for text in tqdm(
                texts,
                total=len(texts),
                desc='Reading file',
                leave=False
            ):
                model.add_document(
                    text,
                    preprocessing_pipeline
                )

                count -= 1
                pbar.update(1)
                if count == 0:
                    break

    if verbose:       
        print(f'Found {duplicates} duplicates in files')

    return model

# Test

In [None]:
dataset_path = r"E:\datasets\ukraine"

In [None]:
# Collect the regular files found directly inside dataset_path.
# NOTE: os.listdir order is arbitrary, so files[0] is not a fixed file.
files = []

for name in os.listdir(dataset_path):
    full_path = os.path.join(dataset_path, name)
    if os.path.isfile(full_path):
        files.append(full_path)

# Read the first file as a gzip-compressed CSV whose first column is the index.
df = pd.read_csv(
    files[0], 
    compression='gzip', 
    index_col=0,
    encoding='utf-8', 
    quoting=csv.QUOTE_ALL
)

In [None]:
df = df[df['language'] == 'en']

In [None]:
char_set = set()

In [None]:
# Preprocessing steps applied in order to every tweet text.
pipeline = [
    get_lemmatizer(nlp),
    strip_accents,
    str.lower,
    remove_https,
    replace_chars,
    strip_punctuation,
    remove_non_ascii,
    normalize_white_space
]

# NOTE: docs is reset here but not filled in this cell.
docs = []

# Accumulate the distinct characters that survive preprocessing.
for index, row in tqdm(
    df.iterrows(),
    total=len(df),
):
    text = row['text']
    for f in pipeline:
        text = f(text)
    text_set = set(text)
    char_set = char_set.union(text_set)
    # `index` is the DataFrame index label, not a running counter
    # NOTE(review): presumably meant to cap the sample at ~10k rows - confirm.
    if index > 10000:
        break

In [None]:
# Same preprocessing steps as above; the loop below does not apply them.
pipeline = [
    get_lemmatizer(nlp),
    strip_accents,
    str.lower,
    remove_https,
    replace_chars,
    strip_punctuation,
    remove_non_ascii,
    normalize_white_space
]

docs = []

# Collect the raw tweet texts until the index label exceeds 10000.
for index, row in tqdm(
    df.iterrows(),
    total=len(df),
):
    text = row['text']
    docs.append(text)
    if index > 10000:
        break

In [None]:
len(docs)

In [None]:
# Batched, multi-process lemmatization of the raw texts.
# NOTE: `a` is overwritten on every iteration, so only the lemmas of the
# last document remain after the loop.
for doc in nlp.pipe(docs, batch_size=64, n_process=4, disable=["parser", "ner"]):
    a = ([tok.lemma_ for tok in doc])

In [None]:
# Print the Porter stems of each document, split on single spaces.
# NOTE: this prints one list per document and can flood the output.
stemmer = PorterStemmer()

for doc in docs:
    print([stemmer.stem(token) for token in doc.split(' ')])

## check

In [None]:
len(char_set)

In [None]:
char_set

In [None]:
len(char_set)

In [None]:
'誌'.isalnum()

In [None]:
char_set

In [None]:
df[0:2].T

# Test Small

In [None]:
hg = HashGenerator()
docs = [
    "Lincoln was born into poverty in a log cabin in Kentucky and was raised on the frontier, primarily in Indiana. He was self-educated and became a lawyer, Whig Party leader, Illinois state legislator, and U.S. Congressman from Illinois. In 1849, he returned to his law practice but became vexed by the opening of additional lands to slavery as a result of the Kansas–Nebraska Act of 1854. He reentered politics in 1854, becoming a leader in the new Republican Party, and he reached a national audience in the 1858 Senate campaign debates against Stephen Douglas. Lincoln ran for President in 1860, sweeping the North to gain victory. Pro-slavery elements in the South viewed his election as a threat to slavery, and Southern states began seceding from the Union. During this time the newly formed Confederate States of America began seizing federal military bases in the south. Just over one month after Lincoln assumed the presidency, the Confederate States attacked Fort Sumter, a U.S. fort in South Carolina. Following the bombardment, Lincoln mobilized forces to suppress the rebellion and restore the Union.",
    "Abraham Lincoln was born on February 12, 1809, the second child of Thomas Lincoln and Nancy Hanks Lincoln, in a log cabin on Sinking Spring Farm near Hodgenville, Kentucky.[2] He was a descendant of Samuel Lincoln, an Englishman who migrated from Hingham, Norfolk, to its namesake, Hingham, Massachusetts, in 1638. The family then migrated west, passing through New Jersey, Pennsylvania, and Virginia.[3] Lincoln's paternal grandparents, his namesake Captain Abraham Lincoln and wife Bathsheba (née Herring) moved the family from Virginia to Jefferson County, Kentucky.[b] The captain was killed in an Indian raid in 1786.[5] His children, including eight-year-old Thomas, Abraham's father, witnessed the attack.[6][c] Thomas then worked at odd jobs in Kentucky and Tennessee before the family settled in Hardin County, Kentucky, in the early 1800s.",
    "A supernova is a powerful and luminous stellar explosion. This transient astronomical event occurs during the last evolutionary stages of a massive star or when a white dwarf is triggered into runaway nuclear fusion. The original object, called the progenitor, either collapses to a neutron star or black hole, or is completely destroyed. The peak optical luminosity of a supernova can be comparable to that of an entire galaxy before fading over several weeks or months. Supernovae are more energetic than novae. In Latin, nova means new, referring astronomically to what appears to be a temporary new bright star. Adding the prefix super- distinguishes supernovae from ordinary novae, which are far less luminous. The word supernova was coined by Walter Baade and Fritz Zwicky in 1929.",
    "The most recent directly observed supernova in the Milky Way was Kepler's Supernova in 1604, but the remnants of more recent supernovae have been found. Observations of supernovae in other galaxies suggest they occur in the Milky Way on average about three times every century. These supernovae would almost certainly be observable with modern astronomical telescopes. The most recent naked-eye supernova was SN 1987A, the explosion of a blue supergiant star in the Large Magellanic Cloud, a satellite of the Milky Way."
]

In [None]:
# Tiny hand-written corpus for a quick end-to-end check; the first sentence
# has irregular spacing (presumably to exercise normalize_white_space).
hg = HashGenerator()
docs = [
    "The    pen is     on   the       table",
    "The cat is eating something on the table",
    "I watched soccer on television"
]

In [None]:
# LSHModel takes no `hash_generator` argument (it creates its own
# HashGenerator in get_similar_pairs) and requires `shingle_hash_bits`.
model = LSHModel(
    k=5,
    threshold=0.1,
    num_hashes=10,
    shingle_hash_bits=16
)

In [None]:
# Reduced pipeline for the small test: drop URLs, collapse whitespace, lemmatize.
preprocessing_pipeline = [
    remove_https,
    normalize_white_space,
    get_lemmatizer(nlp)
]

In [None]:
for doc in docs:
    model.add_document(doc, preprocessing_pipeline)

In [None]:
model.get_similar_pairs()

In [None]:
model.check_threshold_on_cm(
    model.candidate_pairs,
    model.docs_dict,
    model.threshold
)

In [None]:
model.docs_dict

In [None]:
model.signature

# Test 1k

In [4]:
# Machine-specific location of the extracted dataset; adjust before running.
dataset_path = r"E:\datasets\ukraine"
files = []

# Collect the regular files found directly inside dataset_path.
for name in os.listdir(dataset_path):
    full_path = os.path.join(dataset_path, name)
    if os.path.isfile(full_path):
        files.append(full_path)

In [None]:
# Read the first listed file as a gzip-compressed CSV (first column is the index).
df = pd.read_csv(
    files[0], 
    compression='gzip', 
    index_col=0,
    encoding='utf-8', 
    quoting=csv.QUOTE_ALL
)

# Keep only the rows whose language column is 'en'.
df = df[df['language'] == 'en']

In [8]:
# Drop rows whose text repeats an earlier row, then show how many were removed.
unique_tweets = df.drop_duplicates(subset = ['text'])
len(df) - len(unique_tweets)

191292

In [25]:
# 5-character shingles hashed into 2**16 buckets, 100 hash functions,
# pairs reported from a similarity of 0.1.
model = LSHModel(
    k=5,
    threshold=0.1,
    num_hashes=100,
    shingle_hash_bits=16
)

In [27]:
2 ** 16

65536

In [26]:
# Preprocessing steps applied in order to every tweet before shingling.
preprocessing_pipeline = [
    get_lemmatizer(nlp),
    strip_accents,
    str.lower,
    remove_https,
    replace_chars,
    strip_punctuation,
    remove_non_ascii,
    normalize_white_space
]

# Number of documents to add to the model.
count = 1000

for index, row in tqdm(
    unique_tweets.iterrows(),
    total=len(unique_tweets),
):
    text = row['text']
    model.add_document(text, preprocessing_pipeline)
    
    # Stop once `count` documents have been added.
    count -= 1
    if count == 0:
        break

  2%|▏         | 999/63334 [00:04<05:08, 202.19it/s]


In [28]:
tp = model.get_similar_pairs()

                                                                                     

In [29]:
len(tp)

7894

In [30]:
len(model.fp_pairs)

152965

In [31]:
# Re-score the LSH candidates with the exact Jaccard similarity of their
# shingle arrays: ttp holds the pairs at or above the threshold, tfp the rest.
ttp, tfp = model.check_threshold_on_cm(
    model.candidate_pairs,
    model.docs_dict,
    model.threshold
)

In [32]:
len(ttp)

5544

In [None]:
unique_tweets = df.drop_duplicates(subset = ['text'])

In [None]:
len(df)

In [None]:
len(unique_tweets)

# Experiments

In [148]:
# Use the local Windows copy of the dataset when it exists; otherwise fall
# back to ./dataset, where the Colab setup cell extracts the archive.
if os.path.isdir(r'e:\datasets\ukraine'):
    DATA_PATH = r'e:\datasets\ukraine'
else:
    DATA_PATH = os.path.join(os.getcwd(), 'dataset')

In [149]:
# Preprocessing used by all experiments, applied in this order. Lemmatization
# runs first, on the raw text; the later steps clean the joined lemmas.
preprocessing_pipeline = [
    get_lemmatizer(nlp),
    strip_accents,
    str.lower,
    remove_https,
    replace_chars,
    strip_punctuation,
    remove_non_ascii,
    normalize_white_space
]

## Shingle and character number growth

In [150]:
# For each shingle length and corpus size, count the distinct characters and
# raw shingles seen. Each configuration is cached in its own checkpoint
# directory and reloaded when that directory is not empty.
for k in [4, 5, 6]:
    for num_docs in [
        10, 100, 1000, 10000, 
        20000, 30000, 50000,
        70000, 100000, 150000,
        200000
    ]:
        ckpt_path = f'checkpoints/k{k}_d{num_docs}'
        # track_shingles=True makes the model record char_set and shingle_set.
        model = LSHModel(
            k=k,
            threshold=0.1,
            num_hashes=100,
            shingle_hash_bits=16,
            track_shingles=True,
            checkpoint_path=ckpt_path
        )

        if os.path.isdir(ckpt_path) and \
            len(os.listdir(ckpt_path)) > 0:
            model.load_checkpoint()
        else:
            model = train_model(
                model=model, 
                data_path=DATA_PATH,
                num_docs=num_docs,
                verbose=False,
                preprocessing_pipeline=preprocessing_pipeline,
            )
            model.save_checkpoint()

        print(
            f'[{k} k, {num_docs} docs]:\n'
            f'\t{len(model.char_set)} characters\n'
            f'\t{len(model.shingle_set)} shingles\n'
        )

                                                                         

[4 k, 10 docs]:
	38 characters
	1138 shingles



                                                                           

[4 k, 100 docs]:
	47 characters
	5783 shingles



                                                                               

[4 k, 1000 docs]:
	57 characters
	19435 shingles



                                                                                

[4 k, 10000 docs]:
	63 characters
	56358 shingles



                                                                                 

[4 k, 20000 docs]:
	65 characters
	76335 shingles



                                                                                 

[4 k, 30000 docs]:
	66 characters
	91754 shingles



                                                                                 

[4 k, 50000 docs]:
	67 characters
	116291 shingles



                                                                                 

[4 k, 70000 docs]:
	67 characters
	131465 shingles



                                                                                  

[4 k, 100000 docs]:
	68 characters
	148594 shingles



                                                                                   

[4 k, 150000 docs]:
	68 characters
	168011 shingles



                                                                                    

[4 k, 200000 docs]:
	68 characters
	183544 shingles

[5 k, 10 docs]:
	38 characters
	1251 shingles

[5 k, 100 docs]:
	47 characters
	7658 shingles

[5 k, 1000 docs]:
	57 characters
	36739 shingles

[5 k, 10000 docs]:
	63 characters
	137366 shingles

[5 k, 20000 docs]:
	65 characters
	198130 shingles

[5 k, 30000 docs]:
	66 characters
	247196 shingles

[5 k, 50000 docs]:
	67 characters
	329268 shingles

[5 k, 70000 docs]:
	67 characters
	385285 shingles

[5 k, 100000 docs]:
	68 characters
	451278 shingles

[5 k, 150000 docs]:
	68 characters
	529304 shingles

[5 k, 200000 docs]:
	68 characters
	594565 shingles



                                                                         

[6 k, 10 docs]:
	38 characters
	1308 shingles



                                                                           

[6 k, 100 docs]:
	47 characters
	8689 shingles



                                                                              

[6 k, 1000 docs]:
	57 characters
	51954 shingles



                                                                                

[6 k, 10000 docs]:
	63 characters
	245526 shingles



                                                                                 

[6 k, 20000 docs]:
	65 characters
	373659 shingles



                                                                                 

[6 k, 30000 docs]:
	66 characters
	478378 shingles



                                                                                 

[6 k, 50000 docs]:
	67 characters
	656358 shingles



                                                                                 

[6 k, 70000 docs]:
	67 characters
	783192 shingles



                                                                                  

[6 k, 100000 docs]:
	68 characters
	937405 shingles



                                                                                   

[6 k, 150000 docs]:
	68 characters
	1126938 shingles



Adding documents to model:   2%|▏         | 3058/200000 [00:19<15:30, 211.54it/s]

## Number of hash buckets

In [None]:
import timeit

In [None]:
# For each number of shingle-hash bits, measure the run time and compare the
# signature-based similarities with the exact Jaccard similarities.
for n_bits in [16, 18, 19, 20]:
    ckpt_path = f'checkpoints/n_bits{n_bits}'
    time_path = f'{ckpt_path}/time.npy'

    model = LSHModel(
        k=5,
        threshold=0.1,
        num_hashes=100,
        shingle_hash_bits=n_bits,
        track_shingles=True,
        checkpoint_path=ckpt_path
    )

    has_checkpoint = os.path.isdir(ckpt_path) and \
        len(os.listdir(ckpt_path)) > 0

    start_time = timeit.default_timer()

    if has_checkpoint:
        model.load_checkpoint()
    else:
        model = train_model(
            model=model, 
            data_path=DATA_PATH,
            num_docs=1000,
            verbose=False,
            preprocessing_pipeline=preprocessing_pipeline,
        )

    sig_tp = dict(model.get_similar_pairs())
    # check_threshold_on_cm is given its inputs explicitly.
    cm_tp, _ = model.check_threshold_on_cm(
        model.candidate_pairs,
        model.docs_dict,
        model.threshold
    )
    cm_tp = dict(cm_tp)

    elapsed = timeit.default_timer() - start_time

    if has_checkpoint:
        # Report the time recorded by the run that built the checkpoint;
        # it is unknown when that run did not store it.
        if os.path.isfile(time_path):
            time_delta = np.load(time_path, allow_pickle=True)
        else:
            time_delta = float('nan')
    else:
        # Elapsed time is end minus start (the sign used to be inverted).
        time_delta = elapsed
        # Saved only after the signature exists, so the checkpoint never
        # contains an empty signature.
        model.save_checkpoint()
        np.save(time_path, time_delta)

    num_wrong, mae = evaluate_on_cm(sig_tp, cm_tp)

    print(
        f'[{n_bits} bits]:\n'
        f'\t{time_delta} time delta\n'
        f'\t{num_wrong} wrong\n'
        f'\t{mae} MAE\n'
    )