*Task 1*

In [22]:
import pandas as pd
import numpy as np
from hashlib import sha256
from scipy.sparse import csr_matrix

class MinHashLSHFromDataFrame:
    """Shingling, MinHash signatures and banded LSH over one text column of a DataFrame.

    Pipeline (each stage is computed lazily and cached on the instance):

    1. ``shingling``  - character k-shingles -> sparse document x shingle matrix.
    2. ``minhashing`` - one MinHash value per (document, hash function).
    3. ``locality_sensitivity_hashing`` - signatures are cut into bands and
       documents sharing an identical band land in the same bucket.

    Shingles are hashed through their integer index (assigned in order of first
    appearance while scanning the documents), not through their text, so
    signatures depend on the order of the documents in ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the documents; a document is addressed by its row position.
    content_column : str
        Name of the column containing the document strings.
    num_hash_functions : int, default 100
        Length of every MinHash signature.
    """

    def __init__(self, dataframe, content_column, num_hash_functions=100):
        self.dataframe = dataframe
        self.content_column = content_column
        self.num_hash_functions = num_hash_functions
        self.hash_functions = [self._generate_hash_function(i) for i in range(num_hash_functions)]
        self.signatures = None          # DataFrame (docs x hash functions), set by minhashing()
        self.buckets = None             # dict bucket_id -> set of row positions, set by LSH
        self.shingling_result = None    # csr_matrix (docs x shingles), set by shingling()
        self.shingle_mapping = {}       # shingle text -> column index
        self.shingles_set = set()
        # Banding that produced self.buckets; queries must slice signatures the same way.
        self.num_bands = None
        self.rows_per_band = None

    def _generate_hash_function(self, seed):
        """Return a hash function ``x -> int`` built from SHA-256 of ``"{seed}_{x}"``."""
        def hash_function(x):
            return int(sha256(f"{seed}_{x}".encode()).hexdigest(), 16)
        return hash_function

    @staticmethod
    def _bucket_id(band_index, band_values):
        """Bucket key for one band of one signature.

        Indexing and querying both go through this helper so that the values
        are always hashed as plain Python floats; hashing exact integers on one
        side and rounded floats on the other yields keys that never match.
        """
        return hash((band_index, tuple(float(value) for value in band_values)))

    def _doc_shingle_ids(self, doc_idx):
        """Return the set of shingle column indices present in document ``doc_idx``."""
        if self.shingling_result is None:
            self.shingling()
        return set(self.shingling_result[doc_idx].nonzero()[1])

    def shingling(self, k=3):
        """Build (once) the sparse binary document x shingle matrix.

        Parameters
        ----------
        k : int, default 3
            Shingle length in characters. Only the first call computes
            anything; later calls return the cached matrix whatever ``k`` is.

        Returns
        -------
        scipy.sparse.csr_matrix
            Entry (d, s) is 1 when document d contains shingle s. Documents
            shorter than ``k`` have an empty row.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if self.shingling_result is not None:
            return self.shingling_result
        if k < 1:
            raise ValueError("k must be a positive integer.")

        rows, cols = [], []
        for doc_idx, document in enumerate(self.dataframe[self.content_column]):
            doc_shingles = set()
            # Keep the scan ordered: column indices are handed out on first
            # sight and the MinHash values are derived from those indices.
            for i in range(len(document) - k + 1):
                shingle = document[i:i + k]
                if shingle not in self.shingle_mapping:
                    self.shingle_mapping[shingle] = len(self.shingle_mapping)
                if shingle not in doc_shingles:
                    doc_shingles.add(shingle)
                    rows.append(doc_idx)
                    cols.append(self.shingle_mapping[shingle])
        data = [1] * len(rows)
        self.shingling_result = csr_matrix(
            (data, (rows, cols)),
            shape=(len(self.dataframe), len(self.shingle_mapping)),
        )
        return self.shingling_result

    def minhashing(self):
        """Compute (once) the MinHash signature of every document.

        Returns
        -------
        pandas.DataFrame
            Shape (documents, num_hash_functions), float64. A document without
            any shingle keeps ``inf`` in every column.
        """
        if self.signatures is not None:
            return self.signatures

        self.shingling()  # Ensure shingling is done first
        num_docs, num_shingles = self.shingling_result.shape
        signatures = np.full((num_docs, self.num_hash_functions), np.inf)

        # Walk the matrix column by column so each shingle is hashed exactly
        # once, then fold its hash vector into every document containing it.
        by_shingle = self.shingling_result.tocsc()
        for shingle_idx in range(num_shingles):
            col_start = by_shingle.indptr[shingle_idx]
            col_end = by_shingle.indptr[shingle_idx + 1]
            doc_rows = by_shingle.indices[col_start:col_end]
            if doc_rows.size == 0:
                continue
            # The 256-bit digests are rounded to float64, the dtype of the
            # signature matrix.
            hash_values = np.array(
                [float(hash_function(shingle_idx)) for hash_function in self.hash_functions]
            )
            signatures[doc_rows] = np.minimum(signatures[doc_rows], hash_values)

        self.signatures = pd.DataFrame(signatures)
        return self.signatures

    def locality_sensitivity_hashing(self, num_bands=20, rows_per_band=5):
        """Bucket documents by band of their signature.

        Parameters
        ----------
        num_bands : int, default 20
        rows_per_band : int, default 5
            ``num_bands * rows_per_band`` must equal the signature length.

        Returns
        -------
        dict
            ``bucket_id -> set of document row positions``. The banding used is
            remembered on the instance for ``approx_nearest_neighbors``.

        Raises
        ------
        AssertionError
            If the banding does not cover the signature exactly.
        """
        if self.signatures is None:
            self.minhashing()  # Ensure signatures are ready

        assert self.signatures.shape[1] == num_bands * rows_per_band, "Number of hash functions must equal num_bands * rows_per_band."
        # One bulk conversion instead of a pandas .iloc lookup per (band, row).
        signature_matrix = self.signatures.to_numpy()
        buckets = {}
        for band_index in range(num_bands):
            start_index = band_index * rows_per_band
            band = signature_matrix[:, start_index:start_index + rows_per_band]
            for row_index, band_values in enumerate(band.tolist()):
                bucket_id = self._bucket_id(band_index, band_values)
                buckets.setdefault(bucket_id, set()).add(row_index)
        self.buckets = buckets
        self.num_bands = num_bands
        self.rows_per_band = rows_per_band
        return self.buckets

    def jaccard_similarity(self, doc1_idx, doc2_idx):
        """Exact Jaccard similarity of two documents' shingle sets (0 if both are empty)."""
        doc1_shingles = self._doc_shingle_ids(doc1_idx)
        doc2_shingles = self._doc_shingle_ids(doc2_idx)
        intersection = len(doc1_shingles & doc2_shingles)
        union = len(doc1_shingles | doc2_shingles)
        return intersection / union if union else 0

    def jaccard_distance(self, doc1_idx, doc2_idx):
        """Exact Jaccard distance, ``1 - jaccard_similarity`` (1 if both sets are empty)."""
        return 1 - self.jaccard_similarity(doc1_idx, doc2_idx)

    def find_top_similar_documents(self, doc_idx, n):
        """Exhaustively rank all other documents by Jaccard similarity to ``doc_idx``.

        Returns
        -------
        pandas.DataFrame
            At most ``n`` rows, columns ``"Document Index"`` and
            ``"Jaccard Similarity"``, most similar first. The query document
            itself is excluded.
        """
        if self.shingling_result is None:
            self.shingling()
        num_docs = self.shingling_result.shape[0]
        similarities = [(other_idx, self.jaccard_similarity(doc_idx, other_idx)) for other_idx in range(num_docs) if other_idx != doc_idx]
        top_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:n]
        return pd.DataFrame(top_similarities, columns=["Document Index", "Jaccard Similarity"])

    def approx_nearest_neighbors(self, doc_idx, n):
        """Rank only the LSH candidates of ``doc_idx`` by exact Jaccard similarity.

        Candidates are the documents sharing at least one bucket with the
        query. The query's stored signature and the banding recorded by
        ``locality_sensitivity_hashing`` are reused, so the looked-up bucket
        ids are the very ones created at indexing time. If no buckets exist
        yet they are built with the default banding.

        Returns
        -------
        pandas.DataFrame
            At most ``n`` rows, columns ``"Document Index"`` and
            ``"Jaccard Similarity"``, most similar first. The query document
            itself is excluded, matching ``find_top_similar_documents``.
        """
        if self.buckets is None or self.num_bands is None or self.rows_per_band is None:
            self.locality_sensitivity_hashing()
        query_signature = self.signatures.iloc[doc_idx].to_numpy()

        candidates = set()
        for band_index in range(self.num_bands):
            start_index = band_index * self.rows_per_band
            band_values = query_signature[start_index:start_index + self.rows_per_band]
            bucket_id = self._bucket_id(band_index, band_values)
            candidates.update(self.buckets.get(bucket_id, ()))
        # The query always shares every bucket with itself.
        candidates.discard(doc_idx)

        similarities = [(other_idx, self.jaccard_similarity(doc_idx, other_idx)) for other_idx in candidates]
        top_similarities = sorted(similarities, key=lambda x: x[1], reverse=True)[:n]
        return pd.DataFrame(top_similarities, columns=["Document Index", "Jaccard Similarity"])

    def run(self):
        """Run shingling, MinHash and LSH with their default parameters."""
        self.shingling()
        self.minhashing()
        self.locality_sensitivity_hashing()
        print("Shingling, signature generation, and LSH have been completed.")

if __name__ == "__main__":
    # NOTE(review): each row below is a literal path string, so the shingles
    # are taken from the characters of the paths themselves, not from the text
    # stored in those files - TODO confirm whether the contents were meant to
    # be loaded here.
    df = pd.DataFrame({
        'contents': [
            "/content/934.txt.",
            "/content/2639.txt.",
        ]
    })

    lsh = MinHashLSHFromDataFrame(df, 'contents', num_hash_functions=100)
    lsh.run()

    # Both queries target the first document; the printed labels are derived
    # from this index so they cannot drift from what is actually queried.
    query_doc_idx = 0
    top_n = 3

    # Exact search: rank every other document by Jaccard similarity.
    top_similarities = lsh.find_top_similar_documents(query_doc_idx, top_n)
    print(f"Top similar documents to document {query_doc_idx} :")
    print(top_similarities)

    # Approximate search: rank only the documents sharing an LSH bucket.
    approx_neighbors = lsh.approx_nearest_neighbors(query_doc_idx, top_n)
    print(f"Approximate nearest neighbors for document {query_doc_idx} :")
    print(approx_neighbors)


Shingling, signature generation, and LSH have been completed.
Top similar documents to document 2 :
   Document Index  Jaccard Similarity
0               1             0.47619
Approximate nearest neighbors for document 3 :
Empty DataFrame
Columns: [Document Index, Jaccard Similarity]
Index: []


In [18]:
# Read the whole corpus file into memory, one list entry per line (newline kept).
with open("/content/WebOfScience-5736.txt", "r", encoding="utf-8") as file:
    file_content = list(file)

# Show the two lines of interest. Indices are 0-based list positions, and each
# line still carries its trailing newline, hence the blank line after each print.
# ("Dòng" is Vietnamese for "line".)
for label, line_idx in (("Dòng 934:", 934), ("Dòng 2639:", 2639)):
    print(label, file_content[line_idx])

Dòng 934: Endonuclease cleavage is the rate-limiting step in the decay of nonsense-containing human beta-globin mRNA in erythroid cells. The 5'-truncated intermediates thus generated are polyadenylated and more stable than the parent mRNA. Northern blotting is commonly used to measure the decay rate of full-length mRNA, and S1 nuclease protection is used to assay the fate of decay intermediates. We have adapted the more sensitive and facile MBRACE assay (Lasham et al., Nucleic Acids Res 38: e19, 2010) to quantitatively monitor the decay process by detecting full-length beta-globin and its decay intermediates.

Dòng 2639: Inferring beliefs and social emotions of others has different neural substrates and possibly different roles in the pathophysiology of different clinical phases of schizophrenia. The current study investigated the neural basis for inferring others' beliefs and social emotions, as individual concepts, in 17 subjects at ultra-high risk for psychosis (UHR), 16 patients wi