# Finding Similar Items: Textually Similar Documents
### Authors: Eva Engel, Tori Leatherman
### November 14, 2022

### Import libraries and project classes

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import time

import sys
sys.path.insert(0,"..")
from classes import shingling, compare_sets, min_hashing, compare_signatures, lsh
from preprocessor import Preprocessor
from dataset_reader import read_dataset

### Initialize parameters and pipeline objects

In [37]:
# Tunable parameters for the similarity pipeline. Each value is handed to one
# of the project helpers below; their exact meaning lives in `classes` and
# `dataset_reader`, so the notes here are assumptions to confirm.
# Passed to shingling(); presumably the shingle length k -- TODO confirm.
k_shingles = 10
# Passed to min_hashing(); presumably the number of hash values per signature -- TODO confirm.
n_signature = 500
# Passed to read_dataset() as its second argument when the data is loaded.
n_documents = 100
# Passed to lsh() as its second argument; presumably the similarity cut-off for reported pairs -- TODO confirm.
sim_threshold = 0.8
# Passed to lsh() as its first argument; presumably the number of LSH bands.
# NOTE(review): with 500 signature values and 100 bands this would give 5 rows
# per band -- verify that lsh() expects n_signature to be divisible by n_bands.
n_bands = 100

# Instantiate the pipeline objects once so that later cells can reuse them.
preprocessor = Preprocessor()
Shingling = shingling(k_shingles)
Min_hashing = min_hashing(n_signature)
LSH = lsh(n_bands, sim_threshold)

### Read and preprocess data

In [38]:
# Read the articles from the zip archive, then run them through the preprocessor.
# NOTE(review): 'tech_articles.zip' is a relative path -- confirm which directory
# read_dataset() resolves it against. n_documents is presumably a cap on how many
# documents are read; verify in dataset_reader.
docs = read_dataset('tech_articles.zip', n_documents)
# The preprocessed result is the input to the shingling step further down.
preprocessed_docs = preprocessor.preprocess_documents(docs)


['news_0000001.json', 'news_0000002.json', 'news_0000003.json', 'news_0000004.json', 'news_0000005.json', 'news_0000006.json', 'news_0000007.json', 'news_0000008.json', 'news_0000009.json', 'news_0000010.json', 'news_0000011.json', 'news_0000012.json', 'news_0000013.json', 'news_0000014.json', 'news_0000015.json', 'news_0000016.json', 'news_0000017.json', 'news_0000018.json', 'news_0000019.json', 'news_0000020.json', 'news_0000021.json', 'news_0000022.json', 'news_0000023.json', 'news_0000024.json', 'news_0000025.json', 'news_0000026.json', 'news_0000027.json', 'news_0000028.json', 'news_0000029.json', 'news_0000030.json', 'news_0000031.json', 'news_0000032.json', 'news_0000033.json', 'news_0000034.json', 'news_0000035.json', 'news_0000036.json', 'news_0000037.json', 'news_0000038.json', 'news_0000039.json', 'news_0000040.json', 'news_0000041.json', 'news_0000042.json', 'news_0000043.json', 'news_0000044.json', 'news_0000045.json', 'news_0000046.json', 'news_0000047.json', 'news_000004

### Use shingling, min hashing, and LSH to find similar documents

In [42]:
# Three-stage pipeline; each stage consumes the previous stage's result, so the
# order of these calls must not change.
# 1) Shingle the preprocessed documents into a matrix
#    (presumably a shingle-by-document characteristic matrix -- TODO confirm).
character_matrix = Shingling.create_char_matrix(preprocessed_docs)
# 2) Compute the min-hash signature from that matrix.
signature = Min_hashing.compute_signature_hash(character_matrix)
# 3) Run LSH over the signature to obtain the similar-document pairs.
similar_documents = LSH.find_similar(signature)

In [43]:
# Show the pairs reported by LSH. In the captured output each entry is a 2-tuple
# of ints, presumably positions of the two documents in the loaded dataset -- confirm.
print(similar_documents)

[(46, 75), (46, 78), (75, 78), (92, 94)]
