In [1]:
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import glob
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from sklearn.cluster import KMeans
from transformers import BertTokenizer, BertModel
import torch
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
# Read txt files from directory
def custom_sort(file_name):
    base = re.search(r'(\d+)([a-zA-Z]*)', file_name.split('/')[-1].split('.txt')[0])
    if base:
        return int(base.group(1)), base.group(2)
    else:
        return 0, ''

txt_files = glob.glob("/Users/selenading/Downloads/Policy Texts/*.txt")
documents = []

for txt_file in sorted(txt_files, key=custom_sort):
    try:
        with open(txt_file, 'r', encoding='utf-8') as f:
            documents.append(f.read())
    except UnicodeDecodeError:
        with open(txt_file, 'r', encoding='ISO-8859-1') as f:
            documents.append(f.read())

In [None]:
# Preprocess using Shaine's code
import re
from nltk.corpus import stopwords
import string
from collections import Counter

class TextProcessor:
    def __init__(self, input_corpus: list):
        self.processed_corpus = input_corpus

    def remove_digits_and_punctuation(self):
        """Removes digits and punctuation"""
        regex_pattern = re.compile(r'[0-9]+|[^\w\s]')
        self.processed_corpus = [regex_pattern.sub(" ", row) for row in self.processed_corpus]

    def to_lowercase(self):
        """Converts text to lowercase"""
        self.processed_corpus = [row.lower() for row in self.processed_corpus]

    def remove_stop_words(self):
        """Removes stop words"""
        stops = set(stopwords.words('english')) - {
            "shan't", "couldn't", "against", "shouldn't", "can't",
            "needn't", "should've", "not", "mustn't", "will"
        }
        
        stops.update(string.ascii_lowercase)
        stops.update(['ii', 'iii', 'iv'])

        self.processed_corpus = [
            " ".join([token for token in row.split() if token not in stops])
            for row in self.processed_corpus
        ]

    def remove_common_words(self):
        """Removes the 20 most common words from the corpus"""
        counter = Counter(" ".join(self.processed_corpus).split())
        most_common = set(word for word, count in counter.most_common(20))
        
        self.processed_corpus = [
            " ".join(token for token in row.split() if token not in most_common)
            for row in self.processed_corpus
        ]

    def process(self):
        """Apply all processing steps"""
        self.remove_digits_and_punctuation()
        self.to_lowercase()
        self.remove_stop_words()
        self.remove_common_words()
        return self.processed_corpus

# Usage
processor = TextProcessor(documents)
preprocessed_documents = processor.process()

In [13]:
# SBert
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import glob
import re

def custom_sort(file_name):
    base = re.search(r'(\d+)([a-zA-Z]*)', file_name.split('/')[-1].split('.txt')[0])
    if base:
        return int(base.group(1)), base.group(2)
    else:
        return 0, ''

txt_files = glob.glob("/Users/selenading/Downloads/Policy Texts/*.txt")
txt_files_sorted = sorted(txt_files, key=custom_sort)

# Initialize the SBERT model
model = SentenceTransformer('all-mpnet-base-v2')

embeddings = model.encode(preprocessed_documents)

# Calculate the cosine similarities
cosine_similarities = cosine_similarity(embeddings)

similarity_threshold = 0.8 

similar_pairs = []
for i in range(len(cosine_similarities)):
    for j in range(i + 1, len(cosine_similarities)):
        if cosine_similarities[i, j] > similarity_threshold:
            similar_pairs.append((txt_files_sorted[i], txt_files_sorted[j], cosine_similarities[i, j]))

# Sort pairs by similarity score
similar_pairs_sorted = sorted(similar_pairs, key=lambda x: x[2], reverse=True)

for pair in similar_pairs_sorted:
    print(f"File '{pair[0]}' and File '{pair[1]}' have a similarity score of {pair[2]:.2f}")


File '/Users/selenading/Downloads/Policy Texts/93.txt' and File '/Users/selenading/Downloads/Policy Texts/145.txt' have a similarity score of 0.92
File '/Users/selenading/Downloads/Policy Texts/61.txt' and File '/Users/selenading/Downloads/Policy Texts/68.txt' have a similarity score of 0.91
File '/Users/selenading/Downloads/Policy Texts/85.txt' and File '/Users/selenading/Downloads/Policy Texts/101.txt' have a similarity score of 0.91
File '/Users/selenading/Downloads/Policy Texts/72.txt' and File '/Users/selenading/Downloads/Policy Texts/98.txt' have a similarity score of 0.91
File '/Users/selenading/Downloads/Policy Texts/13.txt' and File '/Users/selenading/Downloads/Policy Texts/14.txt' have a similarity score of 0.90
File '/Users/selenading/Downloads/Policy Texts/16.txt' and File '/Users/selenading/Downloads/Policy Texts/101.txt' have a similarity score of 0.90
File '/Users/selenading/Downloads/Policy Texts/68.txt' and File '/Users/selenading/Downloads/Policy Texts/98.txt' have a 

In [14]:
similarity_threshold = 0.9 

similar_pairs = []
for i in range(len(cosine_similarities)):
    for j in range(i + 1, len(cosine_similarities)):
        if cosine_similarities[i, j] > similarity_threshold:
            similar_pairs.append((txt_files_sorted[i], txt_files_sorted[j], cosine_similarities[i, j]))

similar_pairs_sorted = sorted(similar_pairs, key=lambda x: x[2], reverse=True)

for pair in similar_pairs_sorted:
    print(f"File '{pair[0]}' and File '{pair[1]}' have a similarity score of {pair[2]:.2f}")


File '/Users/selenading/Downloads/Policy Texts/93.txt' and File '/Users/selenading/Downloads/Policy Texts/145.txt' have a similarity score of 0.92
File '/Users/selenading/Downloads/Policy Texts/61.txt' and File '/Users/selenading/Downloads/Policy Texts/68.txt' have a similarity score of 0.91
File '/Users/selenading/Downloads/Policy Texts/85.txt' and File '/Users/selenading/Downloads/Policy Texts/101.txt' have a similarity score of 0.91
File '/Users/selenading/Downloads/Policy Texts/72.txt' and File '/Users/selenading/Downloads/Policy Texts/98.txt' have a similarity score of 0.91
File '/Users/selenading/Downloads/Policy Texts/13.txt' and File '/Users/selenading/Downloads/Policy Texts/14.txt' have a similarity score of 0.90
File '/Users/selenading/Downloads/Policy Texts/16.txt' and File '/Users/selenading/Downloads/Policy Texts/101.txt' have a similarity score of 0.90


In [None]:
# Longformer
import glob
import re
import string
import torch
import numpy as np
from nltk.corpus import stopwords
from transformers import LongformerModel, LongformerTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter


# Initialize the Longformer tokenizer and model
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

model = LongformerModel.from_pretrained('allenai/longformer-base-4096').to('cpu')

inputs = tokenizer(preprocessed_documents, padding=True, truncation=True, return_tensors="pt", max_length=4096)
inputs = inputs.to('cpu')
with torch.no_grad():
    outputs = model(**inputs)
    mask = inputs.attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).float()
    masked_embeddings = outputs.last_hidden_state * mask
    max_pooled_embeddings = torch.max(masked_embeddings, dim=1).values.cpu().numpy()  # Max pooling operation

# Calculate the cosine similarities
cosine_similarities = cosine_similarity(max_pooled_embeddings)
similarity_threshold = 0.9

similar_pairs = []
for i in range(len(cosine_similarities)):
    for j in range(i + 1, len(cosine_similarities)):
        if cosine_similarities[i, j] > similarity_threshold:
            similar_pairs.append((file_paths[i], file_paths[j], cosine_similarities[i, j]))

similar_pairs_sorted = sorted(similar_pairs, key=lambda x: x[2], reverse=True)

for pair in similar_pairs_sorted:
    print(f"File '{pair[0]}' and File '{pair[1]}' have a similarity score of {pair[2]:.2f}")



In [27]:
# Threshold for similarity
similarity_threshold = 0.988

# Find pairs above the threshold
similar_pairs = []
for i in range(len(cosine_similarities)):
    for j in range(i + 1, len(cosine_similarities)):
        if cosine_similarities[i, j] > similarity_threshold:
            similar_pairs.append((file_paths[i], file_paths[j], cosine_similarities[i, j]))

# Sort pairs by similarity
similar_pairs_sorted = sorted(similar_pairs, key=lambda x: x[2], reverse=True)

# Print out the similar pairs with their similarity score
for pair in similar_pairs_sorted:
    print(f"File '{pair[0]}' and File '{pair[1]}' have a similarity score of {pair[2]:.2f}")

File '/Users/selenading/Downloads/Policy Texts/20.txt' and File '/Users/selenading/Downloads/Policy Texts/31.txt' have a similarity score of 0.99
File '/Users/selenading/Downloads/Policy Texts/40.txt' and File '/Users/selenading/Downloads/Policy Texts/45.txt' have a similarity score of 0.99
File '/Users/selenading/Downloads/Policy Texts/16.txt' and File '/Users/selenading/Downloads/Policy Texts/100.txt' have a similarity score of 0.99
File '/Users/selenading/Downloads/Policy Texts/31.txt' and File '/Users/selenading/Downloads/Policy Texts/112.txt' have a similarity score of 0.99
File '/Users/selenading/Downloads/Policy Texts/13.txt' and File '/Users/selenading/Downloads/Policy Texts/112.txt' have a similarity score of 0.99
File '/Users/selenading/Downloads/Policy Texts/13.txt' and File '/Users/selenading/Downloads/Policy Texts/31.txt' have a similarity score of 0.99


In [17]:
# Bert with mean pooling
import os
import glob
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to read and preprocess text, removing non-UTF-8 characters
def read_and_preprocess_text(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        lines = file.readlines()
        # Remove non-UTF-8 characters and join the lines
        cleaned_lines = [line.encode('utf-8', 'ignore').decode('utf-8', 'ignore') for line in lines]
        text = ' '.join(cleaned_lines)
    return text

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"  # You can change this to a different BERT model
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the directory containing your .txt files
text_dir = "/Users/selenading/Downloads/Policy Texts"

# Read and preprocess the documents
txt_files = glob.glob(os.path.join(text_dir, "*.txt"))
documents = []

for txt_file in txt_files:
    document = read_and_preprocess_text(txt_file)
    documents.append(document)

# Tokenize and encode the documents using BERT tokenizer
encoded_documents = tokenizer(documents, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Get embeddings for your documents using hidden states and mean pooling
with torch.no_grad():
    outputs = model(**encoded_documents)
    hidden_states = outputs.last_hidden_state
    document_embeddings = hidden_states.mean(dim=1)

# Calculate cosine similarity between document pairs
similarity_matrix = cosine_similarity(document_embeddings, document_embeddings)

# Set a similarity threshold (adjust as needed)
threshold = 0.98

# Identify similar document pairs
similar_pairs = []

for i in range(len(txt_files)):
    for j in range(i + 1, len(txt_files)):
        if similarity_matrix[i, j] > threshold:
            similar_pairs.append((txt_files[i], txt_files[j]))

# Print similar document pairs
for pair in similar_pairs:
    print(f"Similar Pair: {pair[0]} and {pair[1]}")


Similar Pair: /Users/selenading/Downloads/Policy Texts/98.txt and /Users/selenading/Downloads/Policy Texts/72.txt


In [25]:
threshold = 0.97

# Identify similar document pairs
similar_pairs = []

for i in range(len(txt_files)):
    for j in range(i + 1, len(txt_files)):
        if similarity_matrix[i, j] > threshold:
            similar_pairs.append((txt_files[i], txt_files[j]))

# Print similar document pairs
for pair in similar_pairs:
    print(f"Similar Pair: {pair[0]} and {pair[1]}")

Similar Pair: /Users/selenading/Downloads/Policy Texts/16.txt and /Users/selenading/Downloads/Policy Texts/98.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/16.txt and /Users/selenading/Downloads/Policy Texts/72.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/63.txt and /Users/selenading/Downloads/Policy Texts/70.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/98.txt and /Users/selenading/Downloads/Policy Texts/72.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/79.txt and /Users/selenading/Downloads/Policy Texts/78.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/36.txt and /Users/selenading/Downloads/Policy Texts/32.txt


In [32]:
# Bert with max pooling
import os
import glob
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to read and preprocess text, removing non-UTF-8 characters
def read_and_preprocess_text(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        lines = file.readlines()
        # Remove non-UTF-8 characters and join the lines
        cleaned_lines = [line.encode('utf-8', 'ignore').decode('utf-8', 'ignore') for line in lines]
        text = ' '.join(cleaned_lines)
    return text

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"  # You can change this to a different BERT model
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define the directory containing your .txt files
text_dir = "/Users/selenading/Downloads/Policy Texts"

# Read and preprocess the documents
txt_files = glob.glob(os.path.join(text_dir, "*.txt"))
documents = []

for txt_file in txt_files:
    document = read_and_preprocess_text(txt_file)
    documents.append(document)

# Tokenize and encode the documents using BERT tokenizer
encoded_documents = tokenizer(documents, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Get embeddings for your documents using hidden states and max pooling
with torch.no_grad():
    outputs = model(**encoded_documents)
    hidden_states = outputs.last_hidden_state
    document_embeddings = hidden_states.max(dim=1).values  # Use max pooling here

# Calculate cosine similarity between document pairs
similarity_matrix = cosine_similarity(document_embeddings, document_embeddings)

# Set a similarity threshold (adjust as needed)
threshold = 0.98

# Identify similar document pairs
similar_pairs = []

for i in range(len(txt_files)):
    for j in range(i + 1, len(txt_files)):
        if similarity_matrix[i, j] > threshold:
            similar_pairs.append((txt_files[i], txt_files[j]))

# Print similar document pairs
for pair in similar_pairs:
    print(f"Similar Pair: {pair[0]} and {pair[1]}")


Similar Pair: /Users/selenading/Downloads/Policy Texts/29.txt and /Users/selenading/Downloads/Policy Texts/101.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/29.txt and /Users/selenading/Downloads/Policy Texts/16.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/29.txt and /Users/selenading/Downloads/Policy Texts/72.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/29.txt and /Users/selenading/Downloads/Policy Texts/69.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/29.txt and /Users/selenading/Downloads/Policy Texts/53.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/29.txt and /Users/selenading/Downloads/Policy Texts/36.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/29.txt and /Users/selenading/Downloads/Policy Texts/26.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/15.txt and /Users/selenading/Downloads/Policy Texts/53.txt
Similar Pair: /Users/selenading/Downloads/Policy Texts/114.txt and /Users/selenading/Do

In [41]:
# Set a similarity threshold (adjust as needed)
threshold = 0.99

# Identify similar document pairs
similar_pairs = []

for i in range(len(txt_files)):
    for j in range(i + 1, len(txt_files)):
        if similarity_matrix[i, j] > threshold:
            similar_pairs.append((txt_files[i], txt_files[j]))

# Print similar document pairs
for pair in similar_pairs:
    print(f"Similar Pair: {pair[0]} and {pair[1]}")

Similar Pair: /Users/selenading/Downloads/Policy Texts/79.txt and /Users/selenading/Downloads/Policy Texts/78.txt
