In [None]:
from google.colab import drive
import numpy as np
drive.mount('/content/drive')
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd

Mounted at /content/drive


Load data and preprocessing

In [None]:
import pandas as pd
import os

# Load the Policy Metadata file
csv_file = 'directory of policy meta data file'
# The lookup below reads `metadata`, so the CSV has to be loaded into a DataFrame first.
metadata = pd.read_csv(csv_file)

# Create a dictionary to map ID to state and action year.
# Keys are the IDs converted to strings; values are (state, action year) tuples.
# NOTE(review): assumes the 'ID' column holds whole numbers so that str() yields e.g. "11"
# rather than "11.0" - confirm against the metadata file.
id_to_state_and_year = dict(zip(metadata['ID'].astype(str), zip(metadata['State / Jurisdiction'], metadata['Action_Year'])))

# Function to match txt file names with state and action year
def get_state_and_year_for_file(file_name, id_to_state_and_year):
    """Look up the (state, action year) entry for a text-file name.

    Only the digit characters of ``file_name`` are used as the lookup key, so
    lettered variants such as "11a" and "11b" resolve to the entry for "11".

    Returns the value stored in ``id_to_state_and_year`` for that key, or
    ``None`` when the key is absent.
    """
    numeric_id = ''.join(filter(str.isdigit, file_name))
    return id_to_state_and_year.get(numeric_id)

text_files_directory = "Directory containing the text files"

# Collect one record per .txt file whose numeric ID is present in the metadata lookup;
# files without a match are left out of the table.
file_state_year_data = []
for file_name in os.listdir(text_files_directory):
    if not file_name.endswith('.txt'):
        continue
    state_year_info = get_state_and_year_for_file(file_name.split('.')[0], id_to_state_and_year)
    if not state_year_info:
        continue
    state, action_year = state_year_info
    file_state_year_data.append(
        {'File Name': file_name, 'State': state, 'Action Year': action_year}
    )

# Build the table and store the action year as an integer column
file_state_df = pd.DataFrame(file_state_year_data)
file_state_df['Action Year'] = file_state_df['Action Year'].astype("int")

print(file_state_df)


    File Name          State  Action Year
0       3.txt         Alaska         2016
1       4.txt         Alaska         2018
2       7.txt       Arkansas         2017
3       8.txt       Arkansas         2015
4      10.txt        Arizona         2021
..        ...            ...          ...
151   147.txt      Wisconsin            0
152   149.txt  West Virginia            0
153    62.txt       Michigan         1987
154    40.txt        Indiana            0
155    87.txt         Nevada            0

[156 rows x 3 columns]


In [None]:
# Preprocessing
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords
import string
nltk.download('stopwords')
class TextProcessor:
    """Clean a corpus of raw document strings step by step.

    The current state of the corpus lives in ``processed_corpus`` (one string
    per document, same order as the input). Every step replaces that list with
    a new one, so the list passed to the constructor is not modified.
    """

    def __init__(self, input_corpus: list):
        self.processed_corpus = input_corpus

    def remove_digits_and_punctuation(self):
        """Replace each run of digits and each non-word, non-space character with a space.

        Underscores are kept because ``\\w`` matches them.
        """
        regex_pattern = re.compile(r'[0-9]+|[^\w\s]')
        self.processed_corpus = [regex_pattern.sub(" ", row) for row in self.processed_corpus]

    def to_lowercase(self):
        """Lower-case every document."""
        self.processed_corpus = [row.lower() for row in self.processed_corpus]

    def remove_stop_words(self):
        """Drop English stop words, single letters and a few roman numerals.

        "against", "not" and "will" (plus several contractions) are taken out
        of the stop list so that they stay in the text.
        """
        # Customize the stopwords by condition
        # NOTE(review): process() strips punctuation before this step, so tokens
        # containing an apostrophe ("can't", "shouldn't", ...) never reach here and
        # excluding them from the stop list has no effect. Their fragments
        # ("shouldn", "t", ...) are presumably still removed by the NLTK list and
        # the single-letter rule below - confirm whether negations should be kept.
        stops = set(stopwords.words('english')) - {
            "shan't", "couldn't", "against", "shouldn't", "can't",
            "needn't", "should've", "not", "mustn't", "will"
        }
        stops.update(string.ascii_lowercase)
        stops.update(['ii', 'iii', 'iv'])
        self.processed_corpus = [
            " ".join([token for token in row.split() if token not in stops])
            for row in self.processed_corpus
        ]

    def remove_common_words(self, n_common: int = 10):
        """Remove the ``n_common`` most frequent tokens of the whole corpus.

        Frequencies are counted over all documents together. The default of 10
        keeps the previous hard-coded behaviour.
        """
        counter = Counter(" ".join(self.processed_corpus).split())
        most_common = {word for word, _ in counter.most_common(n_common)}
        self.processed_corpus = [
            " ".join(token for token in row.split() if token not in most_common)
            for row in self.processed_corpus
        ]

    def process(self, n_common: int = 10):
        """Run all cleaning steps in order and return the processed corpus.

        Order: strip digits/punctuation, lower-case, remove stop words, remove
        the ``n_common`` most frequent remaining tokens. Calling this again
        re-applies every step to the already processed corpus.
        """
        self.remove_digits_and_punctuation()
        self.to_lowercase()
        self.remove_stop_words()
        self.remove_common_words(n_common)
        return self.processed_corpus

def read_text_files(directory):
    """Read every ``.txt`` file in ``directory`` into a dict keyed by file name.

    Some of the data contains Latin and other characters that cannot be decoded
    as UTF-8, so a file that fails UTF-8 decoding is read again as ISO-8859-1.
    If that second attempt raises, the failure is printed and the file is
    skipped.

    Returns a dict mapping file name -> file text, in ``os.listdir`` order.
    """
    texts_by_name = {}
    txt_names = [entry for entry in os.listdir(directory) if entry.endswith(".txt")]
    for filename in txt_names:
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except UnicodeDecodeError:
            # Fall back to the more permissive single-byte encoding
            try:
                with open(file_path, 'r', encoding='ISO-8859-1') as file:
                    text = file.read()
            except Exception as e:
                print(f"Failed to read {filename}: {e}")
                continue
        texts_by_name[filename] = text
    return texts_by_name
# Load and preprocess the text files
# file_contents maps file name -> raw text; preprocessed_texts holds the cleaned
# documents in the same order as file_contents.values(). The model functions
# below rely on that order when they index list(file_contents.keys()).
file_directory = "Directory containing the text files"
file_contents = read_text_files(file_directory)
text_processor = TextProcessor(list(file_contents.values()))
preprocessed_texts = text_processor.process()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Lookup tables keyed by text-file name, built from file_state_df.
# file_to_state and file_to_state_map hold the same name -> state mapping as two
# separate dicts; file_to_year_map holds name -> action year.
file_to_state = {
    name: state
    for name, state in zip(file_state_df['File Name'], file_state_df['State'])
}
file_to_year_map = {
    name: year
    for name, year in zip(file_state_df['File Name'], file_state_df['Action Year'])
}
file_to_state_map = {
    name: state
    for name, state in zip(file_state_df['File Name'], file_state_df['State'])
}

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m204.8/227.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
!pip install gensim



In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227148 sha256=4a67a5dbf78453709422136df93f4b7741e814c503e2f3a86910b8ff82473453
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.12.0


In [None]:
import fasttext
import fasttext.util
# Fetch the pre-trained English fastText vectors. The recorded output shows the call
# returning 'cc.en.300.bin', the file that model_4 later loads.
# NOTE(review): if_exists='ignore' presumably skips the download when the file is
# already present - confirm against the fasttext.util documentation.
fasttext.util.download_model('en', if_exists='ignore')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz



'cc.en.300.bin'

In [None]:
# Download GloVe embeddings (300-dimensional vectors)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d glove.6B


In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
# Convert the GloVe 300d text vectors (unzipped by the previous cell) into a
# word2vec-format text file.
# A later cell assigns glove_input_file and word2vec_output_file again (under
# /content/glove.6B) and repeats the conversion, so those later values are the
# ones in effect when model_6 reads word2vec_output_file.
glove_input_file = 'glove.6B/glove.6B.300d.txt'
word2vec_output_file = 'glove.6B.300d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400001, 300)

In [None]:
import zipfile
from gensim.scripts.glove2word2vec import glove2word2vec

# Extract the GloVe archive kept on the mounted Google Drive into /content/glove.6B,
# then convert the 300d vectors to word2vec text format.
# word2vec_output_file is read later by model_6 through
# KeyedVectors.load_word2vec_format.
glove_zip_path = '/content/drive/MyDrive/Policy_comparison/glove.6B.zip'
glove_folder = '/content/glove.6B'
# Unpacks every member of the archive, not only the 300d file used below
with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
    zip_ref.extractall(glove_folder)
glove_input_file = os.path.join(glove_folder, 'glove.6B.300d.txt')
word2vec_output_file = os.path.join(glove_folder, 'glove.6B.300d.word2vec.txt')
glove2word2vec(glove_input_file, word2vec_output_file)

Model

In [None]:
from memory_profiler import memory_usage
from collections import defaultdict
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
from gensim.models import LdaMulticore
from gensim.models import KeyedVectors
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
import fasttext
import time
start_time = time.time()

def rank_cross_state_pairs(similarity_matrix, n=10, top=True, doc_names=None, state_lookup=None):
    """Rank document pairs that belong to different states by similarity.

    Parameters
    ----------
    similarity_matrix : 2-D array indexed as ``similarity_matrix[i, j]``
        Pairwise similarity between documents.
    n : int
        Number of pairs to return.
    top : bool
        ``True`` returns the most similar pairs (descending score), ``False``
        the least similar pairs (ascending score).
    doc_names : list of str, optional
        File name of each document, in matrix order. Defaults to
        ``list(file_contents.keys())``.
    state_lookup : dict, optional
        Maps file name -> state. Defaults to ``file_to_state``.

    Returns
    -------
    list of ``((i, j), score)`` with ``i < j``, at most ``n`` entries. Ties keep
    the order in which the pairs are enumerated (row by row).

    Raises
    ------
    KeyError
        If a document name is missing from ``state_lookup``.
    """
    if doc_names is None:
        doc_names = list(file_contents.keys())
    if state_lookup is None:
        state_lookup = file_to_state

    total_docs = len(similarity_matrix)
    # Resolve each document's state once; the key list is no longer rebuilt for every pair.
    doc_states = [state_lookup[doc_names[index]] for index in range(total_docs)]

    scored_pairs = [
        ((i, j), similarity_matrix[i, j])
        for i in range(total_docs)
        for j in range(i + 1, total_docs)
        if doc_states[i] != doc_states[j]
    ]
    scored_pairs.sort(key=lambda item: item[1], reverse=top)
    return scored_pairs[:n]


def _tfidf_nmf_similarity(preprocessed_texts, ngram_range):
    """Cosine similarity of 10-component NMF features fitted on a TF-IDF matrix."""
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=ngram_range, lowercase=False, stop_words=None, max_df=0.95, min_df=2
    )
    tfidf = tfidf_vectorizer.fit_transform(preprocessed_texts)
    nmf_features = NMF(n_components=10, random_state=42).fit_transform(tfidf)
    return cosine_similarity(nmf_features)


def _pca_cosine_similarity(embeddings):
    """Cosine similarity of the embeddings after reduction to 10 principal components."""
    reduced_embeddings = PCA(n_components=10).fit_transform(embeddings)
    return cosine_similarity(reduced_embeddings)


def model_1(preprocessed_texts, n=10, top=True):
    """Unigram TF-IDF -> NMF -> cosine similarity; returns the ``n`` ranked cross-state pairs."""
    similarity_matrix = _tfidf_nmf_similarity(preprocessed_texts, ngram_range=(1, 1))
    return rank_cross_state_pairs(similarity_matrix, n, top)


def model_2(preprocessed_texts, n=10, top=True):
    """Unigram + bigram TF-IDF -> NMF -> cosine similarity; returns the ``n`` ranked cross-state pairs."""
    similarity_matrix = _tfidf_nmf_similarity(preprocessed_texts, ngram_range=(1, 2))
    return rank_cross_state_pairs(similarity_matrix, n, top)


def model_3(preprocessed_texts, n=10, top=True):
    """SentenceTransformer 'all-MiniLM-L6-v2' embeddings -> PCA -> cosine similarity."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(preprocessed_texts, convert_to_tensor=False)
    return rank_cross_state_pairs(_pca_cosine_similarity(embeddings), n, top)


def model_4(preprocessed_texts, n=10, top=True):
    """fastText ('cc.en.300.bin') sentence vectors -> cosine similarity."""
    model = fasttext.load_model('cc.en.300.bin')
    doc_embeddings = np.zeros((len(preprocessed_texts), 300))
    for i, doc in enumerate(preprocessed_texts):
        # Collapse all whitespace, including newlines, to single spaces before embedding.
        # NOTE(review): presumably required because get_sentence_vector rejects
        # newline characters - confirm.
        doc_embeddings[i] = model.get_sentence_vector(' '.join(doc.split()))
    return rank_cross_state_pairs(cosine_similarity(doc_embeddings), n, top)


def model_5(preprocessed_texts, n=10, top=True):
    """Mean-pooled 'bert-base-uncased' embeddings -> cosine similarity.

    Each document is truncated to 512 tokens, so only its beginning is embedded.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    def get_bert_embedding(text, tokenizer, model):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding=True)
        # Inference only: skip building the autograd graph to save memory.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    bert_embeddings = [get_bert_embedding(doc, tokenizer, model) for doc in preprocessed_texts]
    return rank_cross_state_pairs(cosine_similarity(bert_embeddings), n, top)


def model_6(preprocessed_texts, n=10, top=True):
    """Averaged GloVe word vectors -> PCA -> cosine similarity.

    Vectors are loaded from ``word2vec_output_file``. Tokens missing from the
    vocabulary are skipped; a document with no known token keeps an all-zero
    vector.
    """
    glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
    doc_embeddings = np.zeros((len(preprocessed_texts), 300))
    for i, doc in enumerate(preprocessed_texts):
        embeddings = [glove_model[word] for word in doc.split() if word in glove_model]
        if embeddings:
            doc_embeddings[i] = np.mean(embeddings, axis=0)
    return rank_cross_state_pairs(_pca_cosine_similarity(doc_embeddings), n, top)

  from tqdm.autonotebook import tqdm, trange


Top Similar

In [None]:
def ensemble_models(preprocessed_texts, file_to_state, n, top=True, model_functions=None, file_names=None):
    """Collect the document pairs that at least two models agree on.

    Every model is asked for its ``n`` ranked pairs; a pair gets one vote per
    model that returns it.

    Parameters
    ----------
    preprocessed_texts : list of str
        Cleaned documents passed to each model.
    file_to_state : dict
        Kept for backward compatibility; the models read the state lookup
        themselves and this argument is not used here.
    n : int
        Number of pairs requested from each model.
    top : bool
        Forwarded to every model: ``True`` asks for the most similar pairs,
        ``False`` for the least similar ones.
    model_functions : list of callables, optional
        Each is called as ``f(preprocessed_texts, n, top)`` and must return
        ``((i, j), score)`` items. Defaults to ``model_1`` ... ``model_6``.
    file_names : list of str, optional
        File name for each document index. Defaults to
        ``list(file_contents.keys())``, the same order the models index by.

    Returns
    -------
    list of ``(pair_of_file_names, vote_count, model_names)`` for every pair
    with at least two votes, in order of first appearance.
    """
    if model_functions is None:
        model_functions = [model_1, model_2, model_3, model_4, model_5, model_6]
    if file_names is None:
        # Use the order the documents were read in, so index i means the same
        # file here as it does inside the models.
        file_names = list(file_contents.keys())

    votes = defaultdict(list)
    for position, model_func in enumerate(model_functions, start=1):
        model_name = f'Model {position}'
        # `top` must be forwarded; otherwise every model falls back to its default ranking.
        for (i, j), _ in model_func(preprocessed_texts, n, top):
            votes[(file_names[i], file_names[j])].append(model_name)

    return [
        (pair, len(voters), voters)
        for pair, voters in votes.items()
        if len(voters) >= 2
    ]

# Run the ensemble with n=30 per model, then list the pairs of policy files that
# received at least two votes, most-voted first, together with the models that
# voted for each pair.
top_pairs_with_counts = ensemble_models(preprocessed_texts, file_to_state, 30, top=True)
top_pairs_with_counts_sorted = list(top_pairs_with_counts)
top_pairs_with_counts_sorted.sort(key=lambda entry: entry[1], reverse=True)

for pair, count, models in top_pairs_with_counts_sorted:
    models_str = ', '.join(models)
    print(f"{pair} occur {count} times: {models_str}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

('45.txt', '40.txt') occur 5 times: Model 1, Model 3, Model 4, Model 5, Model 6
('2.txt', '124.txt') occur 4 times: Model 1, Model 2, Model 4, Model 6
('93.txt', '145.txt') occur 4 times: Model 3, Model 4, Model 5, Model 6
('10.txt', '129.txt') occur 3 times: Model 3, Model 4, Model 6
('98.txt', '67.txt') occur 3 times: Model 3, Model 4, Model 5
('72.txt', '98.txt') occur 3 times: Model 3, Model 5, Model 6
('98.txt', '66.txt') occur 3 times: Model 4, Model 5, Model 6
('16.txt', '72.txt') occur 3 times: Model 4, Model 5, Model 6
('17.txt', '72.txt') occur 3 times: Model 4, Model 5, Model 6
('8.txt', '25a.txt') occur 2 times: Model 1, Model 2
('94.txt', '142.txt') occur 2 times: Model 1, Model 2
('8.txt', '25b.txt') occur 2 times: Model 1, Model 2
('2.txt', '40.txt') occur 2 times: Model 1, Model 2
('25.txt', '106.txt') occur 2 times: Model 1, Model 2
('124.txt', '40.txt') occur 2 times: Model 1, Model 2
('8.txt', '25.txt') occur 2 times: Model 1, Model 2
('8.txt', '106.txt') occur 2 tim

Bottom Similar

In [None]:
# Same ensemble vote, requested with n=50 per model and top=False; the pairs are
# printed in ascending order of vote count.
bottom_pairs_with_counts = ensemble_models(preprocessed_texts, file_to_state, 50, top=False)
bottom_pairs_with_counts_sorted = list(bottom_pairs_with_counts)
bottom_pairs_with_counts_sorted.sort(key=lambda entry: entry[1], reverse=False)

for pair, count, models in bottom_pairs_with_counts_sorted:
    models_str = ', '.join(models)
    print(f"{pair} occur {count} times: {models_str}")



('3.txt', '136.txt') occur 2 times: Model 1, Model 2
('3.txt', '139.txt') occur 2 times: Model 1, Model 2
('4.txt', '45.txt') occur 2 times: Model 1, Model 2
('4.txt', '68.txt') occur 2 times: Model 1, Model 2
('4.txt', '136.txt') occur 2 times: Model 1, Model 2
('4.txt', '139.txt') occur 2 times: Model 1, Model 2
('7.txt', '16.txt') occur 2 times: Model 1, Model 2
('7.txt', '17.txt') occur 2 times: Model 1, Model 2
('8.txt', '20.txt') occur 2 times: Model 1, Model 2
('8.txt', '60.txt') occur 2 times: Model 1, Model 2
('8.txt', '64.txt') occur 2 times: Model 1, Model 2
('8.txt', '70.txt') occur 2 times: Model 1, Model 2
('8.txt', '75.txt') occur 2 times: Model 1, Model 2
('8.txt', '128.txt') occur 2 times: Model 1, Model 2
('8.txt', '142.txt') occur 2 times: Model 1, Model 2
('8.txt', '144.txt') occur 2 times: Model 1, Model 2
('8.txt', '63.txt') occur 2 times: Model 1, Model 2
('10.txt', '16.txt') occur 2 times: Model 1, Model 6
('10.txt', '17.txt') occur 2 times: Model 1, Model 6
('1