In [9]:
import pandas as pd
import numpy as np
import re
from simhash import Simhash, SimhashIndex
from itertools import combinations_with_replacement
from collections import defaultdict
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Prepare the Data

In [2]:
data_ads = pd.read_csv('kaggle_text_classified_ads.csv').reset_index().iloc[:1000]
data_ads = data_ads[['index', 'value']]

In [3]:
def get_features(s):
    """
    Returns list of substrings of a given width.  Example: 'how are' -> ['how', 'owa', 'war', 'are']
    """
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i + width] for i in range(max(len(s) - width + 1, 1))]

In [5]:
# view the hash value
Simhash(get_features('How are you? I am fine. Thanks.')).value

5570291454580194887

In [7]:
# calculate hash distance
Simhash(get_features('How are you? We are fine. Thanks.')).distance(Simhash(get_features('How are you? I am fine. Thanks.')))

15

# Hash the Documents

In [81]:
data_ads_dict = data_ads['value'].to_dict()

# hash the documents
data_ads_objs = [(str(k), Simhash(get_features(v))) for k, v in data_ads_dict.items()]

# create an index for efficient searching
distance_threshold = 3  # hashes > this value will not be considered similar
index = SimhashIndex(data_ads_objs, k=distance_threshold)

assert(len(data_ads_objs) == len(data_ads))

In [79]:
# example of finding similar texts for 1 provided text
similar_docs = index.get_near_dups(data_ads_objs[0][1])
similar_docs_text = [data_ads_dict[int(k)] for k in similar_docs]
print(similar_docs_text)

['Overview \r\n \r\nWhy AFFIRMA Rehabilitation? Our innovative occupational therapy clinical programs will challenge new and experienced Occupational Therapists. We have state:of:the:art ACP equipment, modality based programs, well equipped gyms and much more. Our Rehab Directors are experienced, organized, manage schedules well and lead our therapy teams in the ethical care of our residents. If you ve always wanted to work for a company that supports your occupational therapy career development or can help you maintain a work/life balance, you owe it to yourself to find out more about AFFIRMA Rehabilitation.\r\n \r\nAFFIRMA Rehabilitation is changing the way you look at geriatric rehabilitation. With our Homeward Bound Programs, AFFIRMA Rehabilitation facilities are now returning over 75 of their residents to home or community level living. Our interdisciplinary team of Physical, Occupational and Speech Therapists use a collaborative approach to finding the right plan of care of each 

In [80]:
len(index.bucket.keys())

1710

# Assign Document Clusters based on Hash Similarity

In [82]:
# determine how many clusters there are and which docs belong to each cluster
clusters = {}
cluster_id = 0
for simhash_key, hashes in index.bucket.items():
    similar_doc_ids = [int(hash_details.split(",")[1]) for hash_details in hashes]
    clusters[cluster_id] = set(similar_doc_ids)
    cluster_id += 1

Clusters seems like it should contain the cluster assignments for each document, but there are documents appearing in multiple clusters.  To de-duplicate, I will form an adjancency matrix and extract connected components.

In [83]:
print(len(edges), len(data_ads), matrix_shape, len(rows))

798731 1000 (8954, 8954) 798731


In [84]:
# create a list of tuples representing the edges in a graph
edges = set([(x, y) for c in clusters.values() for x, y in combinations_with_replacement(c, 2)])

# create an adjency matrix from the edge list
matrix_shape = (len(data_ads), len(data_ads))  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster for each document
nbr_clusters, cluster = connected_components(sparse_mat)
data_ads['cluster'] = cluster

In [153]:
# inspect a cluster
data_ads[data_ads['cluster']==3]

Unnamed: 0,index,value,cluster
3,3,Be your own boss working under a Strong Brand ...,3
4,4,Be your own boss working under a Strong Brand ...,3
5,5,Be your own boss working under a Strong Brand ...,3
6,6,Be your own boss working under a Strong Brand ...,3
7,7,Be your own boss working under a Strong Brand ...,3
...,...,...,...
114,114,Be your own boss working under a Strong Brand ...,3
115,115,Be your own boss working under a Strong Brand ...,3
116,116,Be your own boss working under a Strong Brand ...,3
117,117,Be your own boss working under a Strong Brand ...,3


# Try Adding New Docs

New docs can be added to the index without having to rebuild it from scratch.  

In [159]:
data_ads_new = pd.read_csv('kaggle_text_classified_ads.csv').reset_index().iloc[1000:1010]
data_ads_new = data_ads_new[['index', 'value']]

data_ads_new_dict = data_ads_new['value'].to_dict()

# hash the documents
data_ads_new_objs = [(str(k), Simhash(get_features(v))) for k, v in data_ads_new_dict.items()]
assert(len(data_ads_new_objs) == len(data_ads_new))

# update the index
for obj in data_ads_new_objs:
    index.add(*obj)

('1000', <simhash.Simhash object at 0x7fe28f038fd0>)
('1001', <simhash.Simhash object at 0x7fe2aef98c40>)
('1002', <simhash.Simhash object at 0x7fe29044d250>)
('1003', <simhash.Simhash object at 0x7fe2e58184c0>)
('1004', <simhash.Simhash object at 0x7fe28f04b6a0>)
('1005', <simhash.Simhash object at 0x7fe28f04b670>)
('1006', <simhash.Simhash object at 0x7fe28f04bd30>)
('1007', <simhash.Simhash object at 0x7fe28f04b580>)
('1008', <simhash.Simhash object at 0x7fe28f04ba00>)
('1009', <simhash.Simhash object at 0x7fe28f04baf0>)


In [160]:
# determine how many clusters there are and which docs belong to each cluster
clusters = {}
cluster_id = 0
for simhash_key, hashes in index.bucket.items():
    similar_doc_ids = [int(hash_details.split(",")[1]) for hash_details in hashes]
    clusters[cluster_id] = set(similar_doc_ids)
    cluster_id += 1

In [166]:
# create a list of tuples representing the edges in a graph
edges = set([(x, y) for c in clusters.values() for x, y in combinations_with_replacement(c, 2)])

# create an adjency matrix from the edge list
matrix_shape = (len(data_ads)+len(data_ads_new), len(data_ads)+len(data_ads_new))  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster for each document
nbr_clusters, cluster = connected_components(sparse_mat)
data_ads_new['cluster'] = cluster[-len(data_ads_new):]

In [167]:
data_ads_new.head()

Unnamed: 0,index,value,cluster
1000,1000,Job Description:/h3:\r\n\r\nExamples of Import...,5
1001,1001,"Branch Location: San Jose, CA \r\n\r\n Carpet ...",5
1002,1002,Registered Nurses Only \r\n Seasonal Nationwid...,108
1003,1003,"Branch Location: San Jose, CA \r\n\r\n Carpet ...",5
1004,1004,Calling All Production Workers\r\n\r\nJob Desc...,5


# Evaluate on Public Text Similarity Datasets

Datasets used:
1. Google PAWS for paraphrased sentence pairs, https://huggingface.co/datasets/paws
2. Kaggle Text Classified Ads, https://www.kaggle.com/overflow012/playing-with-ads

The text classified ads have duplicates but they are not labeled, so I will do a pairwise comparison of a selection of rows to identify exact, verbatim duplicates.  This will test Simhash's ability to find perfect matches.

In [189]:
paws = load_dataset("paws", "labeled_final", split="train")
print(paws)

Reusing dataset paws (/Users/nicholaslincoln/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338)


Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label'],
    num_rows: 49401
})


In [235]:
# the sentences will have to be combined for hashing, but keep the index so they can be split back out
split_idx = len(paws)
paws_all_sents = paws['sentence1'] + paws['sentence2']
paws_dict = {idx: sent for idx, sent in enumerate(paws_all_sents)}

# hash the documents
paws_objs = [(str(k), Simhash(get_features(v))) for k, v in paws_dict.items()]

# create an index for efficient searching
distance_threshold = 3  # hashes > this value will not be considered similar, value must be an integer
index = SimhashIndex(paws_objs, k=distance_threshold)

In [236]:
# determine how many clusters there are and which docs belong to each cluster
clusters = {}
cluster_id = 0
for simhash_key, hashes in index.bucket.items():
    similar_doc_ids = [int(hash_details.split(",")[1]) for hash_details in hashes]
    clusters[cluster_id] = set(similar_doc_ids)
    cluster_id += 1

In [237]:
# create a list of tuples representing the edges in a graph
edges = set([(x, y) for c in clusters.values() for x, y in combinations_with_replacement(c, 2)])

# create an adjency matrix from the edge list
matrix_shape = (len(paws_dict), len(paws_dict))  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster for each document
nbr_clusters, cluster = connected_components(sparse_mat)

In [238]:
# split eval data back into 2 columns, add cluster, add label
eval_df = pd.DataFrame({
    'sent1': paws['sentence1'],
    'sent2': paws['sentence2'],
    'label': paws['label'],
    'cluster1': cluster[:split_idx],
    'cluster2': cluster[split_idx:]
})
eval_df['simhash_predicted_similar'] = np.where(eval_df.cluster1 == eval_df.cluster2, 1, 0)

In [239]:
print(
    "Simhash Evaluation on Google PAWS:\n",
    f"accuracy: {accuracy_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"precision: {precision_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"recall: {recall_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"f1: {f1_score(eval_df.label, eval_df.simhash_predicted_similar):2f}"
)

Simhash Evaluation on Google PAWS:
 accuracy: 0.449647
 precision: 0.443090
 recall: 0.955701
 f1: 0.605468


In [241]:
eval_df.simhash_predicted_similar.mean()

0.9530778729175523

So it misses very few texts that are similar, but it has a lot of false positives.  I found that playing with the distance threshold, k, had a big impact, but since it must be an integer, setting it to 2 was too low (it missed everything) and 3 is almost too high (nearly everything is considered similar).  

In [240]:
eval_df.head(25)

Unnamed: 0,sent1,sent2,label,cluster1,cluster2,simhash_predicted_similar
0,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0,0,0,1
1,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1,0,0,1
2,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0,0,0,1
3,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1,0,0,1
4,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1,0,0,1
5,William Henry Henry Harman was born on 17 Febr...,"William Henry Harman was born in Waynesboro , ...",1,0,0,1
6,Bullion Express - concept is being introduced ...,2011-DGSE Bullion Express concept is introduce...,0,0,0,1
7,With a discrete amount of probabilities Formul...,Given a discrete set of probabilities formula ...,1,0,0,1
8,The Soviet Union maintained an embassy in Oslo...,The Soviet Union maintained an embassy in Mosc...,0,0,0,1
9,Vocabulary even went to Brazil through leaving...,Vocabulary even went to Brazil by leaving Maca...,0,0,0,1


### Eval on Perfect Matches

In [23]:
# shrink dataset to make it more manageable
data_ads_sub = data_ads.iloc[:50]

# create labels for verbatim similarity for text classified ads
comparison_dict = {}
comparison_dict_index = 0
data_ads_docs = data_ads_sub.value.tolist()
for doc_1 in data_ads_docs:
    for doc_2 in data_ads_docs:
        if doc_1 == doc_2:
            comparison_dict[comparison_dict_index] = (doc_1, doc_2, 1)
        else:
            comparison_dict[comparison_dict_index] = (doc_1, doc_2, 0)
        comparison_dict_index += 1

In [24]:
eval_df = pd.DataFrame.from_dict(comparison_dict, orient="index", columns=["doc1", "doc2", "label"])
eval_df.head()

Unnamed: 0,doc1,doc2,label
0,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,1
1,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,0
2,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview:\r\n\r\nUnder general supervision by ...,0
3,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0
4,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0


In [26]:
# the sentences will have to be combined for hashing, but keep the index so they can be split back out
split_idx = len(eval_df)
eval_df_all_sents = eval_df['doc1'].tolist() + eval_df['doc2'].tolist()
eval_df_dict = {idx: sent for idx, sent in enumerate(eval_df_all_sents)}

# hash the documents
eval_df_objs = [(str(k), Simhash(get_features(v))) for k, v in eval_df_dict.items()]

# create an index for efficient searching
distance_threshold = 3  # hashes > this value will not be considered similar, value must be an integer
index = SimhashIndex(eval_df_objs, k=distance_threshold)

In [28]:
# determine how many clusters there are and which docs belong to each cluster
clusters = {}
cluster_id = 0
for simhash_key, hashes in index.bucket.items():
    similar_doc_ids = [int(hash_details.split(",")[1]) for hash_details in hashes]
    clusters[cluster_id] = set(similar_doc_ids)
    cluster_id += 1

In [31]:
# create a list of tuples representing the edges in a graph
edges = set([(x, y) for c in clusters.values() for x, y in combinations_with_replacement(c, 2)])

# create an adjency matrix from the edge list
matrix_shape = (len(eval_df_dict), len(eval_df_dict))  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster for each document
nbr_clusters, cluster = connected_components(sparse_mat)

In [33]:
# split eval data back into 2 columns, add cluster, add label
eval_df['cluster1'] = cluster[:split_idx]
eval_df['cluster2'] = cluster[split_idx:]
eval_df['simhash_predicted_similar'] = np.where(eval_df.cluster1 == eval_df.cluster2, 1, 0)

In [34]:
print(
    "Simhash Evaluation on Google PAWS:\n",
    f"accuracy: {accuracy_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"precision: {precision_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"recall: {recall_score(eval_df.label, eval_df.simhash_predicted_similar):2f}\n",
    f"f1: {f1_score(eval_df.label, eval_df.simhash_predicted_similar):2f}"
)

Simhash Evaluation on Google PAWS:
 accuracy: 1.000000
 precision: 1.000000
 recall: 1.000000
 f1: 1.000000


In [35]:
eval_df.head(25)

Unnamed: 0,doc1,doc2,label,cluster1,cluster2,simhash_predicted_similar
0,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,1,0,0,1
1,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,0,0,1,0
2,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Overview:\r\n\r\nUnder general supervision by ...,0,0,2,0
3,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
4,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
5,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
6,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
7,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
8,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0
9,Overview \r\n \r\nWhy AFFIRMA Rehabilitation? ...,Be your own boss working under a Strong Brand ...,0,0,3,0


So simhash finds identical docs perfectly. There were also no false positives, even for a distance threshold, k, of 3.  This suggests that it works better with longer texts.

# Applying to Blackwing Training Data

In [118]:
blackwing_data = pd.read_csv('blackwing_3m_9k.csv')
blackwing_data = blackwing_data[blackwing_data['disposition'].isin(['SELECT', 'IGNORE'])]

blackwing_data_dict = blackwing_data.text.to_dict()

# hash the documents
blackwing_objs = [(str(k), Simhash(get_features(v))) for k, v in blackwing_data_dict.items()]

# create an index for efficient searching
distance_threshold = 1  # hashes > this value will not be considered similar, value must be an integer
index = SimhashIndex(blackwing_objs, k=distance_threshold)

In [119]:
# determine how many clusters there are and which docs belong to each cluster
clusters = {}
cluster_id = 0
for simhash_key, hashes in index.bucket.items():
    similar_doc_ids = [int(hash_details.split(",")[1]) for hash_details in hashes]
    clusters[cluster_id] = set(similar_doc_ids)
    cluster_id += 1

# create a list of tuples representing the edges in a graph
edges = set([(x, y) for c in clusters.values() for x, y in combinations_with_replacement(c, 2)])

In [120]:
# create an adjency matrix from the edge list
matrix_shape = (len(blackwing_data), len(blackwing_data))  # will be a square adjacency matrix
rows, cols = zip(*edges)
sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)

# get the cluster for each document
nbr_clusters, cluster = connected_components(sparse_mat)

In [121]:
blackwing_data['cluster'] = cluster
blackwing_data.reset_index(drop=False, inplace=True)
blackwing_data.sort_values(by=['cluster', 'index']).reset_index(drop=True).to_csv('blackwing_sorted_by_sim.csv', index=False)

Here is proof that syndicated content is captured...

In [138]:
blackwing_data.loc[
    blackwing_data.cluster.isin(
        blackwing_data.groupby(['cluster'])['source_name'].nunique()[
            blackwing_data.groupby(['cluster'])['source_name'].nunique()>1
        ].index
    ),
    ['source_name', 'headline', 'text', 'cluster']
].drop_duplicates().head(25)

Unnamed: 0,source_name,headline,text,cluster
2229,Anchorage (AK) Daily News,Boosters wane in effectiveness after 4 months ...,Booster shots of the Pfizer-BioNTech and Moder...,31
2484,Guam Daily Post,"Booster effectiveness wanes after 4 months, bu...",Booster shots of the Pfizer-BioNTech and Moder...,31
2571,New Delhi (IND) Pioneer,Mcap of nine of top-10 cos erodes by over Rs 1...,Nine of the 10 most valued companies together ...,51
2644,Hindu Business Line (IND),Mcap of nine of top-10 cos erodes by over Rs 1...,Nine of the 10 most valued companies together ...,55
2728,Newshub (NZL),Climate Change Minister James Shaw was warned ...,Treasury warned even a less ambitious goal wou...,59
2762,Outlook India,Mcap Of Nine Most-Valued Firm Declines By Over...,Nine of the 10 most valued companies together ...,51
2866,theoutreach.in,Mcap of nine of top-10 cos erodes by over Rs 1...,Nine of the 10 most valued companies together ...,55
3005,MSN (NZL),Climate Change Minister James Shaw was warned ...,Newshub can reveal the Climate Change Minister...,59
3123,Gulf Digital News (BHR),MEBAA Show for business aviation returns in De...,The MEBAA Show the Middle East?s leading busin...,83
3145,Travel & Tourism News Middle East,MEBAA Show for business aviation returns in De...,The MEBAA Show the Middle East?s leading busin...,83
