In [10]:
import re
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import networkx as nx
print(nx.__version__)
from simhash import Simhash, SimhashIndex
from itertools import combinations_with_replacement
from collections import defaultdict
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

2.7.1


In [2]:
def get_features(s: str, width: int=3) -> list:
    """
    Split a string into overlapping character shingles of a fixed width.

    The text is lower-cased and every run of non-word characters (spaces, punctuation, ...)
    is removed before the window slides over it.  Example: 'how are' -> ['how', 'owa', 'war', 'are']

    :param s: String to parse
    :param width: Length of the sliding window.  With the default, 3, each shingle holds a character
        plus one neighbour on each side.
    :return: List of substrings.  When the cleaned text is shorter than ``width`` the list has a
        single element holding the whole cleaned text (which may be the empty string).
    """
    normalized = re.sub(r'[^\w]+', '', s.lower())
    # always emit at least one window so that short (or empty) texts still yield a feature
    window_count = max(len(normalized) - width + 1, 1)
    features = []
    for start in range(window_count):
        features.append(normalized[start:start + width])
    return features


def hash_and_index_documents(docs: pd.DataFrame, text_col: str, distance_threshold: int=1):
    """
    Hash every document with SimHash and load the hashes into a SimhashIndex for lookups.

    The dataframe index supplies the document ids; each id is converted to a string before
    it is paired with its hash.

    :param docs: Dataframe of documents
    :param text_col: Name of the dataframe column with the document text
    :param distance_threshold: Passed to SimhashIndex as ``k``.  Hash distances greater than this
        value will not be considered similar.  This value must be an integer.
    :return: Tuple ``(hashes, index)`` where ``hashes`` is a list of ``(doc_id_as_str, Simhash)``
        pairs and ``index`` is the SimhashIndex built from that list.
    """
    text_by_doc_id = docs[text_col].to_dict()
    hashes = []
    for doc_id, text in text_by_doc_id.items():
        # the hash is computed over the character shingles of the text
        hashes.append((str(doc_id), Simhash(get_features(text))))
    return hashes, SimhashIndex(hashes, k=distance_threshold)


def determine_clusters(index) -> dict:
    """
    Turn the buckets of a SimHash index into candidate clusters of document ids.

    The SimHash library has no clustering helper of its own, so this function reads
    index.bucket directly.  That attribute is a dictionary keyed by bucket key; every value is
    a collection of comma separated strings whose second field is the document id.  Taking
    that second field for every entry gives the ids of the documents sharing a bucket.

    Bucket keys come from index.get_keys() and look like c:i (for example 554607d1:0), where c
    is derived from the hash by bitwise operations and i is an offset that depends on the
    tolerance (distance) threshold k: k=1 gives keys for i=0 and i=1, k=2 gives three keys, and
    so on.  Documents that share a bucket key are candidates for similarity; whether they
    really are similar still depends on k, so a bucket can contain documents that are not.

    :param index: SimHash index object
    :return: Dictionary mapping a running integer (0, 1, 2, ... in bucket iteration order) to
        the set of integer document ids found in that bucket.  Identical sets may appear under
        several keys because each bucket gets its own entry.
    """
    def doc_ids_in(bucket_entries):
        # each entry is a comma separated string; the second field is the document id
        return {int(entry.split(",")[1]) for entry in bucket_entries}

    return {
        cluster_id: doc_ids_in(bucket_entries)
        for cluster_id, bucket_entries in enumerate(index.bucket.values())
    }


def build_adjacency_matrix_from_clusters(clusters: dict, nbr_documents: int):
    """
    Given the clusters, build a square, sparse adjacency matrix that shows which documents
    are connected (documents are connected if they are in the same cluster).

    Every document in a cluster is also paired with itself, so the diagonal entry of each
    clustered document is set.  Each pair is generated in one orientation only per cluster,
    so the matrix is not guaranteed to be symmetric.

    :param clusters: The cluster dictionary from determine_clusters() (values are collections of
        integer document ids, each smaller than nbr_documents)
    :param nbr_documents: The total number of documents.  The matrix will have this many rows and columns.
    :return: scipy.sparse.coo_matrix of shape (nbr_documents, nbr_documents) with 1.0 at every
        connected (row, col) position.  With no clusters (or only empty ones) the matrix is empty.
    """
    matrix_shape = (nbr_documents, nbr_documents)
    # create a set of tuples representing the edges in a graph (the set removes repeated pairs)
    edges = {(x, y) for c in clusters.values() for x, y in combinations_with_replacement(c, 2)}
    if not edges:
        # zip(*edges) cannot be unpacked into rows/cols when there are no edges
        return coo_matrix(matrix_shape)
    # create a square adjacency matrix from the edge list
    rows, cols = zip(*edges)
    sparse_mat = coo_matrix((np.ones(len(edges)), (rows, cols)), shape=matrix_shape)
    return sparse_mat


def get_one_sample_text_per_cluster(df: pd.DataFrame, text_col) -> list:
    """
    Clusters have similar texts, so take the first one per cluster to reduce the number of documents
    that the similarity search has to run through.  Haystack expects the input to be a list of dictionaries
    in the following format, which is what this function creates:
        [{'doc_id': 0, 'content': 'this is a document'}, {'doc_id': 1, 'content': 'this is another document'}]

    The document id and the text of a record always come from the same dataframe row: the first row
    of each cluster in the dataframe's current order.  Rows whose cluster is missing are ignored.

    :param df: Dataframe with the texts.  Must contain the columns 'cluster' and 'doc_id'.
    :param text_col: Name of the column with the document text
    :return: List with one {'doc_id': ..., 'content': ...} dictionary per cluster, ordered by cluster
    """
    # groupby('cluster').first() would pick the first non-null value of each column independently,
    # which can combine the doc_id of one row with the text of another; select whole rows instead
    first_rows = (
        df.dropna(subset=['cluster'])
        .drop_duplicates(subset='cluster', keep='first')
        .sort_values('cluster')
        .rename(columns={text_col: 'content'})
    )
    return first_rows[['doc_id', 'content']].to_dict(orient='records')

In [3]:
# Load the mock document pairs (path is relative to the notebook's working directory) and preview
# the first rows; each row pairs a text_a with a text_b plus their expected cluster ids.
d = pd.read_csv("mock_similarity_data.csv")
d.head()

Unnamed: 0,pair_id,doc_id_a,text_a,doc_id_b,text_b,label,expected_cluster_a,expected_cluster_b
0,0,0,"I lived at West Egg, the--well, the less fashi...",0,"I lived at West Egg, the--well, the less fashi...",0,0,0
1,1,0,"I lived at West Egg, the--well, the less fashi...",1,"I lived at West Egg, the less fashionable of t...",1,0,0
2,2,0,"I lived at West Egg, the--well, the less fashi...",2,"I lived at the very tip of West Egg, fifty yar...",2,0,0
3,3,0,"I lived at West Egg, the--well, the less fashi...",3,The Normandy American Cemetery and Memorial in...,3,0,2
4,4,0,"I lived at West Egg, the--well, the less fashi...",4,"The other girl, Daisy, made an attempt to rise...",4,0,3


## SimHash

In [126]:
# largest hash distance at which two documents still count as similar; must be a whole number
hash_distance_limit = 6

In [127]:
# ingest and hash documents
# Stack the "a" and "b" side of every pair into one long frame so that each text gets its own row.
# The frame keeps its default integer index, which hash_and_index_documents uses as the document id.
hashable_df = pd.DataFrame({
    "text": d['text_a'].tolist() + d['text_b'].tolist(),
    "expected_cluster": d['expected_cluster_a'].tolist() + d['expected_cluster_b'].tolist(),
})
# df_objs is the list of (doc id as string, Simhash) pairs; index is the SimhashIndex built from it
df_objs, index = hash_and_index_documents(
    docs=hashable_df,
    text_col='text',
    distance_threshold=hash_distance_limit
)
print(f"{len(hashable_df)} documents hashed")

30 documents hashed


In [128]:
# get the cluster for each document
clusters = determine_clusters(index)
sparse_mat = build_adjacency_matrix_from_clusters(clusters=clusters, nbr_documents=len(hashable_df))
# documents that are linked through any chain of shared buckets end up with the same component label
nbr_clusters, cluster = connected_components(sparse_mat)
hashable_df['cluster'] = cluster
# Only expose the index as 'doc_id' once: re-running this cell would otherwise add a second
# 'doc_id' column and break the doc_id -> cluster mapping below.
if 'doc_id' not in hashable_df.columns:
    hashable_df = hashable_df.reset_index(drop=False).rename(columns={'index':'doc_id'})
doc_id_to_cluster_map = dict(zip(hashable_df['doc_id'], hashable_df['cluster']))
print(f"{nbr_clusters} clusters found")

8 clusters found


In [129]:
# Inspect the raw bucket-derived groups: one entry per index bucket, so the same set of ids can repeat.
clusters

{0: {0, 1, 2, 3, 4, 15},
 1: {0, 1, 2, 3, 4, 15},
 2: {0, 1, 2, 3, 4, 10, 11, 12, 13, 14, 15, 25},
 3: {0, 1, 2, 3, 4, 15, 16},
 4: {0, 1, 2, 3, 4, 15, 18},
 5: {0, 1, 2, 3, 4, 15, 16},
 6: {0, 1, 2, 3, 4, 15},
 7: {5, 6, 7, 8, 9, 20},
 8: {5, 6, 7, 8, 9, 20},
 9: {5, 6, 7, 8, 9, 20},
 10: {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20, 25},
 11: {5, 6, 7, 8, 9, 20},
 12: {5, 6, 7, 8, 9, 20},
 13: {5, 6, 7, 8, 9, 20},
 14: {10, 11, 12, 13, 14, 24, 25},
 15: {10, 11, 12, 13, 14, 25},
 16: {10, 11, 12, 13, 14, 25},
 17: {10, 11, 12, 13, 14, 25},
 18: {10, 11, 12, 13, 14, 25},
 19: {16},
 20: {16},
 21: {16, 26},
 22: {16},
 23: {16},
 24: {17},
 25: {17},
 26: {17},
 27: {17},
 28: {17},
 29: {17},
 30: {17},
 31: {18},
 32: {18},
 33: {18},
 34: {18},
 35: {18},
 36: {18},
 37: {19},
 38: {19},
 39: {19},
 40: {19},
 41: {19},
 42: {19},
 43: {19},
 44: {21, 26},
 45: {21},
 46: {21},
 47: {21},
 48: {21},
 49: {21},
 50: {21},
 51: {22},
 52: {22},
 53: {22},
 54: {22},
 55: {22},
 56: {22},
 

In [130]:
# Print the bucket keys that the index derives from the hash of the first document.
for k in index.get_keys(df_objs[0][1]):
    print(k)

1d1:0
103:1
151:2
a:3
85:4
1:5
2ac:6


In [137]:
# Walk every index bucket, print its key, size and member document ids, and keep the members as a set.
alt_clusters = {}
cluster_id = 0
for simhash_key, hashes in index.bucket.items():
    # parse the ids once and reuse them for both the printout and the stored set
    similar_doc_ids = [int(hash_details.split(",")[1]) for hash_details in hashes]
    print(simhash_key, "-", len(hashes), "-", similar_doc_ids)
    alt_clusters[cluster_id] = set(similar_doc_ids)
    cluster_id += 1

1d1:0 - 6 - [4, 1, 15, 0, 3, 2]
103:1 - 6 - [4, 1, 15, 0, 3, 2]
151:2 - 12 - [25, 4, 1, 10, 14, 15, 12, 0, 3, 2, 11, 13]
a:3 - 7 - [4, 1, 15, 0, 3, 2, 16]
85:4 - 7 - [4, 1, 15, 0, 18, 3, 2]
1:5 - 7 - [4, 1, 15, 0, 3, 2, 16]
2ac:6 - 6 - [4, 1, 15, 0, 3, 2]
117:0 - 6 - [8, 9, 5, 6, 20, 7]
1be:1 - 6 - [8, 9, 5, 6, 20, 7]
78:2 - 6 - [8, 9, 5, 6, 20, 7]
6b:3 - 12 - [25, 10, 8, 14, 9, 20, 5, 12, 6, 11, 13, 7]
1ec:4 - 6 - [8, 9, 5, 6, 20, 7]
28:5 - 6 - [8, 9, 5, 6, 20, 7]
8e:6 - 6 - [8, 9, 5, 6, 20, 7]
152:0 - 7 - [25, 10, 14, 12, 24, 11, 13]
bb:1 - 6 - [25, 10, 14, 12, 11, 13]
c4:4 - 6 - [25, 10, 14, 12, 11, 13]
6a:5 - 6 - [25, 10, 14, 12, 11, 13]
46:6 - 6 - [25, 10, 14, 12, 11, 13]
1d3:0 - 1 - [16]
123:1 - 1 - [16]
153:2 - 2 - [26, 16]
87:4 - 1 - [16]
2a6:6 - 1 - [16]
159:0 - 1 - [17]
151:1 - 1 - [17]
171:2 - 1 - [17]
ca:3 - 1 - [17]
1a:4 - 1 - [17]
3:5 - 1 - [17]
2e4:6 - 1 - [17]
137:0 - 1 - [18]
1ab:1 - 1 - [18]
175:2 - 1 - [18]
8a:3 - 1 - [18]
2a:5 - 1 - [18]
2ed:6 - 1 - [18]
143:0 - 1 -

In [133]:
# sense check: the near-duplicate sets reported by the index itself should align with the clusters
simhash_clusters = {
    doc_id: set(index.get_near_dups(doc_hash))
    for doc_id, doc_hash in df_objs
}
simhash_clusters

{'0': {'0', '1', '15', '16', '2', '3', '4'},
 '1': {'0', '1', '15', '16', '2', '3', '4'},
 '2': {'0', '1', '15', '16', '2', '3', '4'},
 '3': {'0', '1', '15', '16', '2', '3', '4'},
 '4': {'0', '1', '15', '16', '2', '3', '4'},
 '5': {'20', '5', '6', '7', '8', '9'},
 '6': {'20', '5', '6', '7', '8', '9'},
 '7': {'20', '5', '6', '7', '8', '9'},
 '8': {'20', '5', '6', '7', '8', '9'},
 '9': {'20', '5', '6', '7', '8', '9'},
 '10': {'10', '11', '12', '13', '14', '25'},
 '11': {'10', '11', '12', '13', '14', '25'},
 '12': {'10', '11', '12', '13', '14', '25'},
 '13': {'10', '11', '12', '13', '14', '25'},
 '14': {'10', '11', '12', '13', '14', '25'},
 '15': {'0', '1', '15', '16', '2', '3', '4'},
 '16': {'0', '1', '15', '16', '2', '3', '4'},
 '17': {'17'},
 '18': {'18'},
 '19': {'19'},
 '20': {'20', '5', '6', '7', '8', '9'},
 '21': {'21'},
 '22': {'22'},
 '23': {'23'},
 '24': {'24'},
 '25': {'10', '11', '12', '13', '14', '25'},
 '26': {'26'},
 '27': {'27'},
 '28': {'28'},
 '29': {'29'}}

In [138]:
# build a graph with one node per matrix row and one edge per non-zero matrix entry
G = nx.from_scipy_sparse_array(sparse_mat)

# determine node position from a given layout (use the spring layout)
# the layout starts from random positions, so fix the seed to get the same picture on every run
pos = nx.spring_layout(G, seed=42)

# assign node data attributes, such as position
# node n is row n of the matrix, which is also the row label n of hashable_df
for n in G.nodes:
    G.nodes[n]['pos'] = pos[n]
    G.nodes[n]['doc_id'] = hashable_df.loc[n, 'doc_id']
    G.nodes[n]['text'] = hashable_df.loc[n, 'text']
    G.nodes[n]['expected_cluster'] = hashable_df.loc[n, 'expected_cluster']
    G.nodes[n]['cluster'] = hashable_df.loc[n, 'cluster']

In [139]:
def plot_graph(G):
    """
    Plots a NetworkX graph object with Plotly.

    Every node of the graph must carry the attributes 'pos' (an (x, y) pair), 'doc_id', 'text',
    'expected_cluster' and 'cluster'.  Nodes are coloured by the size of their adjacency entry and
    show their attributes (text cut to 75 characters) on hover.  The figure title reads the
    notebook-level variable ``hash_distance_limit``, which therefore has to be defined before
    this function is called.

    :param G: NetworkX graph whose nodes have the attributes listed above
    :return: plotly.graph_objects.Figure with one line trace for the edges and one marker trace
        for the nodes
    """
    # get edge positions for graph; a None entry after each pair keeps the segments separate
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = G.nodes[source]['pos']
        x1, y1 = G.nodes[target]['pos']
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # plot edges
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    # get node positions, connection counts and hover texts in a single pass.
    # The node id comes from the adjacency iterator itself, so graphs whose nodes are not
    # numbered 0..n-1 are handled as well.
    node_x = []
    node_y = []
    node_adjacencies = []
    node_text = []
    for node, neighbours in G.adjacency():
        attrs = G.nodes[node]
        x, y = attrs['pos']
        node_x.append(x)
        node_y.append(y)
        # color nodes by the number of connections
        node_adjacencies.append(len(neighbours))
        node_text.append((
            f"Doc ID: {attrs['doc_id']} <br>"
            f"Text: {attrs['text'][:75]} <br>"
            f"Expected Cluster: {attrs['expected_cluster']} <br>"
            f"Cluster: {attrs['cluster']}"
        ))

    # plot nodes
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        text=node_text,
        marker=dict(
            showscale=True,
            # colorscale options
            #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
            colorscale='YlGnBu',
            reversescale=True,
            color=node_adjacencies,
            size=14,
            colorbar=dict(
                thickness=15,
                # nested title form replaces the deprecated 'titleside' attribute
                title=dict(text='Node Connections', side='right'),
                xanchor='left'
            ),
            line_width=2))

    # create the plot
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            # nested title form replaces the deprecated 'titlefont_size' attribute
            title=dict(
                text=f'Network Graph from SimHash Sparse Matrix, Distance Thresh: {hash_distance_limit}',
                font=dict(size=16)
            ),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20,l=5,r=5,t=40),
            annotations=[
                dict(
                    text="Python code: <a href='https://plotly.com/ipython-notebooks/network-graphs/'> https://plotly.com/ipython-notebooks/network-graphs/</a>",
                    showarrow=False,
                    xref="paper",
                    yref="paper",
                    x=0.005,
                    y=-0.002
                )
            ],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
    )

    return fig

In [140]:
# Build the Plotly figure for the document graph and render it.
fig = plot_graph(G)
fig.show()

### Analysis

SimHash with a distance threshold of 1 only groups texts that are exactly the same.  The semantically similar texts are not included in the same cluster.  

Increasing the threshold reaches a cliff, beyond which all texts are considered similar.  Unfortunately, SimHash's limitation of requiring the threshold to be an integer makes it hard to determine where the threshold should be set to capture less exact matches.