# Duplicates and PII detection


In [3]:
import pandas as pd

In [4]:
df_all = pd.read_csv("../datasets/local/vast-challenge-2021-mc3/all_tweets.csv")

In [5]:
df_all["message"]

0                                   Follow us @POK-Kronos
1       Don't miss a moment!  Follow our live coverage...
2       Come join us in the Park! Music tonight at Abi...
3       POK rally to start in Abila City Park. POK lea...
4       POK rally set to take place in Abila City Park...
                              ...                        
4058    RT @AbilaPost unknown explosion heard from the...
4059    RT @CentralBulletin explosion heard at dancing...
4060    RT @KronosStar There has been an explosion fro...
4061                    RT @redisrad What was that? #boom
4062    RT @CentralBulletin explosion heard at dancing...
Name: message, Length: 4063, dtype: object

# Exact duplicates


In [6]:
df_all["message"].value_counts()

message
TRAFFIC STOP                                                                                                                           36
Develop success from failures. Discouragement and failure are two of the surest stepping stones to success.  #POKRally #HI             15
INCOMPLETE CALL FOR POLICE                                                                                                             15
ALARM-SECURE NO CRIME                                                                                                                  15
You've got to get up every morning with determination if you're going to go to bed with satisfaction.  #POKRally #HI                   14
                                                                                                                                       ..
RT @OnlytheTruth MOAR FIRE! Really, thatz it?? pahetic                                                                                  1
RT @GreyCatCollectibles wh

# Near duplicates


- Old-school way: use k-means or another clustering method on a TF-IDF representation; this is a brittle representation
- Newer basic way: get embeddings and then cluster, but this requires knowing the number of clusters a priori
- Newest way: use a vector DB and embeddings from GPT: https://mikulskibartosz.name/text-search-and-duplicate-detection-with-word-embeddings-and-vector-databases
  - I think this requires a distance metric


In [75]:
import numpy as np
import sentence_transformers
import time
import torch

In [39]:
corpus_sentences = df_all.message
deduped_corpus_sentences = df_all.message.drop_duplicates()

In [25]:
model_names = [
    "all-mpnet-base-v2",  # best performing on leaderboard
    "all-MiniLM-L6-v2",  # smaller and faster
]


def calculate_embeddings(col: np.ndarray, model_name: str):
    """
    Encode a collection of sentences with a sentence-transformers model.

    Args:
        col: Sentences to embed. May be a NumPy array, a list, or a pandas Series
            of strings (a single string is passed through untouched). Array-likes
            are converted to a plain list first, so the encoder always receives a
            positionally indexable sequence even when a Series index is not
            0..n-1 (e.g. after ``drop_duplicates``).
        model_name: Name handed to ``sentence_transformers.SentenceTransformer``.

    Returns:
        The result of ``model.encode(..., convert_to_tensor=True)``. Its ``shape``
        is printed together with the elapsed time.

    TODO: My understanding is the sentence_transformers library cuts off inputs longer than
    384 words. Need to fix that for longer docs probably by chunking text into longest amount and
    then averaging the resulting embeddings
    """
    print("Starting embeddings...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    start_time = time.perf_counter()

    if isinstance(col, str):
        sentences = col
    elif hasattr(col, "tolist"):
        # ndarray / Series -> list of Python strings, dropping any pandas index.
        sentences = col.tolist()
    else:
        sentences = list(col)

    model = sentence_transformers.SentenceTransformer(model_name)
    e = model.encode(sentences, convert_to_tensor=True)
    elapsed_time = time.perf_counter() - start_time
    print(
        "Created embedding of shape",
        e.shape,
        "with",
        model_name,
        "in",
        elapsed_time,
        "seconds",
    )
    return e

In [86]:
from pathlib import Path

p = Path("../datasets/processed/results_cache")
# torch.save will not create missing parent directories, so make sure the cache dir exists.
p.mkdir(parents=True, exist_ok=True)

torch.save(
    embeddings["all-mpnet-base-v2"], p / "vast_challenge_2021_all-all-mpnet-base-v2.pt"
)

In [83]:
torch.save(
    embeddings["all-mpnet-base-v2"],
    "../datasets/processed/results_cache/vast_challenge_2021_all-all-mpnet-base-v2.pt",
)

In [52]:
embed_mpnet_base = embeddings["all-mpnet-base-v2"]
embed_mpnet_base_deduped = deduped_embeddings["all-mpnet-base-v2"]

In [59]:
deduped_corpus_sentences.index

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       4039, 4044, 4045, 4048, 4049, 4050, 4052, 4054, 4058, 4061],
      dtype='int64', length=2072)

In [69]:
torch.equal(embed_mpnet_base[deduped_corpus_sentences.index], embed_mpnet_base_deduped)

False

In [82]:
idx = 100

torch.allclose(
    embed_mpnet_base[deduped_corpus_sentences.index],
    embed_mpnet_base_deduped,
    atol=1e-5,
)

True

In [81]:
embed_mpnet_base_deduped[100]

tensor([-1.5468e-02,  9.8530e-02,  2.3847e-02, -2.2306e-04,  2.6073e-02,
        -2.5093e-02, -2.9947e-02,  3.3188e-02, -1.3392e-01,  4.5183e-02,
        -3.2350e-02,  8.1459e-02,  1.0748e-02,  5.0281e-02, -1.8304e-03,
        -4.1923e-02,  3.9567e-02,  2.1980e-02, -2.5987e-02,  1.6290e-02,
         1.5120e-02,  5.3738e-02,  1.9845e-02,  4.5473e-02, -6.1413e-03,
        -2.5419e-02, -3.2123e-02,  1.3655e-02,  2.2238e-02,  2.8874e-02,
         1.9102e-02, -3.0591e-02, -1.2322e-02, -8.8406e-03,  1.6808e-06,
        -1.2453e-02, -7.0594e-02,  2.2467e-02, -2.8630e-02, -9.8567e-03,
         2.2171e-02, -1.2481e-01, -6.1683e-02, -1.4328e-02, -3.0874e-02,
         2.5721e-03, -7.6173e-03,  6.2462e-02,  3.8937e-02, -2.3184e-02,
         1.2206e-02,  5.5598e-02, -1.1791e-02,  2.8031e-02,  1.5690e-02,
         5.6476e-02, -4.3279e-02, -2.3559e-02, -1.7782e-02, -9.5259e-03,
         3.6179e-02,  7.5523e-02, -3.5805e-02, -1.1464e-02, -9.7837e-02,
         4.5665e-03, -2.3565e-02,  2.0369e-02, -1.0

In [80]:
embed_mpnet_base[deduped_corpus_sentences.index][100]

tensor([-1.5468e-02,  9.8530e-02,  2.3847e-02, -2.2305e-04,  2.6073e-02,
        -2.5093e-02, -2.9947e-02,  3.3188e-02, -1.3392e-01,  4.5183e-02,
        -3.2350e-02,  8.1459e-02,  1.0748e-02,  5.0281e-02, -1.8305e-03,
        -4.1923e-02,  3.9567e-02,  2.1980e-02, -2.5987e-02,  1.6290e-02,
         1.5120e-02,  5.3738e-02,  1.9845e-02,  4.5473e-02, -6.1413e-03,
        -2.5419e-02, -3.2123e-02,  1.3655e-02,  2.2238e-02,  2.8874e-02,
         1.9102e-02, -3.0591e-02, -1.2322e-02, -8.8406e-03,  1.6808e-06,
        -1.2453e-02, -7.0594e-02,  2.2467e-02, -2.8630e-02, -9.8566e-03,
         2.2171e-02, -1.2481e-01, -6.1683e-02, -1.4328e-02, -3.0874e-02,
         2.5720e-03, -7.6173e-03,  6.2462e-02,  3.8937e-02, -2.3184e-02,
         1.2206e-02,  5.5598e-02, -1.1791e-02,  2.8031e-02,  1.5690e-02,
         5.6476e-02, -4.3279e-02, -2.3559e-02, -1.7782e-02, -9.5259e-03,
         3.6179e-02,  7.5523e-02, -3.5805e-02, -1.1464e-02, -9.7837e-02,
         4.5665e-03, -2.3565e-02,  2.0369e-02, -1.0

In [26]:
embeddings = {}

for m in model_names:
    embeddings[m] = calculate_embeddings(corpus_sentences, m)

Starting embeddings...
Created embedding of shape torch.Size([4063, 768]) with all-mpnet-base-v2 in 55.075347900390625 seconds
Starting embeddings...
Created embedding of shape torch.Size([4063, 384]) with all-MiniLM-L6-v2 in 11.42600417137146 seconds


In [43]:
deduped_embeddings = {}

for m in model_names:
    deduped_embeddings[m] = calculate_embeddings(deduped_corpus_sentences.values, m)

Starting embeddings...
Created embedding of shape torch.Size([2072, 768]) with all-mpnet-base-v2 in 30.12472701072693 seconds
Starting embeddings...
Created embedding of shape torch.Size([2072, 384]) with all-MiniLM-L6-v2 in 6.536754846572876 seconds


In [27]:
mpnet_embeddings = embeddings["all-mpnet-base-v2"]

In [87]:
import time


def timer_print(func):
    """
    Decorator that prints how long each call to ``func`` took.

    The wrapped function's return value is passed through unchanged.
    ``functools.wraps`` keeps the original ``__name__`` and docstring on the
    decorated object, so ``help()`` / ``?`` in the notebook still describe the
    real function instead of the generic ``wrapper``.
    """
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and intended for measuring durations.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"{func.__name__} run in {end_time - start_time} seconds")
        return result

    return wrapper


@timer_print
def get_clusters_comm_detection(corpus_embeddings, min_community_size=2, threshold=0.5):
    """
    Group embeddings with ``sentence_transformers.util.community_detection``.

    Args:
        corpus_embeddings: embedding matrix, handed to the library call as is.
        min_community_size: smallest number of members a cluster needs to be kept.
        threshold: cosine similarity above which two sentences count as similar
            (cosine similarity ranges from -1 to 1).

    Returns:
        The library's result, unchanged. With this custom community detection
        algorithm each sentence ends up in only one cluster, but not every
        sentence is clustered.
    """
    print("Starting clustering...")

    detection_options = {
        "min_community_size": min_community_size,
        "threshold": threshold,
    }
    communities = sentence_transformers.util.community_detection(
        corpus_embeddings, **detection_options
    )

    n_found = len(communities)
    print(f"Found {n_found} clusters")

    return communities

In [28]:
clustering_1 = get_clusters_comm_detection(mpnet_embeddings)

Starting clustering...
Found 304 clusters in 10.97336196899414 seconds


In [33]:
# Print the first 3 sentences of every cluster.
# Cluster members are row positions in the embedding matrix, so look them up
# positionally with .iloc rather than by index label.
for i, cluster in enumerate(clustering_1):
    print("\nCluster {}:: {} items ".format(i + 1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences.iloc[sentence_id])


Cluster 1:: 965 items 
	 Success is walking from failure to failure with no loss of enthusiasm.  #POKRally #HI
	 A person who never made a mistake never tried anything new.  #KronosStar #POKRally
	 The only person you are destined to become is the person you decide to be.  #KronosStar #POKRally

Cluster 2:: 864 items 
	 Concerns of violence surround POK rally today heavy police presence #AbilaPost
	 RT @AbilaPost Concerns of violence surround POK rally today heavy police presence #AbilaPost
	 POK rally today in Abila City Park concerns of violence and heavy police presence #Abila #CentralBulletin

Cluster 3:: 145 items 
	 Knowledge is being aware of what you can do. Wisdom is knowing when not to do it.  #POKRally #HI
	 It's not what you look at that matters, it's what you see.  #POKRally #HI
	 There are two types of people who will tell you that you cannot make a difference in this world. #POKRally #HI

Cluster 4:: 125 items 
	 police have closed streets surrounding POK rally in Abila

In [88]:
c3 = get_clusters_comm_detection(deduped_embeddings["all-mpnet-base-v2"], threshold=0.9)

Starting clustering...
Found 254 clusters
get_clusters_comm_detection run in 0.15440988540649414 seconds


In [98]:
# Number of rows (sentences) in the de-duplicated embedding matrix, read from the
# tensor itself instead of hard-coding it.
size = deduped_embeddings["all-mpnet-base-v2"].shape[0]

In [100]:
arr = [None] * size

len(arr)

2072

In [105]:
def get_cluster_strings(clusters, max_size):
    """
    Turn cluster membership lists into one cluster label per row.

    Params:
        clusters: list of lists; each inner list holds the row indices of one cluster
        max_size: size of original corpus (length of the returned list)

    Returns:
        labels: list of length ``max_size``; entry ``k`` is ``"c<i>"`` for the first
        cluster ``i`` that lists row ``k``, or None when no cluster lists it
    """
    labels = [None] * max_size

    for cluster_no, members in enumerate(clusters):
        label = f"c{cluster_no}"
        for row in members:
            if labels[row] is not None:
                # Row already labelled earlier: keep the first label and report it.
                print("Repeat entry: ", row)
                continue
            labels[row] = label

    return labels


# One label slot per de-duplicated sentence; derive the size instead of hard-coding 2072.
cluster_id_col = get_cluster_strings(c3, len(deduped_corpus_sentences))

In [107]:
cluster_df_1 = pd.DataFrame(
    {"message": deduped_corpus_sentences, "cluster_id": cluster_id_col}
)

cluster_df_1[cluster_df_1.cluster_id == "c0"]

Unnamed: 0,message,cluster_id
1315,Residents being saved from Dancing Dolphin Fir...,c0
1574,Nearby residents being evacuated in Dancing Do...,c0
1589,RT @NewsOnlineToday Nearby residents being eva...,c0
2047,Neighborhood evacuations in Dancing Dolphin Fi...,c0
2048,Additional residents being evacuated in Dancin...,c0
2060,RT @NewsOnlineToday Additional residents being...,c0
2551,Continued evacuations n the Abila Dancing Dolp...,c0


In [104]:
deduped_corpus_sentences

0                                   Follow us @POK-Kronos
1       Don't miss a moment!  Follow our live coverage...
2       Come join us in the Park! Music tonight at Abi...
3       POK rally to start in Abila City Park. POK lea...
4       POK rally set to take place in Abila City Park...
                              ...                        
4050                          May the Creator help us all
4052    RT @KronosStar There has been an explosion fro...
4054    RT @CentralBulletin explosion heard at dancing...
4058    RT @AbilaPost unknown explosion heard from the...
4061                    RT @redisrad What was that? #boom
Name: message, Length: 2072, dtype: object

In [50]:
clustering_2 = get_clusters_comm_detection(
    deduped_embeddings["all-mpnet-base-v2"], threshold=0.9
)

# Print for all clusters the top 3 elements
for i, cluster in enumerate(clustering_2):
    print("\nCluster {}:: {} items ".format(i + 1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", deduped_corpus_sentences.iloc[sentence_id])

Starting clustering...
Found 254 clusters in 0.1239621639251709 seconds

Cluster 1:: 7 items 
	 Residents being saved from Dancing Dolphin Fire in Abila. #NewsOnline
	 Nearby residents being evacuated in Dancing Dolphin fire in Abila. #NewsOnline
	 RT @NewsOnlineToday Nearby residents being evacuated in Dancing Dolphin fire in Abila. #NewsOnline

Cluster 2:: 5 items 
	 POK rally to start in Abila City Park. POK leader Sylvia Marek to open with a speech. #KronosStar
	 POK rally set to take place in Abila City Park - POK leader Sylvia Marek has begun with opening remarks #AbilaPost
	 RT @AbilaPost POK rally set to take place in Abila City Park - POK leader Sylvia Marek has begun with opening remarks #AbilaPost

Cluster 3:: 4 items 
	 Fire at the Dancing Dolphin apartment complex. #Abila #IntNews
	 RT @InternationalNews Fire at the Dancing Dolphin apartment complex. #Abila #IntNews
	 Continuing fire at the Dancing Dolphin apartment #Abila #IntNews

Cluster 4:: 4 items 
	 Abila Fire Depar

In [130]:
from sklearn.cluster import HDBSCAN


@timer_print
def get_clusters_hdbscan(corpus_embeddings, min_size=2):
    """
    Cluster embeddings with HDBSCAN using the Euclidean metric.

    Each row ends up in exactly one cluster or is labelled as noise.

    Args:
        corpus_embeddings: embedding matrix, one row per sentence. A torch tensor is
            detached, moved to CPU and converted to a NumPy array before fitting.
        min_size: passed to HDBSCAN as ``min_cluster_size``.

    Returns:
        clusters: object array with one entry per input row, holding that row's
        cluster label, or None where HDBSCAN assigned the noise label (-1).
    """

    print("Starting clustering...")

    # Embeddings created with convert_to_tensor=True may live on an accelerator;
    # scikit-learn needs a host-side array.
    if hasattr(corpus_embeddings, "detach"):
        corpus_embeddings = corpus_embeddings.detach().cpu().numpy()

    hdb = HDBSCAN(min_cluster_size=min_size, metric="euclidean")
    hdb.fit(corpus_embeddings)
    labels = hdb.labels_

    # Count only real clusters (labels >= 0). Negative labels mark noise/outliers and
    # may be absent, so "unique - 1" would under-count when there is no noise.
    n_clusters = np.unique(labels[labels >= 0]).size
    print(f"Found {n_clusters} clusters")

    # hdbscan labels -1 as noise, so we replace that with None
    clusters = np.where(labels == -1, None, labels)

    return clusters

In [131]:
cluster_4 = get_clusters_hdbscan(deduped_embeddings["all-mpnet-base-v2"])

Starting clustering...
Found 588 clusters
get_clusters_hdbscan run in 8.259826898574829 seconds


In [143]:
np.where(cluster_4 == -1, None, cluster_4)

array([437, None, None, ..., 525, 357, 55], dtype=object)

In [144]:
# get_clusters_hdbscan already maps the noise label (-1) to None, so the labels
# can be passed through as they are.
print_cluster_results(cluster_4, deduped_corpus_sentences)


Cluster 0:: 2 items 
	 Titanium frame, carbon wheels. guy's gonna be pissed when he gets over being scared.
	 RT @trapanitweets Titanium frame, carbon wheels. guy's gonna be pissed when he gets over being scared.

Cluster 1:: 2 items 
	 the terorrist are off their rockers - they should all go back to church
	 RT @surferMan the terorrist are off their rockers - they should all go back to church

Cluster 2:: 2 items 
	 Anyone know the name of the shoe repair on Egeou St ?
	 anyone know a good body shop?

Cluster 3:: 2 items 
	 How to power your radio with vegetables:  SOW.kronos/greenliving
	 RT @SaveOurWildlands How to power your radio with vegetables:  SOW.kronos/greenliving

Cluster 4:: 2 items 
	 Elian discovered alive, living in Tethys. smad.co.kronos/xx3942
	 RT @Officia1AbilaPost Elian discovered alive, living in Tethys. smad.co.kronos/xx3942

Cluster 5:: 2 items 
	 SUBJECT STOP
	 DRUNK SUBJECT

Cluster 6:: 3 items 
	 time for a refill ? refill your meds online at drugs.kronos/ch

In [137]:
def print_cluster_results(clustering, data, max_examples=3):
    """
    Print each cluster's size followed by a few example rows.

    Args:
        clustering: one cluster id per row of ``data``. Rows whose id is missing
            (None/NaN) are skipped because ``groupby`` drops missing keys by default.
        data: the rows (e.g. sentences) that were clustered; same length as
            ``clustering``.
        max_examples: maximum number of rows printed per cluster (default 3).
    """
    frame = pd.DataFrame({"d": data, "cluster_id": clustering})

    for cluster_id, members in frame.groupby("cluster_id"):
        print("\nCluster {}:: {} items ".format(cluster_id, len(members)))
        for sentence in members["d"].values[:max_examples]:
            print("\t", sentence)

In [115]:
deduped_embeddings["all-mpnet-base-v2"]

tensor([[ 6.6671e-02,  2.4539e-02, -1.5406e-02,  ...,  3.9320e-02,
         -4.3061e-02, -1.5672e-02],
        [-6.2846e-02,  5.0827e-02,  9.1334e-03,  ...,  1.9086e-02,
         -2.4561e-02, -5.1466e-03],
        [-8.5090e-02, -4.2328e-03, -2.3331e-02,  ...,  1.0685e-02,
          4.3658e-03, -9.7629e-03],
        ...,
        [-4.2885e-02, -1.7654e-03,  2.3000e-02,  ...,  6.4197e-02,
         -8.7117e-02, -1.1348e-03],
        [-1.9246e-02,  6.5907e-03,  6.9041e-03,  ...,  9.0256e-02,
         -2.6947e-02, -3.7649e-02],
        [ 3.1774e-02, -6.7119e-02, -1.1828e-05,  ...,  7.6287e-02,
         -2.1145e-02, -2.8326e-02]])

In [128]:
# NOTE: despite the variable name, `cos_sim` yields pairwise cosine *similarities*
# (the displayed diagonal is 1.0; larger means more alike), not distances.
distances = sentence_transformers.util.cos_sim(
    deduped_embeddings["all-mpnet-base-v2"], deduped_embeddings["all-mpnet-base-v2"]
)

In [129]:
distances

tensor([[1.0000, 0.5186, 0.2321,  ..., 0.1870, 0.2496, 0.2691],
        [0.5186, 1.0000, 0.5115,  ..., 0.2479, 0.2438, 0.2220],
        [0.2321, 0.5115, 1.0000,  ..., 0.2642, 0.2501, 0.0935],
        ...,
        [0.1870, 0.2479, 0.2642,  ..., 1.0000, 0.7186, 0.4402],
        [0.2496, 0.2438, 0.2501,  ..., 0.7186, 1.0000, 0.5620],
        [0.2691, 0.2220, 0.0935,  ..., 0.4402, 0.5620, 1.0000]])

In [None]:
clustering_methods = [
    # hierarchical clustering
    # dbscan -- og method
    # hdbscan -- slightly better dbscan, maybe? Fewer hyperparam
    # optics -- similar to dbscan but supposedly less sensitive to hyperparams
]

hierarchical and dbscan


In [8]:
from pathlib import Path
import torch

# datasets = ["dolly15k", "opus100-en-es", "squad_validation"]

CACHE_PATH = Path("../datasets/processed/embedding_cache")


def calculate_or_get_from_cache(ds_name, col_name, model_name, sentences=None):
    """
    Return cached embeddings for a dataset/column/model, computing them on a miss.

    The cache file lives in ``CACHE_PATH`` and is named
    ``DATA_{ds_name}COL_{col_name}MODEL_{model_name}.pt``.

    Args:
        ds_name: dataset identifier used in the cache file name.
        col_name: column identifier used in the cache file name.
        model_name: embedding model name; used in the file name and, on a cache
            miss, passed to ``calculate_embeddings``.
        sentences: optional iterable of texts. When the cache file is missing and
            this is given, embeddings are computed, saved to the cache and returned.

    Returns:
        The loaded or freshly computed embeddings, or None when nothing is cached
        and no ``sentences`` were supplied.
    """
    cache_file = CACHE_PATH / f"DATA_{ds_name}COL_{col_name}MODEL_{model_name}.pt"

    if cache_file.exists():
        print("Loading from cache...")
        # NOTE(security): torch.load unpickles the file; only load caches you wrote yourself.
        return torch.load(cache_file)

    if sentences is None:
        # Nothing to compute from: say so instead of claiming a calculation is running.
        print(f"No cached embeddings at {cache_file}; pass `sentences` to compute them.")
        return None

    print("Calculating embeddings...")
    # A plain list keeps the encoder independent of any pandas index on the input.
    computed = calculate_embeddings(list(sentences), model_name)
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    torch.save(computed, cache_file)
    return computed

In [9]:
calculate_or_get_from_cache("vast2021", "message", "all-mpnet-base-v2")

Loading from cache...


tensor([[ 6.6671e-02,  2.4539e-02, -1.5406e-02,  ...,  3.9320e-02,
         -4.3061e-02, -1.5672e-02],
        [-6.2846e-02,  5.0827e-02,  9.1334e-03,  ...,  1.9086e-02,
         -2.4561e-02, -5.1466e-03],
        [-8.5090e-02, -4.2328e-03, -2.3331e-02,  ...,  1.0685e-02,
          4.3658e-03, -9.7629e-03],
        ...,
        [-3.9866e-02,  2.4670e-02, -1.7471e-03,  ...,  5.1797e-02,
         -6.6075e-02, -3.0211e-03],
        [ 3.1774e-02, -6.7119e-02, -1.1838e-05,  ...,  7.6287e-02,
         -2.1145e-02, -2.8326e-02],
        [-4.2885e-02, -1.7654e-03,  2.3000e-02,  ...,  6.4197e-02,
         -8.7117e-02, -1.1348e-03]])