In [None]:
!pip install datasketch

Collecting datasketch
  Downloading datasketch-1.6.4-py3-none-any.whl (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.3/88.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: datasketch
Successfully installed datasketch-1.6.4


In [None]:
import pandas as pd
import numpy as np
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm
import logging
# Raise the level of the logger named 'simhash' to CRITICAL.
# NOTE(review): this notebook imports datasketch, not simhash, so this line
# looks like a leftover from an earlier experiment — confirm before removing.
logging.getLogger('simhash').setLevel(logging.CRITICAL)

In [None]:
# Mount Google Drive; the project CSVs are read from and written to paths
# below this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Load the titles table that the rest of the notebook splits by publication year.
TITLES_CSV = '/content/drive/MyDrive/310Project/english_only_titles.csv'
df = pd.read_csv(TITLES_CSV)

In [None]:
# Bare expression: renders the loaded table as the cell output.
df

Unnamed: 0,cord_uid,title,publish_time,language
0,l2d0mwsr,Endovascular treatment of debilitating tinnitu...,2019,en
1,od684vu5,A Scoping Review of Validated Tools to Measure...,2019,en
2,3cnamu29,17α-Hydroxyprogesterone Caproate and the Risk ...,2019,en
3,jhs2u33l,The effects of exercise on lipid profile and b...,2019,en
4,zhta1788,Comparison of vocal cord view between neutral ...,2019,en
...,...,...,...,...
919991,crcqwec6,Microcirculatory Predictors of Thrombosis in P...,2022,en
919992,ma9xs6wa,Chapter 10 An optimized CNN based automated CO...,2022,en
919993,ct4h3eb7,Fractional optimal control of compartmental SI...,2022,en
919994,th6ldewp,How to support the economic recovery of aviati...,2022,en


In [None]:
# Build a {publish_time value -> rows with that value} mapping, one sub-frame
# per distinct value, in order of first appearance.
publish_years = df['publish_time']
dfs = {}
for year in publish_years.unique():
    dfs[year] = df.loc[publish_years == year]

In [None]:
# Per-year overview: number of rows and how many titles exactly repeat an
# earlier title within the same year.
# The loop variable is deliberately not called `df`: rebinding `df` here would
# leave the global pointing at the last year's subset, so re-running any
# earlier cell that reads `df` would silently operate on that subset only.
for year, year_df in dfs.items():
    total_articles = len(year_df)
    number_of_title_duplicates = year_df['title'].duplicated().sum()
    print(f"Year {year} total articles: {total_articles}, number of duplicate titles: {number_of_title_duplicates}")

Year 2019 total articles: 8817, number of duplicate titles: 66
Year 2020 total articles: 352690, number of duplicate titles: 84621
Year 2021 total articles: 434565, number of duplicate titles: 95229
Year 2022 total articles: 123924, number of duplicate titles: 5763


# Test Trial

In [None]:
# Work on an independent copy of the 2020 subset, so the 'minhash' column
# added further down does not modify the frame stored in dfs[2020].
test = dfs[2020].copy()

In [None]:
# Bare expression: renders the 2020 working copy as the cell output.
test

Unnamed: 0,cord_uid,title,publish_time,language
8817,dhxux00x,[Coronavirus disease 2019 and hypertension in ...,2020,en
8818,xbfh0rcy,CACNA1S haploinsufficiency confers resistance ...,2020,en
8819,6aj02tw1,Cross-border capital flows in Russia: Prospect...,2020,en
8820,g85h1jri,Heart Best Research Paper Award 2020,2020,en
8821,amq8hw8k,COVID-19 critical illness in Sweden: character...,2020,en
...,...,...,...,...
361502,5xscl5o6,Reduced Hedonic Tone and Emotion Dysregulation...,2020,en
361503,hfjhg5de,358. Sociodemographic and clinical features of...,2020,en
361504,ko1pnree,The use of technology in the learning environm...,2020,en
361505,ib0l00lh,"Insomnia is associated with worry, cognitive a...",2020,en


In [None]:
# Function to generate shingles
def get_shingles(title, k=3):
    """Return the overlapping character k-grams (shingles) of ``title``.

    Parameters
    ----------
    title : str
        Text to shingle.
    k : int, default 3
        Shingle length in characters.

    Returns
    -------
    list of str
        Every substring ``title[i:i+k]`` in left-to-right order. A non-empty
        title shorter than ``k`` is returned as a single shingle, and an empty
        title yields an empty list.
    """
    if len(title) < k:
        # Without this fallback a short title contributes no shingles at all,
        # so every short title ends up with the same (empty) MinHash and is
        # reported as a duplicate of every other short title.
        return [title] if title else []
    return [title[i:i+k] for i in range(len(title) - k + 1)]

# Build the MinHash signature of a single title
def compute_minhash(title, k=3):
    """Return a MinHash fed with the UTF-8 bytes of each k-character shingle of ``title``.

    Parameters
    ----------
    title : str
        Text to fingerprint.
    k : int, default 3
        Shingle length forwarded to ``get_shingles``.
    """
    signature = MinHash()
    encoded_shingles = (shingle.encode('utf8') for shingle in get_shingles(title, k=k))
    for token in encoded_shingles:
        signature.update(token)
    return signature

# Attach one MinHash signature per title as a new 'minhash' column.
title_progress = tqdm(test['title'], desc="Computing MinHash", dynamic_ncols=True, position=0, leave=True)
test['minhash'] = [compute_minhash(title) for title in title_progress]

# LSH index over the signatures; 0.85 is the Jaccard similarity threshold.
lsh = MinHashLSH(threshold=0.85, num_perm=128)

# Register every signature under its stringified row label.
insert_progress = tqdm(test.iterrows(), desc="Inserting into LSH", total=test.shape[0], dynamic_ncols=True, position=0, leave=True)
for index, row in insert_progress:
    lsh.insert(str(index), row['minhash'])

In [None]:
# Map each article's row label to the number of OTHER articles that the LSH
# index reports as near-duplicates of it.
duplicate_counts = {}

for article_index, article_minhash in zip(test.index, test['minhash']):
    # Keys were inserted as str(index), so the query returns strings.
    neighbour_keys = set(lsh.query(article_minhash))

    # Exclude the article itself. The comparison has to happen on the string
    # key: a str key never equals the integer row label, so comparing the two
    # directly would count every article as its own duplicate.
    neighbour_keys.discard(str(article_index))

    # Store the number of duplicates for the article
    duplicate_counts[article_index] = len(neighbour_keys)

In [None]:
# Rank articles by near-duplicate count, highest first; the result is a list of
# (row label, count) pairs.
# dict(...) accepts both the original mapping and an already-ranked list of
# pairs, so re-running this cell no longer fails with
# "'list' object has no attribute 'items'".
duplicate_counts = sorted(dict(duplicate_counts).items(), key=lambda item: item[1], reverse=True)

# Show only the head of the ranking; the full list has one entry per article.
duplicate_counts[:10]

In [None]:
# Given article index
given_index = 54340 # Replace with your specific article index

# Fetch the MinHash of the article
article_minhash = test.loc[given_index, 'minhash']

# Query LSH for near-duplicates; keys come back as the stringified row labels
duplicates = set(lsh.query(article_minhash))

# Convert the keys to integer row labels and drop the inspected article itself.
# The comparison is done after int(): a str key never equals the integer label,
# so comparing them directly would list the article among its own duplicates.
# Sorting keeps the printed order stable between runs.
duplicates_indices = sorted(idx for idx in map(int, duplicates) if idx != given_index)

# Display the main article
print("Article:")
print(test.loc[given_index, 'title'])
print('-' * 80)

# Display its duplicates
print(f"{len(duplicates_indices)} Identified Duplicates:")
duplicate_articles = test.loc[duplicates_indices]
for duplicate_title in duplicate_articles['title']:
    print(duplicate_title)
print('=' * 80)

Article:
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 17, 2020
--------------------------------------------------------------------------------
32 Identified Duplicates:
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 22, 2020
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 13, 2020
Daily Situation Report on Coronavirus disease (COVID-19) in Iran;March 17, 2020
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 25, 2020.
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 22, 2020
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 15, 2020.
Daily Situation Report on Coronavirus disease (COVID-19) in Iran;March 25, 2020
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 17, 2020.
Daily Situation Report on Coronavirus disease (COVID-19) in Iran; March 25, 2020
Daily Situation Report on Coronavirus disease (COVID-19) in Iran;March 16

# Deduplicate each yearly DataFrame

In [None]:
# Select the dataframe for 2019
df_2019 = dfs[2019].copy()

# The two helpers are defined again here so this cell does not depend on the
# "Test Trial" cells having been run.
def get_shingles(title, k=3):
    """Return the overlapping character k-grams of ``title``.

    A non-empty title shorter than ``k`` is returned as a single shingle so it
    does not collapse to the same empty MinHash as every other short title.
    """
    if len(title) < k:
        return [title] if title else []
    return [title[i:i+k] for i in range(len(title) - k + 1)]

def compute_minhash(title, k=3):
    """Return a MinHash updated with the UTF-8 bytes of each shingle of ``title``."""
    m = MinHash()
    for s in get_shingles(title, k=k):
        m.update(s.encode('utf8'))
    return m

# Compute MinHash for each title and store it in a new column
df_2019.loc[:, 'minhash'] = [compute_minhash(title) for title in tqdm(df_2019['title'], desc="Computing MinHash")]

# LSH index that only ever contains the titles we decide to keep
lsh = MinHashLSH(threshold=0.85, num_perm=128)  # Threshold is Jaccard similarity threshold

# Deduplication, first occurrence wins: a title is a duplicate when the index
# already holds a near-identical title that was kept; otherwise it is kept and
# indexed. (Flagging every title with any neighbour, as a query against a fully
# populated index does, would drop all copies of a repeated title, including
# the one that should survive.)
duplicates = set()
for index, minhash in tqdm(zip(df_2019.index, df_2019['minhash']), desc="Deduplicating", total=df_2019.shape[0]):
    if lsh.query(minhash):
        duplicates.add(index)
    else:
        lsh.insert(str(index), minhash)

# Remove duplicates
df_deduplicated_2019 = df_2019.drop(index=list(duplicates))

df_deduplicated_2019


Computing MinHash:   0%|          | 0/8817 [00:00<?, ?it/s][A
Computing MinHash:   0%|          | 31/8817 [00:00<00:28, 309.75it/s][A
Computing MinHash:   1%|          | 64/8817 [00:00<00:27, 319.38it/s][A
Computing MinHash:   1%|          | 97/8817 [00:00<00:26, 323.51it/s][A
Computing MinHash:   1%|▏         | 131/8817 [00:00<00:26, 329.36it/s][A
Computing MinHash:   2%|▏         | 164/8817 [00:00<00:27, 314.24it/s][A
Computing MinHash:   2%|▏         | 198/8817 [00:00<00:26, 321.82it/s][A
Computing MinHash:   3%|▎         | 231/8817 [00:00<00:27, 315.90it/s][A
Computing MinHash:   3%|▎         | 263/8817 [00:00<00:27, 316.53it/s][A
Computing MinHash:   3%|▎         | 295/8817 [00:00<00:26, 316.40it/s][A
Computing MinHash:   4%|▎         | 327/8817 [00:01<00:26, 314.66it/s][A
Computing MinHash:   4%|▍         | 359/8817 [00:01<00:26, 315.35it/s][A
Computing MinHash:   4%|▍         | 391/8817 [00:01<00:26, 313.81it/s][A
Computing MinHash:   5%|▍         | 423/8817 [00:01

Unnamed: 0,cord_uid,title,publish_time,language,minhash
0,l2d0mwsr,Endovascular treatment of debilitating tinnitu...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
1,od684vu5,A Scoping Review of Validated Tools to Measure...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
2,3cnamu29,17α-Hydroxyprogesterone Caproate and the Risk ...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
3,jhs2u33l,The effects of exercise on lipid profile and b...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
4,zhta1788,Comparison of vocal cord view between neutral ...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
...,...,...,...,...,...
8812,bk7lete2,Genetic Diversity and Evolution of Viral Popul...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
8813,c1qqbhl8,Discovery and Prevalence of Divergent RNA Viru...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
8814,add9tk0m,Ventilator-Associated Events: Definitions and ...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...
8815,ejkgune0,Chapter 1 Introduction into nanotechnology and...,2019,en,<datasketch.minhash.MinHash object at 0x7cee2b...


In [None]:
# The 'minhash' column holds in-memory MinHash objects, which a CSV can only
# store as their object reprs (memory addresses); leave it out of the export.
# errors='ignore' keeps the cell working if the column has already been removed.
df_deduplicated_2019.drop(columns='minhash', errors='ignore').to_csv('/content/drive/MyDrive/310Project/deduplicated_dataset_2019.csv')

In [None]:
# Select the dataframe for 2020
df_2020 = dfs[2020].copy()

# The two helpers are defined again here so this cell does not depend on
# earlier cells having defined them.
def get_shingles(title, k=3):
    """Return the overlapping character k-grams of ``title``.

    A non-empty title shorter than ``k`` is returned as a single shingle so it
    does not collapse to the same empty MinHash as every other short title.
    """
    if len(title) < k:
        return [title] if title else []
    return [title[i:i+k] for i in range(len(title) - k + 1)]

def compute_minhash(title, k=3):
    """Return a MinHash updated with the UTF-8 bytes of each shingle of ``title``."""
    m = MinHash()
    for s in get_shingles(title, k=k):
        m.update(s.encode('utf8'))
    return m

# Compute MinHash for each title and store it in a new column
df_2020.loc[:, 'minhash'] = [compute_minhash(title) for title in tqdm(df_2020['title'], desc="Computing MinHash")]

# LSH index that only ever contains the titles we decide to keep
lsh = MinHashLSH(threshold=0.85, num_perm=128)  # Threshold is Jaccard similarity threshold

# Deduplication, first occurrence wins: a title is a duplicate when the index
# already holds a near-identical title that was kept; otherwise it is kept and
# indexed. (Flagging every title with any neighbour, as a query against a fully
# populated index does, would drop all copies of a repeated title, including
# the one that should survive.)
duplicates = set()
for index, minhash in tqdm(zip(df_2020.index, df_2020['minhash']), desc="Deduplicating", total=df_2020.shape[0], dynamic_ncols=True, position=0, leave=True):
    if lsh.query(minhash):
        duplicates.add(index)
    else:
        lsh.insert(str(index), minhash)

# Remove duplicates
df_deduplicated_2020 = df_2020.drop(index=list(duplicates))

df_deduplicated_2020


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Computing MinHash:  70%|██████▉   | 245483/352690 [18:08<08:33, 208.66it/s][A
Computing MinHash:  70%|██████▉   | 245504/352690 [18:08<09:02, 197.65it/s][A
Computing MinHash:  70%|██████▉   | 245524/352690 [18:08<09:18, 191.96it/s][A
Computing MinHash:  70%|██████▉   | 245544/352690 [18:08<09:22, 190.41it/s][A
Computing MinHash:  70%|██████▉   | 245565/352690 [18:08<09:10, 194.64it/s][A
Computing MinHash:  70%|██████▉   | 245585/352690 [18:08<09:12, 193.96it/s][A
Computing MinHash:  70%|██████▉   | 245605/352690 [18:08<09:36, 185.74it/s][A
Computing MinHash:  70%|██████▉   | 245624/352690 [18:08<09:59, 178.73it/s][A
Computing MinHash:  70%|██████▉   | 245644/352690 [18:09<09:44, 183.18it/s][A
Computing MinHash:  70%|██████▉   | 245668/352690 [18:09<09:02, 197.14it/s][A
Computing MinHash:  70%|██████▉   | 245690/352690 [18:09<08:50, 201.64it/s][A
Computing MinHash:  70%|██████▉   | 245712/352690 [18:09<08:42, 20

Unnamed: 0,cord_uid,title,publish_time,language,minhash
8819,6aj02tw1,Cross-border capital flows in Russia: Prospect...,2020,en,<datasketch.minhash.MinHash object at 0x7cee49...
8821,amq8hw8k,COVID-19 critical illness in Sweden: character...,2020,en,<datasketch.minhash.MinHash object at 0x7cee49...
8831,b94m1ssf,The likely economic impact of fewer elective s...,2020,en,<datasketch.minhash.MinHash object at 0x7cee49...
8852,hjvd72rc,Developing L2 productive language skills onlin...,2020,en,<datasketch.minhash.MinHash object at 0x7cee49...
8853,h47qh6sp,[Mental health in epidemics: A perspective fro...,2020,en,<datasketch.minhash.MinHash object at 0x7cee49...
...,...,...,...,...,...
361498,aem15g84,1 Tropical Lung Diseases,2020,en,<datasketch.minhash.MinHash object at 0x7cee21...
361500,uf74dpqa,Detection and identification of COVID -19 base...,2020,en,<datasketch.minhash.MinHash object at 0x7cee21...
361501,cg7bhk1s,Mimicking the Mammalian Plasma Membrane: An Ov...,2020,en,<datasketch.minhash.MinHash object at 0x7cee21...
361504,ko1pnree,The use of technology in the learning environm...,2020,en,<datasketch.minhash.MinHash object at 0x7cee21...


In [None]:
# The 'minhash' column holds in-memory MinHash objects, which a CSV can only
# store as their object reprs (memory addresses); leave it out of the export.
# errors='ignore' keeps the cell working if the column has already been removed.
df_deduplicated_2020.drop(columns='minhash', errors='ignore').to_csv('/content/drive/MyDrive/310Project/deduplicated_dataset_2020.csv')

In [None]:
# Select the dataframe for 2021
df_2021 = dfs[2021].copy()

# The two helpers are defined again here so this cell does not depend on
# earlier cells having defined them.
def get_shingles(title, k=3):
    """Return the overlapping character k-grams of ``title``.

    A non-empty title shorter than ``k`` is returned as a single shingle so it
    does not collapse to the same empty MinHash as every other short title.
    """
    if len(title) < k:
        return [title] if title else []
    return [title[i:i+k] for i in range(len(title) - k + 1)]

def compute_minhash(title, k=3):
    """Return a MinHash updated with the UTF-8 bytes of each shingle of ``title``."""
    m = MinHash()
    for s in get_shingles(title, k=k):
        m.update(s.encode('utf8'))
    return m

# Compute MinHash for each title and store it in a new column
df_2021.loc[:, 'minhash'] = [compute_minhash(title) for title in tqdm(df_2021['title'], desc="Computing MinHash")]

# LSH index that only ever contains the titles we decide to keep
lsh = MinHashLSH(threshold=0.85, num_perm=128)  # Threshold is Jaccard similarity threshold

# Deduplication, first occurrence wins: a title is a duplicate when the index
# already holds a near-identical title that was kept; otherwise it is kept and
# indexed. (Flagging every title with any neighbour, as a query against a fully
# populated index does, would drop all copies of a repeated title, including
# the one that should survive.)
duplicates = set()
for index, minhash in tqdm(zip(df_2021.index, df_2021['minhash']), desc="Deduplicating", total=df_2021.shape[0], dynamic_ncols=True, position=0, leave=True):
    if lsh.query(minhash):
        duplicates.add(index)
    else:
        lsh.insert(str(index), minhash)

# Remove duplicates
df_deduplicated_2021 = df_2021.drop(index=list(duplicates))

df_deduplicated_2021

Computing MinHash: 100%|██████████| 434565/434565 [25:42<00:00, 281.79it/s]
Inserting into LSH: 100%|██████████| 434565/434565 [01:10<00:00, 6171.72it/s]
Deduplicating: 100%|██████████| 434565/434565 [00:49<00:00, 8840.81it/s]


Unnamed: 0,cord_uid,title,publish_time,language,minhash
361510,4aic8967,The ABC of compassionate leadership – improvin...,2021,en,<datasketch.minhash.MinHash object at 0x78b755...
361516,z9jk3kpp,Estimation of Excess Mortality Resulting from ...,2021,en,<datasketch.minhash.MinHash object at 0x78b755...
361518,pxpeblza,Creating a Digital Bridge: Lessons and Policy ...,2021,en,<datasketch.minhash.MinHash object at 0x78b755...
361519,31wl6tv9,Escape Room Dual Mode Approach to Teach Maths ...,2021,en,<datasketch.minhash.MinHash object at 0x78b755...
361521,mz4f4ymc,The Heritability of Trust and Trustworthiness ...,2021,en,<datasketch.minhash.MinHash object at 0x78b755...
...,...,...,...,...,...
796067,0qb5qxr6,Data-driven prediction of antiviral peptides b...,2021,en,<datasketch.minhash.MinHash object at 0x78b738...
796068,fcotl7jr,Cardiac Device Implantations During COVID-19 P...,2021,en,<datasketch.minhash.MinHash object at 0x78b738...
796069,ajhzghie,Hypervirulent FAdV-4 infection induces activat...,2021,en,<datasketch.minhash.MinHash object at 0x78b738...
796070,3glk6b8d,Chapter Thirteen Ten statements for simplifyin...,2021,en,<datasketch.minhash.MinHash object at 0x78b738...


In [None]:
# The 'minhash' column holds in-memory MinHash objects, which a CSV can only
# store as their object reprs (memory addresses); leave it out of the export.
# errors='ignore' keeps the cell working if the column has already been removed.
df_deduplicated_2021.drop(columns='minhash', errors='ignore').to_csv('/content/drive/MyDrive/310Project/deduplicated_dataset_2021.csv')

In [None]:
# Select the dataframe for 2022
df_2022 = dfs[2022].copy()

# The two helpers are defined again here so this cell does not depend on
# earlier cells having defined them.
def get_shingles(title, k=3):
    """Return the overlapping character k-grams (shingles) of ``title``.

    A non-empty title shorter than ``k`` is returned as a single shingle so it
    does not collapse to the same empty MinHash as every other short title.
    """
    if len(title) < k:
        return [title] if title else []
    return [title[i:i+k] for i in range(len(title) - k + 1)]

def compute_minhash(title, k=3):
    """Return a MinHash updated with the UTF-8 bytes of each shingle of ``title``."""
    m = MinHash()
    for s in get_shingles(title, k=k):
        m.update(s.encode('utf8'))
    return m

# Compute MinHash for each title and store it in a new column
df_2022.loc[:, 'minhash'] = [compute_minhash(title) for title in tqdm(df_2022['title'], desc="Computing MinHash")]

# LSH index that only ever contains the titles we decide to keep
lsh_2022 = MinHashLSH(threshold=0.85, num_perm=128)

# Deduplication, first occurrence wins: a title is a duplicate when the index
# already holds a near-identical title that was kept; otherwise it is kept and
# indexed. (Flagging every title with any neighbour, as a query against a fully
# populated index does, would drop all copies of a repeated title, including
# the one that should survive.)
duplicates_2022 = set()
for index, minhash in tqdm(zip(df_2022.index, df_2022['minhash']), desc="Deduplicating for 2022", total=df_2022.shape[0]):
    if lsh_2022.query(minhash):
        duplicates_2022.add(index)
    else:
        lsh_2022.insert(str(index), minhash)

# Remove duplicates
df_deduplicated_2022 = df_2022.drop(index=list(duplicates_2022))

df_deduplicated_2022

Computing MinHash: 100%|██████████████| 123924/123924 [01:38<00:00, 1252.16it/s]
Inserting into LSH for 2022: 100%|███| 123924/123924 [00:04<00:00, 30588.21it/s]
Deduplicating for 2022: 100%|████████| 123924/123924 [00:02<00:00, 45808.18it/s]


Unnamed: 0,cord_uid,title,publish_time,language,minhash
796072,enixg3oa,Covid-19: Antibodies after AstraZeneca and Pfi...,2022,en,<datasketch.minhash.MinHash object at 0x3b7953...
796073,rylf5lfe,Cholesterol crystals and their implications in...,2022,en,<datasketch.minhash.MinHash object at 0x31689e...
796075,fucgqg95,Liver injury and cytopenia after BNT162b2 COVI...,2022,en,<datasketch.minhash.MinHash object at 0x31689e...
796076,zkocmtj5,Vibrational Characterization and Molecular Ele...,2022,en,<datasketch.minhash.MinHash object at 0x31689f...
796077,x2yrf3dr,HIV Patients’ Tracer for Clinical Assistance a...,2022,en,<datasketch.minhash.MinHash object at 0x31689d...
...,...,...,...,...,...
919985,j830sk6k,Causal Analysis of Impact Factors of COVID-19 ...,2022,en,<datasketch.minhash.MinHash object at 0x3e2522...
919988,bs206r15,COVID-19 Time Series Forecasting – Twenty Days...,2022,en,<datasketch.minhash.MinHash object at 0x3e2522...
919989,xmwcf7ry,Shipping and Transportation Traffic of Medical...,2022,en,<datasketch.minhash.MinHash object at 0x3e2522...
919992,ma9xs6wa,Chapter 10 An optimized CNN based automated CO...,2022,en,<datasketch.minhash.MinHash object at 0x3e2522...


In [None]:
# The 'minhash' column holds in-memory MinHash objects, which a CSV can only
# store as their object reprs (memory addresses); leave it out of the export.
# errors='ignore' keeps the cell working if the column has already been removed.
# NOTE(review): unlike the 2019-2021 exports this writes to the current working
# directory rather than the Drive project folder — confirm that is intended.
df_deduplicated_2022.drop(columns='minhash', errors='ignore').to_csv('deduplicated_dataset_2022.csv')