In [61]:
# Imports
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import requests
from am_sma_data_cleaning.utils import remove_spammers
import glob
import re
from tqdm.notebook import tqdm
import unicodedata
import re

### Helper Functions

In [62]:
def show_similar_posts(df, column_name, similarity_threshold=90):
    """
    Return a DataFrame listing all pairs of rows in `df` whose text in `column_name`
    are similar above the given similarity_threshold.
    
    :param df: DataFrame containing the text column of interest.
    :param column_name: Name of the column to compare (e.g. "Normalized").
    :param similarity_threshold: Fuzzy matching threshold (default=90).
    :return: A DataFrame listing the indices, the texts, and the similarity ratio.
    """
    similar_pairs = []
    
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            text_i = df.iloc[i][column_name]
            text_j = df.iloc[j][column_name]
            
            # Compare the two texts using fuzzy ratio
            ratio = fuzz.ratio(text_i, text_j)
            
            if ratio >= similarity_threshold:
                similar_pairs.append({
                    'Index_1': df.index[i],
                    'Text_1': text_i,
                    'Index_2': df.index[j],
                    'Text_2': text_j,
                    'SimilarityRatio': ratio
                })

    return pd.DataFrame(similar_pairs)

def normalize_post(text):
    """
    Convert the text to lowercase and remove URLs, hashtags, mentions, punctuation,
    and extra whitespace.
    """
    if not isinstance(text, str):
        # Convert non-string (like float NaN) to an empty string
        text = ""
    text = text.lower()
    text = re.sub(r'http\S+', '', text)   # Remove URLs
    #text = re.sub(r'[@#]\S+', '', text)     # Remove hashtags and mentions
    #text = re.sub(r'[^\w\s]', '', text)     # Remove punctuation
    return text.strip()


def tokenize(text):
    """
    Simple word-based tokenization.
    You could also use n-grams for more robust matching.
    """
    return text.split()


def create_minhash(tokens, num_perm=128):
    """
    Builds a MinHash for a list of tokens.
    num_perm: number of hash permutations (the length of the signature)
    """
    m = MinHash(num_perm=num_perm)
    for t in tokens:
        # Convert token to bytes for MinHash
        m.update(t.encode('utf8'))
    return m




### Read All Data

In [79]:
# Use glob to find all CSV files recursively under the given directory
all_csv_files = glob.glob(
    "/home/shola/am_sma_sample_data/**/*.csv",
    recursive=True
)
# Read each CSV file and store in a list
df_list = [pd.read_csv(csv_file) for csv_file in all_csv_files]

# Concatenate all into one DataFrame
df = pd.concat(df_list, ignore_index=True)

# Drop views since its not tracked by tweeter anymore 
df = df.drop(columns="Views")

# Add column for total engagement
df["Total Engagement"] = (
    df["Replies"] + df["Likes"] + df["Reshares"]
)
df

Unnamed: 0,Author Name,Author ID,Date,Post Text,Replies,Likes,Reshares,Total Engagement
0,Lynn Allen,@Lynn_Allen,2017-03-30T22:20:38.000Z,A Traditional Lace Design Results in a New 3D Printed Breast Reconstruction Method http://,0,0,0,0
1,sipp,@SippSippin,2017-03-30T23:35:09.000Z,"Executive Director of MBUS, John Stuart, Attended 3D Printing Conference in Washington, D.C. http:// ctor-of-mbus-john-stuart-attended-3d-printing-conference-in-washington-d-c/ …",0,0,0,0
2,Harry Strauss,@HarryStrauss,2017-03-30T21:30:16.000Z,Would you drive a 3D printed car? Learn about the rise of 3D printing in the automotive tech industry: http://,0,0,0,0
3,David Papp,@DavidPapp,2017-03-30T20:54:00.000Z,World's first robot-operated 3D printing factory is the future of manufacturing http://,0,0,0,0
4,DentistryIQ,@DentistryIQ,2017-03-31T00:52:02.000Z,#IDS2017 : #Dentistry 's wait for viable 3D printing is over (VIDEO) http:// #dental #3Dprinting,0,2,2,4
...,...,...,...,...,...,...,...,...
1609165,All3DP,@All3DP,2019-05-13T08:01:05.000Z,"Warping may be cool in Star Trek, but in 3D printing , not so much... How to Prevent ABS from Warping on a Heated Bed https:// nt-abs-from-warping-on-a-heated-bed/ … #3Dprinting",0,3,0,3
1609166,Kim Snowden,@KimSnowden12,2019-05-13T04:57:15.000Z,if anyone wants to know how much of a geek I am - I got my favourite plant hormone (strigolactone) 3D printed so I can wear it round my neck.,3,20,2,25
1609167,Matcom,@Matcom1976,2019-05-13T13:30:40.000Z,"“ 3D printing is clearly an attempt to...allow the industry to gain, on one hand, a lot more productivity, but also give it the opportunity to go to new frontiers.” \n\nSee how a #3DPrinting is changing the future of construction. https:// g-shape-3d-printing-revolution-on-the-horizon/ …",0,0,0,0
1609168,Seatrade Maritime Events,@MaritimeIC,2019-05-13T09:08:00.000Z,"[WATCH] World's first ever class certified, 3D-printed crane hooks installed -> http:// #Maritime #Offshore #Innovation #3DPrinting #Technology",0,0,0,0


### Handle Spammers
Tweets of known authors whose posts/tweets are mostly either adverts, give-aways or outright-spams

In [64]:
## Review Spammer List
spammer_list = ["@zeppy_3dprint", "@bubbleistrouble"]

### Normalization

In [81]:
pd.set_option('display.max_colwidth', None)
df_selected = df[["Author ID", "Post Text", "Total Engagement"]]
df_selected["Normalized Text"] = df_selected["Post Text"].apply(normalize_post)
df_selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["Normalized Text"] = df_selected["Post Text"].apply(normalize_post)


Unnamed: 0,Author ID,Post Text,Total Engagement,Normalized Text
0,@Lynn_Allen,A Traditional Lace Design Results in a New 3D Printed Breast Reconstruction Method http://,0,a traditional lace design results in a new 3d printed breast reconstruction method
1,@SippSippin,"Executive Director of MBUS, John Stuart, Attended 3D Printing Conference in Washington, D.C. http:// ctor-of-mbus-john-stuart-attended-3d-printing-conference-in-washington-d-c/ …",0,"executive director of mbus, john stuart, attended 3d printing conference in washington, d.c. ctor-of-mbus-john-stuart-attended-3d-printing-conference-in-washington-d-c/ …"
2,@HarryStrauss,Would you drive a 3D printed car? Learn about the rise of 3D printing in the automotive tech industry: http://,0,would you drive a 3d printed car? learn about the rise of 3d printing in the automotive tech industry:
3,@DavidPapp,World's first robot-operated 3D printing factory is the future of manufacturing http://,0,world's first robot-operated 3d printing factory is the future of manufacturing
4,@DentistryIQ,#IDS2017 : #Dentistry 's wait for viable 3D printing is over (VIDEO) http:// #dental #3Dprinting,4,#ids2017 : #dentistry 's wait for viable 3d printing is over (video) #dental #3dprinting
...,...,...,...,...
1609165,@All3DP,"Warping may be cool in Star Trek, but in 3D printing , not so much... How to Prevent ABS from Warping on a Heated Bed https:// nt-abs-from-warping-on-a-heated-bed/ … #3Dprinting",3,"warping may be cool in star trek, but in 3d printing , not so much... how to prevent abs from warping on a heated bed nt-abs-from-warping-on-a-heated-bed/ … #3dprinting"
1609166,@KimSnowden12,if anyone wants to know how much of a geek I am - I got my favourite plant hormone (strigolactone) 3D printed so I can wear it round my neck.,25,if anyone wants to know how much of a geek i am - i got my favourite plant hormone (strigolactone) 3d printed so i can wear it round my neck.
1609167,@Matcom1976,"“ 3D printing is clearly an attempt to...allow the industry to gain, on one hand, a lot more productivity, but also give it the opportunity to go to new frontiers.” \n\nSee how a #3DPrinting is changing the future of construction. https:// g-shape-3d-printing-revolution-on-the-horizon/ …",0,"“ 3d printing is clearly an attempt to...allow the industry to gain, on one hand, a lot more productivity, but also give it the opportunity to go to new frontiers.” \n\nsee how a #3dprinting is changing the future of construction. g-shape-3d-printing-revolution-on-the-horizon/ …"
1609168,@MaritimeIC,"[WATCH] World's first ever class certified, 3D-printed crane hooks installed -> http:// #Maritime #Offshore #Innovation #3DPrinting #Technology",0,"[watch] world's first ever class certified, 3d-printed crane hooks installed -> #maritime #offshore #innovation #3dprinting #technology"


In [82]:
# Drop the intermediate column "Post Text" and keep its normalized version
df_selected = df_selected.drop(columns="Post Text")

### Remove Empty Tweets/Posts
This could have happened if the author only posted a picture or video without text

In [83]:
df_selected = df_selected.sort_values(by="Normalized Text")
mask_empty = df_selected["Normalized Text"].str.strip().eq("")
df_empty = df_selected[mask_empty]
df_empty

Unnamed: 0,Author ID,Total Engagement,Normalized Text
1106909,@stilson41,0,
1037791,@Mins3dprinter,0,
102359,@Mins3dprinter,0,
1041479,@3D_Protothai,0,
139975,@JayEfikeco,0,
...,...,...,...
498147,@3D_PrintingTech,0,
357182,@3dprinting7,0,
1418352,@JERRY_3DHP,0,
465116,@3DCola,0,


In [84]:
# Keep only rows where "Normalized Text" column is not empty string
df_selected = df_selected[~mask_empty]
df_selected

Unnamed: 0,Author ID,Total Engagement,Normalized Text
1300623,@8BitoffunX,2,!
281626,@tannahillglen,0,! 3d printed brain beer bottle opener \nby 1nezer0 via @etsy
1277797,@walterrhett,0,! @gcobbhunter the world's first 3d printed village. homes are built in 24 hours! a solution to homelessness and affordable housing!
1333167,@canadada,3,! > a mini 3d-printed #press brings #printmaking to the #people ress-project/ …
1273025,@johnedgarpark,7,! card full\nhazard of doing 3d printing time lapses on a card that’s already full of video you’ve been meaning to delete.
...,...,...,...
792193,@ICEEfest,0,"󾬶 scott summit, designer and researcher, stanford university, talking @ #iceefest about 3d printing , innovation..."
200909,@Simply_Arcel,0,"󾭚new arrival korean 3d print 3/4 batwing sleeve blouse chiffon fabric free size, loose size fits up to xl\n\nonline sale: p250 only !!"
697993,@dEngzHue,0,"󾮜 @p280 only !!\n󾬏new arrival u.s. style 3d print silk combined cotton blouse󾬏\n󾔏cotton fabric, right thickness,..."
60538,@dEngzHue,0,"󾮜@p380 only !!\n󾬏new arrival u.s. style 3d print chiffon batwing sleeve polo shirt󾬏\n󾔏chiffon fabric, right..."


### Tweets with No Alphabetic Content

In [86]:
mask_only_punct = df_selected["Normalized Text"].str.strip().apply(
    lambda x: bool(re.match(r'^[^\w]+$', x))  # "start ^, one or more non-alphanumeric chars, end $"
)

# This finds rows where 'Normalized Text' is purely punctuation/symbols 
df_only_punct = df_selected[mask_only_punct]

df_only_punct

Unnamed: 0,Author ID,Total Engagement,Normalized Text
1300623,@8BitoffunX,2,!
202944,@joeltelling,10,!!!
399693,@joeltelling,21,!!!
295639,@ClifWilson1,0,!!!!
1246499,@3DPrintRockets,1,#
1186240,@8BitoffunX,0,$
1513062,@Reksaurian,0,(:
870819,@Puchiluh,14,(?)
1118024,@Puchiluh,8,(?).
724492,@Reksaurian,0,-


In [87]:
# Remove those rows by inverting the mask with '~'
df_selected = df_selected[~mask_only_punct]
df_selected

Unnamed: 0,Author ID,Total Engagement,Normalized Text
281626,@tannahillglen,0,! 3d printed brain beer bottle opener \nby 1nezer0 via @etsy
1277797,@walterrhett,0,! @gcobbhunter the world's first 3d printed village. homes are built in 24 hours! a solution to homelessness and affordable housing!
1333167,@canadada,3,! > a mini 3d-printed #press brings #printmaking to the #people ress-project/ …
1273025,@johnedgarpark,7,! card full\nhazard of doing 3d printing time lapses on a card that’s already full of video you’ve been meaning to delete.
1514023,@Daniel21891,0,"! i'm not mistaken here, your project provides a lot of experience for us, which you call 3d manufacturing or additive manufacturing . very good. thank you. #erecoin # 3dprinting"
...,...,...,...
792193,@ICEEfest,0,"󾬶 scott summit, designer and researcher, stanford university, talking @ #iceefest about 3d printing , innovation..."
200909,@Simply_Arcel,0,"󾭚new arrival korean 3d print 3/4 batwing sleeve blouse chiffon fabric free size, loose size fits up to xl\n\nonline sale: p250 only !!"
697993,@dEngzHue,0,"󾮜 @p280 only !!\n󾬏new arrival u.s. style 3d print silk combined cotton blouse󾬏\n󾔏cotton fabric, right thickness,..."
60538,@dEngzHue,0,"󾮜@p380 only !!\n󾬏new arrival u.s. style 3d print chiffon batwing sleeve polo shirt󾬏\n󾔏chiffon fabric, right..."


### Non English/non-ASCII Tweets

In [88]:
def ascii_normalize_text(text):
    # Convert any non-ASCII fullwidth forms into ASCII equivalents
    # e.g., "ｔ" -> "t", "　" -> " ", etc.
    return unicodedata.normalize("NFKC", str(text))

# 1) Apply the normalization function to your "Normalized Text" column
df_selected["Normalized Text"] = df_selected["Normalized Text"].apply(ascii_normalize_text)

# 2) Create a mask for rows that contain no [a-zA-Z] after normalization
mask_no_letters = df_selected["Normalized Text"].str.strip().apply(
    lambda x: not bool(re.search(r'[a-zA-Z]', x))
)

# 3) Filter them out (or just inspect them)
df_no_letters = df_selected[mask_no_letters]

df_no_letters


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["Normalized Text"] = df_selected["Normalized Text"].apply(ascii_normalize_text)


Unnamed: 0,Author ID,Total Engagement,Normalized Text
957457,@3DPrintReview,0,"""делай грязь"" большой дельта принтер способен и на это)))"
851313,@3dprinter3dison,0,"""로킷 주주분이 주신 격려의 글"" \n\n아무도? 누구나! 가지않는 길 ...... 길이 없는 곳에는 길을 만들고 암흑같은 미로를 오감과 오각으로 벗어나는 의지와 도전은 인간이..."
1255486,@3dprintexpo,0,#что_напечатать цилиндрическая текстурированная коробка.\n\nавтор: скачать:
1343427,@3dprintexpo,0,#что_напечатать необычная подставка для карандашей и ручек.\nсебе на стол радовать глаз или же в подарок!\nкак вам идея? ______________________\nавтор: скачать:
1315803,@3D_Printer_Man,0,#ادعم_قرارات_الاهلي
...,...,...,...
1445214,@MitchellandSon1,0,헤드폰 스탠드
437945,@3dprinter3dison,0,호주 시드니 대학의 바이오교수가 로킷을 방문하여 로킷 인비보 바이오프린터와 초코렛 프린터에 대한 호감과 멋진후기를 올려 주셨네요! 강한 구매 희망 사항도 피력 했네요! 이렇게 우수한제품은 하나 하나 퍼져...
1084269,@3dprinter3dison,0,"화장품개발 할때 1년에 약1억만마리의 동물이 희생 되고 있습니다. 유럽등 선진국은 21014년, 한국은 2017년 2월부터 화장품 개발 동물시험은 금지 되었습니다. 이러한 대안으로 급부상하는 것이 바이오..."
890799,@3DPrintingCorp,0,2日目が無事に始まりました!\nワークショップは満席!󾬱\nジュエリー業界向けの説明会も行っております。


In [90]:
# Filter out non-ascii tweets/posts
df_selected = df_selected[~mask_no_letters]
df_selected

  df_selected = df_selected[~mask_no_letters]


Unnamed: 0,Author ID,Total Engagement,Normalized Text
281626,@tannahillglen,0,! 3d printed brain beer bottle opener \nby 1nezer0 via @etsy
1277797,@walterrhett,0,! @gcobbhunter the world's first 3d printed village. homes are built in 24 hours! a solution to homelessness and affordable housing!
1333167,@canadada,3,! > a mini 3d-printed #press brings #printmaking to the #people ress-project/ ...
1273025,@johnedgarpark,7,! card full\nhazard of doing 3d printing time lapses on a card that’s already full of video you’ve been meaning to delete.
1514023,@Daniel21891,0,"! i'm not mistaken here, your project provides a lot of experience for us, which you call 3d manufacturing or additive manufacturing . very good. thank you. #erecoin # 3dprinting"
...,...,...,...
792193,@ICEEfest,0,"󾬶 scott summit, designer and researcher, stanford university, talking @ #iceefest about 3d printing , innovation..."
200909,@Simply_Arcel,0,"󾭚new arrival korean 3d print 3/4 batwing sleeve blouse chiffon fabric free size, loose size fits up to xl\n\nonline sale: p250 only !!"
697993,@dEngzHue,0,"󾮜 @p280 only !!\n󾬏new arrival u.s. style 3d print silk combined cotton blouse󾬏\n󾔏cotton fabric, right thickness,..."
60538,@dEngzHue,0,"󾮜@p380 only !!\n󾬏new arrival u.s. style 3d print chiffon batwing sleeve polo shirt󾬏\n󾔏chiffon fabric, right..."


### Filtering Out Tweets That Contain No Alphabetic Characters (Unicode-Aware)
detecting rows with zero letters in any language, thanks to the \p{L} approach.

In [91]:
import regex

pattern = r'[\p{L}]'  # \p{L} matches any kind of letter from any language

mask_no_letters_v2 = df_selected["Normalized Text"].str.strip().apply(
    lambda x: not bool(regex.search(pattern, x))  # Use regex.search, not re.search
)

df_no_letters = df_selected[mask_no_letters_v2]
df_no_letters

Unnamed: 0,Author ID,Total Engagement,Normalized Text


In [92]:
df_selected = df_selected[~mask_no_letters_v2]
df_selected

Unnamed: 0,Author ID,Total Engagement,Normalized Text
281626,@tannahillglen,0,! 3d printed brain beer bottle opener \nby 1nezer0 via @etsy
1277797,@walterrhett,0,! @gcobbhunter the world's first 3d printed village. homes are built in 24 hours! a solution to homelessness and affordable housing!
1333167,@canadada,3,! > a mini 3d-printed #press brings #printmaking to the #people ress-project/ ...
1273025,@johnedgarpark,7,! card full\nhazard of doing 3d printing time lapses on a card that’s already full of video you’ve been meaning to delete.
1514023,@Daniel21891,0,"! i'm not mistaken here, your project provides a lot of experience for us, which you call 3d manufacturing or additive manufacturing . very good. thank you. #erecoin # 3dprinting"
...,...,...,...
792193,@ICEEfest,0,"󾬶 scott summit, designer and researcher, stanford university, talking @ #iceefest about 3d printing , innovation..."
200909,@Simply_Arcel,0,"󾭚new arrival korean 3d print 3/4 batwing sleeve blouse chiffon fabric free size, loose size fits up to xl\n\nonline sale: p250 only !!"
697993,@dEngzHue,0,"󾮜 @p280 only !!\n󾬏new arrival u.s. style 3d print silk combined cotton blouse󾬏\n󾔏cotton fabric, right thickness,..."
60538,@dEngzHue,0,"󾮜@p380 only !!\n󾬏new arrival u.s. style 3d print chiffon batwing sleeve polo shirt󾬏\n󾔏chiffon fabric, right..."


In [93]:
# Some sanity check
df_lynn = df_selected[df_selected["Author ID"] == "@Lynn_Allen"]
df_lynn

Unnamed: 0,Author ID,Total Engagement,Normalized Text
101723,@Lynn_Allen,2,. @autodesk project dreamcatcher wins @maschinenmarkt best of industry award 2017 for additive manufacturing :
620878,@Lynn_Allen,4,". @cerevo_orbitrec wants to revolutionize the cycling industry with its 3d-printed , iot-connected bike."
1104890,@Lynn_Allen,0,.@conceptlaserinc is using metal additive manufacturing to keep a wwii-era fighter jet flying high.
1042407,@Lynn_Allen,8,"3d printing is changing construction industry, but are the buildings of tomorrow here yet?"
423776,@Lynn_Allen,3,3d printing is changing the way doctors approach life-saving heart operations.
0,@Lynn_Allen,0,a traditional lace design results in a new 3d printed breast reconstruction method
176420,@Lynn_Allen,9,"design camp teaches girls fashion, engineering and 3d printing #stem @autodesk @kirakira3d mp-011063 ..."
1021238,@Lynn_Allen,12,enable community foundation gets personal with 3d-printed prosthetics @engineeringcom
925946,@Lynn_Allen,6,"grab your tissue, this inventive dad designed & built a 3d printed , body powered, hydraulic prosthetics for his..."
427788,@Lynn_Allen,10,here are some of the things you could do in education today with 3d printing .


### Drop Pure Duplicates (!!!! This need to be engagement based)

In [51]:
# Create a DataFrame of the duplicate rows that would be dropped
duplicates = df_selected[df_selected.duplicated(subset=["Normalized Text"], keep="first")]
print("Rows that would be dropped based on 'Normalized Text' duplicates:")
duplicates

Rows that would be dropped based on 'Normalized Text' duplicates:


Unnamed: 0,Author ID,Normalized Text
331245,@AlShuryan,""" news about 3d printing "" printing-alfred-shuryan ... by @alshuryan on @linkedin"
1109645,@My3DPrinting,""" #3dprinting and personalized healthcare is here to stay"" - anatomics discusses #3dprinted implant, regulations cs-3dp-sternum-interview/ ..."
435819,@fastfuture,""" #ge makes $1.4b bet on #3dprinting , acquires two firms to boost additive #manufacturing """
288143,@Step_Holt,""" #nasa testing a 3d-printed #rocket #engine "" #space #cosmos #universe"
148371,@Step_Holt,""" #nasa testing a 3d-printed #rocket #engine "" #space #cosmos #universe"
...,...,...
258080,@newswire_nin,적층 가공 전략 컨퍼런스 ‘ additive manufacturing strategies 2018’ 개최
232761,@r3d2solid,"1円~6.新品 3d systems 3dプリンター cube 3d printer gen3 grey 391100 3dプリンター 企業に1台 子... 現在価格:¥2,268円 #ヤフオク !"
1305687,@jeffmacharyas,"Rappahannock Community College\nglenns, gloucester county, virginia\n\nrising 7, 8 & 9 grade students:\nsaturday stem academy\nlearn basics of engineering design and 3d printing ! #3dprinting #engineering \njune 8, 2019\n9am – 12:30pm"
162451,@lhaiidelaila,"󾬏new arrival korean 3d print 3/4 batwing sleeve blouse @290 󾔏chiffon fabric, right thickness, soft touch,..."


In [94]:
# 2. Sort by descending total engagement
df_selected = df_selected.sort_values("Total Engagement", ascending=False)

# 3. Drop duplicates while keeping the rows with the highest engagement
df_selected = df_selected.drop_duplicates(subset=["Normalized Text"], keep="first")
df_selected

Unnamed: 0,Author ID,Total Engagement,Normalized Text
516701,@Bill_Gross,439944,"in the ""i'm getting old"" department.., a kid saw this and said, ""oh, you 3d-printed the 'save' icon."""
1560709,@rustbeltlady,383384,who gave my little brother a 3d printer
1304272,@rveenewman,213595,a 3d printed light projected animation. proof that there's always new ways to animate everything. #3dprint #animation
1173577,@faveslooks,187701,"jourdan dunn wore a zac posen gown to the met gala. weighing in at 30 pounds, this dress took more than 1,100 hours to 3d print ."
1272675,@ThamKhaiMeng,128878,this arrow by mathematician and sculptor kokichi sugihara can't point left. here's how it works: it's 3d-printed with a bunch of curves our brains don't register.
...,...,...,...
850473,@FantasticDeals6,0,cr-10 3d printer on sale at
1408590,@MaleAlien,0,cr-10 3d printer im coming for you no cap!
682117,@GodElectronicG,0,cr-10 3d printer high precision aluminum pre-assembled large print creality twt/?item=192358497534 ...
439146,@lamurdiparasian,0,cr-10 3d printer giveaway winners announced! 3d-printer-giveaway-winners.html ...


In [95]:
# # Then drop them in the original DataFrame:
# df_selected = df_selected.drop_duplicates(subset=["Normalized Text"], keep="first")
# df_selected

One of the prominent challenges in managing large-scale social media datasets, particularly those derived from platforms like Twitter, is the prevalence of near-duplicate content. Due to the nature of online discourse—where users often retweet, quote tweets, or slightly modify existing tweets to repost them—the dataset can become saturated with near-identical textual entries. This redundancy not only inflates dataset size and impacts computational efficiency but can also introduce bias into downstream analyses such as sentiment classification and topic modeling. For example, a series of tweets that differ only by minor variations (e.g., additional punctuation, URLs, or short appended phrases) can skew model training or frequency counts, thus undermining the validity of the findings.

To address these issues, a robust near-duplicate detection strategy is required, one capable of recognizing similarities between texts without relying on a prohibitively expensive pairwise comparison of every tweet. Here, the MinHash-based Locality-Sensitive Hashing (LSH) approach provides a scalable and effective solution. Unlike exact string matching, MinHash operates on a tokenized set representation of the text, converting each tweet into a collection of words or n-grams. It then computes a “signature” that approximates the textual overlap—measured by the Jaccard similarity—between any two sets of tokens. By grouping signatures that appear similar in a computationally efficient way, LSH drastically reduces the number of pairwise comparisons needed, thus making it feasible to identify redundancies within extensive datasets that may contain hundreds of thousands, or even millions, of tweets. This approach not only preserves the essential semantic information within the text but also accommodates minor textual variations, allowing researchers to confidently prune near-duplicates and ensure that their analyses are neither skewed nor computationally overwhelmed by repetitive data.


Install datasketch first:
pip install datasketch
LSH will drastically reduce comparisons compared to an 
𝑂(𝑛2)O(n2) brute force fuzzy-match. However, it still can be memory-intensive with hundreds of thousands of tweets, so make sure you have enough memory and consider chunking or more advanced setups if needed.

This approach approximates Jaccard-based similarity via MinHash. For a final “exact” check, you might want to do a secondary pass on the returned candidate pairs (e.g., using fuzzy matching or direct Jaccard on the token sets).

1. Reading multiple CSV files and concatenating them into a single DataFrame.

2. Preprocessing and tokenizing each tweet.

3. Creating MinHash signatures for each tweet.

4. Building and using an LSH index to group near-duplicates efficiently.

5. Collecting near-duplicate pairs in a final DataFrame for further analysis (e.g., deciding which one to keep).

In [96]:
from datasketch import MinHash, MinHashLSH
num_perm = 128    # Number of hash permutations for each MinHash
lsh_threshold = 0.8  # Approx Jaccard similarity threshold for grouping

lsh = MinHashLSH(threshold=lsh_threshold, num_perm=num_perm)
minhash_dict = {}

In [97]:
df_selected["tokens"] = df_selected["Normalized Text"].apply(tokenize)

In [None]:
print("Building MinHash signatures and inserting into LSH...")
for i, row in tqdm(df_selected.iterrows(), 
                   total=len(df_selected), 
                   desc="MinHash & LSH Insertion"):
    tokens = row["tokens"]
    mh = create_minhash(tokens, num_perm=num_perm)
    minhash_dict[i] = mh
    
    # Insert into the LSH index
    # Use str(i) or something unique as the key
    lsh.insert(str(i), mh)

print("LSH index built.")

Building MinHash signatures and inserting into LSH...


MinHash & LSH Insertion:   0%|          | 0/1085540 [00:00<?, ?it/s]

In [None]:
pip install --upgrade ipywidgets

In [None]:
similar_pairs = []

print("Finding near-duplicate pairs...")

for i in range(len(df)):
    mh = minhash_dict[i]
    # Query LSH for near-duplicates
    # This returns keys of all items that have Jaccard >= lsh_threshold with i
    result = lsh.query(mh)
    
    # Convert the result (list of strings) back to integer indices
    # We also skip the case i == j
    dup_indices = [int(x) for x in result if int(x) != i]
    
    # Record pairs (i, j) with approximate similarity
    for j in dup_indices:
        similar_pairs.append({
            "Index_1": i,
            "Tweet_1": df.loc[i, "Tweet"],
            "Index_2": j,
            "Tweet_2": df.loc[j, "Tweet"],
            "Approx_Jaccard": lsh_threshold,
            # Optionally store actual tokens or other info
        })

# Build a DataFrame of near-duplicate pairs
duplicates_df = pd.DataFrame(similar_pairs)

# Optionally, drop exact duplicates from this table (since i, j can appear repeatedly)
# for instance by sorting & dropping duplicates:
duplicates_df["Pair"] = duplicates_df.apply(
    lambda row: tuple(sorted([row["Index_1"], row["Index_2"]])), axis=1
)
duplicates_df.drop_duplicates("Pair", inplace=True)
duplicates_df.drop(columns="Pair", inplace=True)

print("Number of near-duplicate pairs found:", len(duplicates_df))
print("Sample of near-duplicate pairs:")
print(duplicates_df.head(10))
##########################
# 6) DEDUPLICATION STRATEGY
##########################

# The 'duplicates_df' now holds candidate near-duplicates. You can:
# 1) Inspect them directly.
# 2) Further refine with a second pass (e.g., a direct fuzzy match or exact Jaccard
#    on the tokens) to confirm. 
# 3) For each group of near-duplicates, decide which tweet to keep (e.g., the earliest
#    or the one with highest engagement) and which to remove.

##########################
# DONE
##########################