In [1]:
# Imports
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import requests
from am_sma_data_cleaning.utils import remove_spammers
import glob
import re
from tqdm.notebook import tqdm
import unicodedata
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import make_interp_spline
import matplotlib.ticker as mtick
import dataframe_image as dfi
#--
import fasttext
from langdetect import detect
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm



### Helper Functions And Global Variables

In [2]:
CURRENT_DATE_TIME = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

def normalize_post(text):
    """
    Convert the text to lowercase and remove URLs, hashtags, mentions, punctuation,
    and extra whitespace.
    """
    if not isinstance(text, str):
        # Convert non-string (like float NaN) to an empty string
        text = ""
    text = text.lower()
    text = re.sub(r'http\S+', '', text)   # Remove URLs
    #text = re.sub(r'[@#]\S+', '', text)     # Remove hashtags and mentions
    #text = re.sub(r'[^\w\s]', '', text)     # Remove punctuation
    return text.strip()

def tokenize(text):
    """
    Simple word-based tokenization.
    You could also use n-grams for more robust matching.
    """
    return text.split()

### Read All Data

In [3]:
##-- Use glob to find all CSV files recursively under the given directory
all_csv_files = glob.glob(
    "/home/shola/research_raw_data/**/*.csv",
    recursive=True
)
##-- Read each CSV file and store in a list
df_list = [pd.read_csv(csv_file) for csv_file in all_csv_files]

##-- Concatenate all into one DataFrame
df = pd.concat(df_list, ignore_index=True)

##-- Drop views since its not tracked by tweeter for later years
df = df.drop(columns="Views")

##-- Add column for total engagement
df["Total Engagement"] = (
    df["Replies"] + df["Likes"] + df["Reshares"]
)

##-- Add tweet id column
df["tweet_id"] = df["Author ID"].astype(str) + "_" + df["Date"].astype(str)

##--
df["Date"] = pd.to_datetime(df["Date"], format='ISO8601')

##-- Drop unwanted years..these must have been added via over run of the years..they are not full years but only few days in 
df = df[~df["Date"].dt.year.isin([2011, 2024, 2025])]  

#create column row_num
df = df.assign(row_num=range(1, len(df) + 1))

pd.set_option('display.max_colwidth', None)

In [4]:
df

Unnamed: 0,Author Name,Author ID,Date,Post Text,Replies,Likes,Reshares,Total Engagement,tweet_id,row_num
0,₊˚⊹♡ Ashley ୨୧,@Whimsu,2013-01-01 00:19:40+00:00,My sister got me this 3D printed gyroid for Christmas! Sooo cute!,0,0,0,0,@Whimsu_2013-01-01 00:19:40+00:00,1
1,Jon Witts,@jonwitts,2012-12-30 20:18:03+00:00,"This is just so cool! "" @josiefraser : 3D printer converts digital audio files into 33rpm records http:// """,0,0,0,0,@jonwitts_2012-12-30 20:18:03+00:00,2
2,Marshall Advanced Manufacturing Center,@MUAdvMfg,2012-12-30 15:13:18+00:00,3D printing predictions for 2013 - http:// 229-3d-printing-predictions-for-2013.html …,0,0,0,0,@MUAdvMfg_2012-12-30 15:13:18+00:00,3
3,tttitoflip,@tttitoflip,2012-12-30 13:19:02+00:00,Features/ 3D Printing http://,0,0,0,0,@tttitoflip_2012-12-30 13:19:02+00:00,4
4,3DPrint_news,@3dprint_news,2012-12-30 22:37:16+00:00,RT @RicardoBlanco This is the best analysis of the pros and cons of commerical-scale 3D printing I've seen. http:// -achilles-heel-of-3d-printing-015281 …,0,0,0,0,@3dprint_news_2012-12-30 22:37:16+00:00,5
...,...,...,...,...,...,...,...,...,...,...
6728757,"Helfrich Bicycles, LLC",@HelfrichBicycle,2022-12-17 04:56:50+00:00,"I custom built this wheel with a 3D printed logo for the @pmmarchingunit @pennmanorschools annual banquet basket raffle. #bicyclerepair #bicyclerestoration @ Lancaster, Pennsylvania",0,0,0,0,@HelfrichBicycle_2022-12-17T04:56:50.000Z,6725981
6728758,Ghost Guns,@buyghostguns,2022-12-17 01:21:51+00:00,SG22 .22LR 3D PRINTED BUILD KIT W/CMMG PARTS https:// 2lr-3d-printed-build-kit-w-cmmg-parts/?utm_source=ReviveOldPost&utm_medium=social&utm_campaign=ReviveOldPost … #22lr #pistols,0,0,0,0,@buyghostguns_2022-12-17T01:21:51.000Z,6725982
6728759,Smoke & Mirrors,@DrMask12,2022-12-16 23:21:22+00:00,"Finally decided to try and 3d print the B.P.S logo. Need to tweak some settings and make it a little larger, but I'd say it's a great success. I made the 3d model of the logo, credit for the actual digital logo goes to @alexkisterr",0,13,0,13,@DrMask12_2022-12-16T23:21:22.000Z,6725983
6728760,X Æ A-12,@ray5ar,2022-12-17 10:57:20+00:00,More 3D Printed Compliant Mechanisms https:// EIPA?feature=share … via @YouTube,0,0,0,0,@ray5ar_2022-12-17T10:57:20.000Z,6725984


# Data CLeaning

In [5]:
KNOWN_POSSIBLE_BOT_OR_SPAM_ACCOUNTS = [
    "@weitingforyou",
    "@JayEfikeco",
    "@buy3d",
    #"@bmine3rz",
    "@Reksaurian",
    "@Trend_deal_420",
    "@zeppy_3dprint",
    "@bubbleistrouble",
    "@HakanFagnell",
    "@Domain_Buyer",
    "@TriciaClyne",
    "@NEWsDealz",
    "@Deals_New_712",
    "@Deal_HOT_341",
    "@RealistRider",
    "@ShantelW6",
    "@SomeDroidCom",
    "@Arc_Deals_8812",
    "@Buy_Now_811",
    "@Best_Tips_443",
    "@truthinautism",
    "@Tweet_Deal_819",
    "@New_deal_932",
    "@My3DPrinting",
    "@Manhal_Deal_311",
    "@new_offer_827",
    "@danowall",
    "@Hot_Tips_123",
    "@lovepornsites",
    "@my3dsupplies",
    "@ISG3D",
    "@csmlibrarian",
    "@Great_Deal_691",
    "@bitcoinagile",
    #confirmed
    "@bectcomputing",
    "@Reksaurian",
    "@JohnTerz",
    "@AVS3DPrint",
    "@3dprintingjobs",
    "@SMilloow",
    "@PECHOLATATV", #irrelevant content mostly but not spam/bot account
]

## 1. Remove Tweets From Accounts With Promotional or Spam-Like Usernames
NOTE: Many promotional or spam accounts include keywords like "buy", "deal", "sale", etc. in their usernames.
This pattern used in the python code belowtargets such accounts by matching those keywords at word boundaries (start, end, or surrounded by underscores).
Tweets from these accounts are removed to reduce noise in the dataset.

In [6]:
pattern = r'(deal|buy|sale|discount|offer|promo|bargain)_'
# Using case-insensitive flag instead of .str.lower()
promo_zero_engagement = df['Author ID'].str.strip().str.contains(pattern, flags=re.IGNORECASE, na=False) & (df['Total Engagement'] == 0)
promo_account_tweets_df = df[promo_zero_engagement]

# filter them out
df = df[~promo_zero_engagement]

#Show for review
promo_account_tweets_df

  promo_zero_engagement = df['Author ID'].str.strip().str.contains(pattern, flags=re.IGNORECASE, na=False) & (df['Total Engagement'] == 0)


Unnamed: 0,Author Name,Author ID,Date,Post Text,Replies,Likes,Reshares,Total Engagement,tweet_id,row_num
8956,Marketing Madness,@Promo_Madness,2012-12-16 15:10:42+00:00,Interactive 3D Printed LED Pixel Hat Simulates Gravity: Advertise here with BSAWhen it com... http:// http://,0,0,0,0,@Promo_Madness_2012-12-16 15:10:42+00:00,8957
10446,Marketing Madness,@Promo_Madness,2012-12-14 19:55:18+00:00,Creating Simple Electronics Through 3D Printing Is Now Possible: Advertise here with BSA3D... http:// http://,0,0,0,0,@Promo_Madness_2012-12-14 19:55:18+00:00,10447
34317,Marketing Madness,@Promo_Madness,2012-11-22 19:06:51+00:00,DIY 3D Printed iPhone Case (Even Incorporate Your Favorite Sound): Advertise here with BSA... http:// http://,0,0,0,0,@Promo_Madness_2012-11-22 19:06:51+00:00,34318
36551,HouseBuyNI,@House_Buy_NI,2012-11-21 19:45:42+00:00,Kickstarter sued over 3D printers: Kickstarter has been sued over claims that a 3D printer marketed through its ... http://,0,0,0,0,@House_Buy_NI_2012-11-21 19:45:42+00:00,36552
36780,Menusis Domains,@sale_domains,2012-11-21 19:45:35+00:00,Kickstarter sued over 3D printers: Kickstarter has been sued over claims that a 3D printer marketed through its ... http://,0,0,0,0,@sale_domains_2012-11-21 19:45:35+00:00,36781
...,...,...,...,...,...,...,...,...,...,...
6452834,buy_silver,@buy_silver,2022-11-02 22:36:20+00:00,My first stack in a 3D printed container #silver,0,0,0,0,@buy_silver_2022-11-02T22:36:20.000Z,6450058
6458937,buy_silver,@buy_silver,2022-08-02 16:36:41+00:00,"Pure Silver Mew & Pika, casted with 3d printed lost resin wax #silver",0,0,0,0,@buy_silver_2022-08-02T16:36:41.000Z,6456161
6481267,The Bingo Deal,@deal_bingo,2022-08-16 06:30:30+00:00,3D Printing Pen With USB\n________________ #toys #kids #toys4life #toycommunity #deal #style #bestshopping #shoppingdaily #worldsales #shopping #onlineshopping https:// n-with-usb/ …,0,0,0,0,@deal_bingo_2022-08-16T06:30:30.000Z,6478491
6487651,The Bingo Deal,@deal_bingo,2022-06-21 09:18:43+00:00,3D Printing Pen With USB\n________________\n\n #accessories #thebingodeal #sale #shopping https:// n-with-usb/ …,0,0,0,0,@deal_bingo_2022-06-21T09:18:43.000Z,6484875


In [7]:
user_list = list(set(u.strip().strip('"') for u in KNOWN_POSSIBLE_BOT_OR_SPAM_ACCOUNTS if u.startswith('@')))
filtered_authors = promo_account_tweets_df['Author ID'].astype(str).str.strip().str.lower().unique()
not_in_filtered = [u for u in user_list if u.lower() not in filtered_authors]
not_in_filtered

['@lovepornsites',
 '@AVS3DPrint',
 '@Domain_Buyer',
 '@NEWsDealz',
 '@bectcomputing',
 '@JayEfikeco',
 '@truthinautism',
 '@my3dsupplies',
 '@SMilloow',
 '@ISG3D',
 '@PECHOLATATV',
 '@csmlibrarian',
 '@bubbleistrouble',
 '@HakanFagnell',
 '@Reksaurian',
 '@Best_Tips_443',
 '@bitcoinagile',
 '@Arc_Deals_8812',
 '@Hot_Tips_123',
 '@JohnTerz',
 '@buy3d',
 '@ShantelW6',
 '@SomeDroidCom',
 '@danowall',
 '@My3DPrinting',
 '@RealistRider',
 '@TriciaClyne',
 '@Deals_New_712',
 '@weitingforyou',
 '@3dprintingjobs',
 '@zeppy_3dprint']

## 2. Normalization of The Tweets

In [8]:
df_selected = df[["Author ID", "Post Text", "Total Engagement", "Date", "tweet_id", "row_num"]]
df_selected["Normalized Text"] = df_selected["Post Text"].apply(normalize_post)
# Drop the intermediate column "Post Text" and keep its normalized version
df_selected = df_selected.drop(columns="Post Text")
df_selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected["Normalized Text"] = df_selected["Post Text"].apply(normalize_post)


Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text
0,@Whimsu,0,2013-01-01 00:19:40+00:00,@Whimsu_2013-01-01 00:19:40+00:00,1,my sister got me this 3d printed gyroid for christmas! sooo cute!
1,@jonwitts,0,2012-12-30 20:18:03+00:00,@jonwitts_2012-12-30 20:18:03+00:00,2,"this is just so cool! "" @josiefraser : 3d printer converts digital audio files into 33rpm records """
2,@MUAdvMfg,0,2012-12-30 15:13:18+00:00,@MUAdvMfg_2012-12-30 15:13:18+00:00,3,3d printing predictions for 2013 - 229-3d-printing-predictions-for-2013.html …
3,@tttitoflip,0,2012-12-30 13:19:02+00:00,@tttitoflip_2012-12-30 13:19:02+00:00,4,features/ 3d printing
4,@3dprint_news,0,2012-12-30 22:37:16+00:00,@3dprint_news_2012-12-30 22:37:16+00:00,5,rt @ricardoblanco this is the best analysis of the pros and cons of commerical-scale 3d printing i've seen. -achilles-heel-of-3d-printing-015281 …
...,...,...,...,...,...,...
6728757,@HelfrichBicycle,0,2022-12-17 04:56:50+00:00,@HelfrichBicycle_2022-12-17T04:56:50.000Z,6725981,"i custom built this wheel with a 3d printed logo for the @pmmarchingunit @pennmanorschools annual banquet basket raffle. #bicyclerepair #bicyclerestoration @ lancaster, pennsylvania"
6728758,@buyghostguns,0,2022-12-17 01:21:51+00:00,@buyghostguns_2022-12-17T01:21:51.000Z,6725982,sg22 .22lr 3d printed build kit w/cmmg parts 2lr-3d-printed-build-kit-w-cmmg-parts/?utm_source=reviveoldpost&utm_medium=social&utm_campaign=reviveoldpost … #22lr #pistols
6728759,@DrMask12,13,2022-12-16 23:21:22+00:00,@DrMask12_2022-12-16T23:21:22.000Z,6725983,"finally decided to try and 3d print the b.p.s logo. need to tweak some settings and make it a little larger, but i'd say it's a great success. i made the 3d model of the logo, credit for the actual digital logo goes to @alexkisterr"
6728760,@ray5ar,0,2022-12-17 10:57:20+00:00,@ray5ar_2022-12-17T10:57:20.000Z,6725984,more 3d printed compliant mechanisms eipa?feature=share … via @youtube


## 3. Remove Empty Tweets/Posts
This could have happened if the author only posted a picture or video without text

In [9]:
df_selected = df_selected.sort_values(by="Normalized Text")
mask_empty = df_selected["Normalized Text"].str.strip().eq("")
df_empty = df_selected[mask_empty]
df_empty

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text
2798457,@Reksaurian,1,2021-12-22 02:05:38+00:00,@Reksaurian_2021-12-22T02:05:38.000Z,2797225,
4362434,@Linus3DPrinting,0,2015-03-10 11:02:02+00:00,@Linus3DPrinting_2015-03-10T11:02:02.000Z,4359658,
5696951,@3DPrinterSA,0,2013-07-03 04:58:45+00:00,@3DPrinterSA_2013-07-03T04:58:45.000Z,5694175,
2055272,@3DPrinterSA,0,2016-02-24 04:00:47+00:00,@3DPrinterSA_2016-02-24T04:00:47.000Z,2054088,
1046129,@stilson41,0,2017-11-14 20:28:59+00:00,@stilson41_2017-11-14T20:28:59.000Z,1044945,
...,...,...,...,...,...,...
739831,@stilson41,0,2017-10-20 13:44:59+00:00,@stilson41_2017-10-20T13:44:59.000Z,738647,
4529840,@3DCola,0,2015-11-24 18:33:17+00:00,@3DCola_2015-11-24T18:33:17.000Z,4527064,
185956,@3DPrinterSA,0,2012-04-04 04:44:10+00:00,@3DPrinterSA_2012-04-04 04:44:10+00:00,185957,
5838369,@3d_printing_jp,0,2013-12-01 05:08:12+00:00,@3d_printing_jp_2013-12-01T05:08:12.000Z,5835593,


In [10]:
# Keep only rows where "Normalized Text" column is not empty string
df_selected = df_selected[~mask_empty]
df_selected

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text
91729,@JayEfikeco,0,2012-09-05 18:56:02+00:00,@JayEfikeco_2012-09-05 18:56:02+00:00,91730,!
91174,@JayEfikeco,0,2012-09-06 22:23:10+00:00,@JayEfikeco_2012-09-06 22:23:10+00:00,91175,!
108597,@JayEfikeco,0,2012-08-14 00:55:17+00:00,@JayEfikeco_2012-08-14 00:55:17+00:00,108598,!
2445059,@8BitoffunX,2,2019-08-08 02:08:04+00:00,@8BitoffunX_2019-08-08T02:08:04.000Z,2443875,!
108900,@JayEfikeco,0,2012-08-12 20:05:48+00:00,@JayEfikeco_2012-08-12 20:05:48+00:00,108901,!
...,...,...,...,...,...,...
4365097,@Ticketmaster_GR,0,2015-11-04 14:36:59+00:00,@Ticketmaster_GR_2015-11-04T14:36:59.000Z,4362321,"󾮍 η έκθεση που θα σας πάρει το μυαλό! \npac-man, super mario, tetris, ψηφιακή τεχνολογία και τέχνη, 3d printing ..."
1667383,@dEngzHue,0,2016-07-21 10:12:31+00:00,@dEngzHue_2016-07-21T10:12:31.000Z,1666199,"󾮜 @p280 only !!\n󾬏new arrival u.s. style 3d print silk combined cotton blouse󾬏\n󾔏cotton fabric, right thickness,..."
759531,@dEngzHue,0,2016-06-30 04:10:50+00:00,@dEngzHue_2016-06-30T04:10:50.000Z,758347,"󾮜@p380 only !!\n󾬏new arrival u.s. style 3d print chiffon batwing sleeve polo shirt󾬏\n󾔏chiffon fabric, right..."
1667130,@dEngzHue,0,2016-07-21 09:50:22+00:00,@dEngzHue_2016-07-21T09:50:22.000Z,1665946,󾮜@p380 only !!\n󾬏new arrival u.s. style vivid 3d print spandex-chiffon like jumpsuit󾬏\n󾔏spandex-chiffon like...


In [11]:
####--Helper function
def ascii_normalize_text(text):
    # Convert any non-ASCII fullwidth forms into ASCII equivalents
    # e.g., "ｔ" -> "t", "　" -> " ", etc.
    return unicodedata.normalize("NFKC", str(text))


## 4. Tweets with No Alphabetic Content
Rmeove tweets where there no letters, no digits, just things like !, ..., #@!, etc.

In [12]:
df_selected["Normalized Text"] = df_selected["Normalized Text"].apply(ascii_normalize_text)
mask_only_punct = df_selected["Normalized Text"].str.strip().apply(
    lambda x: bool(re.match(r'^[^\w]+$', x))  # "start ^, one or more non-alphanumeric chars, end $"
)

# This finds rows where 'Normalized Text' is purely punctuation/symbols 
df_only_punct = df_selected[mask_only_punct]

df_only_punct

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text
91729,@JayEfikeco,0,2012-09-05 18:56:02+00:00,@JayEfikeco_2012-09-05 18:56:02+00:00,91730,!
91174,@JayEfikeco,0,2012-09-06 22:23:10+00:00,@JayEfikeco_2012-09-06 22:23:10+00:00,91175,!
108597,@JayEfikeco,0,2012-08-14 00:55:17+00:00,@JayEfikeco_2012-08-14 00:55:17+00:00,108598,!
2445059,@8BitoffunX,2,2019-08-08 02:08:04+00:00,@8BitoffunX_2019-08-08T02:08:04.000Z,2443875,!
108900,@JayEfikeco,0,2012-08-12 20:05:48+00:00,@JayEfikeco_2012-08-12 20:05:48+00:00,108901,!
...,...,...,...,...,...,...
76930,@JayEfikeco,0,2012-09-22 21:49:34+00:00,@JayEfikeco_2012-09-22 21:49:34+00:00,76931,♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫
76638,@JayEfikeco,0,2012-09-22 23:02:34+00:00,@JayEfikeco_2012-09-22 23:02:34+00:00,76639,♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫
77288,@JayEfikeco,0,2012-09-22 11:27:16+00:00,@JayEfikeco_2012-09-22 11:27:16+00:00,77289,♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫
76805,@JayEfikeco,0,2012-09-22 19:12:04+00:00,@JayEfikeco_2012-09-22 19:12:04+00:00,76806,♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫ ♪ ♫♪


In [13]:
# Remove those rows by inverting the mask with '~'
df_selected = df_selected[~mask_only_punct]
df_selected

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text
6403163,@wultramanmaxfr,0,2023-01-29 14:09:15+00:00,@wultramanmaxfr_2023-01-29T14:09:15.000Z,6400387,!\n\nwoah! 3d-printed homes made with 100% bio-based materials - talk about sustainable living! #news #greenliving #innovation rinted-home-you-can-recycle/#ftag=cad590a51e ...
6440207,@CaballeroBarce2,2,2022-03-26 23:13:27+00:00,@CaballeroBarce2_2022-03-26T23:13:27.000Z,6437431,"!\n\n—\n\nwe are a team of designers and engineers who have created a simple and sustainable solution to the global problem of food waste. we have developed an edible 3d-printed deer that's made from plants, has no packaging, and is the most realistic deer replica ever created.\n\n—\n."
145686,@izaiijigiri,0,2012-06-14 21:00:24+00:00,@izaiijigiri_2012-06-14 21:00:24+00:00,145687,! custom parts 3d systems cube(r) 3d printer receives american technology award
1074324,@tannahillglen,0,2016-10-26 14:59:19+00:00,@tannahillglen_2016-10-26T14:59:19.000Z,1073140,! 3d printed brain beer bottle opener \nby 1nezer0 via @etsy
475318,@Mattsit21,0,2020-01-19 00:00:30+00:00,@Mattsit21_2020-01-19T00:00:30.000Z,474134,! 3d printing parts to a shield cell!!! #6wattson .
...,...,...,...,...,...,...
4365097,@Ticketmaster_GR,0,2015-11-04 14:36:59+00:00,@Ticketmaster_GR_2015-11-04T14:36:59.000Z,4362321,"󾮍 η έκθεση που θα σας πάρει το μυαλό! \npac-man, super mario, tetris, ψηφιακή τεχνολογία και τέχνη, 3d printing ..."
1667383,@dEngzHue,0,2016-07-21 10:12:31+00:00,@dEngzHue_2016-07-21T10:12:31.000Z,1666199,"󾮜 @p280 only !!\n󾬏new arrival u.s. style 3d print silk combined cotton blouse󾬏\n󾔏cotton fabric, right thickness,..."
759531,@dEngzHue,0,2016-06-30 04:10:50+00:00,@dEngzHue_2016-06-30T04:10:50.000Z,758347,"󾮜@p380 only !!\n󾬏new arrival u.s. style 3d print chiffon batwing sleeve polo shirt󾬏\n󾔏chiffon fabric, right..."
1667130,@dEngzHue,0,2016-07-21 09:50:22+00:00,@dEngzHue_2016-07-21T09:50:22.000Z,1665946,󾮜@p380 only !!\n󾬏new arrival u.s. style vivid 3d print spandex-chiffon like jumpsuit󾬏\n󾔏spandex-chiffon like...


## 5. Filtering Out Tweets That Contain No Alphabetic Characters (Unicode-Aware)
detecting rows with zero letters in any language, thanks to the \p{L} approach.

In [14]:
import regex

pattern = r'[\p{L}]'  # \p{L} matches any kind of letter from any language

mask_no_letters_v2 = df_selected["Normalized Text"].str.strip().apply(
    lambda x: not bool(regex.search(pattern, x))  # Use regex.search, not re.search
)

df_no_letters = df_selected[mask_no_letters_v2]
df_no_letters

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text
3523525,@Positiv3Prints,0,2018-12-14 18:09:10+00:00,@Positiv3Prints_2018-12-14T18:09:10.000Z,3520749,#2020
6594268,@3DPrintGeneral,46,2022-11-17 19:44:36+00:00,@3DPrintGeneral_2022-11-17T19:44:36.000Z,6591492,$699
6585696,@3dprintmars,0,2022-07-21 16:01:45+00:00,@3dprintmars_2022-07-21T16:01:45.000Z,6582920,$800!!!!!!!
5424053,@3dprintingmba,0,2014-12-08 17:33:50+00:00,@3dprintingmba_2014-12-08T17:33:50.000Z,5421277,$97
2915906,@Reksaurian,0,2021-09-21 12:36:18+00:00,@Reksaurian_2021-09-21T12:36:18.000Z,2914674,*_*
...,...,...,...,...,...,...
1863687,@Reksaurian,0,2017-05-17 13:17:45+00:00,@Reksaurian_2017-05-17T13:17:45.000Z,1862503,=_= -
2917208,@Reksaurian,3,2021-07-30 18:20:14+00:00,@Reksaurian_2021-07-30T18:20:14.000Z,2915976,______
6379766,@Reksaurian,0,2023-04-03 01:44:11+00:00,@Reksaurian_2023-04-03T01:44:11.000Z,6376990,________
5983721,@JayEfikeco,0,2013-08-27 23:15:53+00:00,@JayEfikeco_2013-08-27T23:15:53.000Z,5980945,┈┏╮┈┈┈┈┈┈┃┈┈┈┈┈┈┈ ┈┣┫┣╮╭╮╭╮┃╱┃┣╮╭┫┈ ┈┗╯┛┈╰━╰┻┃╲┗┗┗╰┫┈ ┈┈┈┈┈┏╮┈┈┈┃┈┈┈╰╯┈ ┈┈┈┈┈┣┫╭╮╭┫┈┈┈┈┈┈ ┈┈┈┈┈┗╯╰┻╰┛┈┈ 468 ...


In [15]:
df_selected = df_selected[~mask_no_letters_v2]
df_selected

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text
6403163,@wultramanmaxfr,0,2023-01-29 14:09:15+00:00,@wultramanmaxfr_2023-01-29T14:09:15.000Z,6400387,!\n\nwoah! 3d-printed homes made with 100% bio-based materials - talk about sustainable living! #news #greenliving #innovation rinted-home-you-can-recycle/#ftag=cad590a51e ...
6440207,@CaballeroBarce2,2,2022-03-26 23:13:27+00:00,@CaballeroBarce2_2022-03-26T23:13:27.000Z,6437431,"!\n\n—\n\nwe are a team of designers and engineers who have created a simple and sustainable solution to the global problem of food waste. we have developed an edible 3d-printed deer that's made from plants, has no packaging, and is the most realistic deer replica ever created.\n\n—\n."
145686,@izaiijigiri,0,2012-06-14 21:00:24+00:00,@izaiijigiri_2012-06-14 21:00:24+00:00,145687,! custom parts 3d systems cube(r) 3d printer receives american technology award
1074324,@tannahillglen,0,2016-10-26 14:59:19+00:00,@tannahillglen_2016-10-26T14:59:19.000Z,1073140,! 3d printed brain beer bottle opener \nby 1nezer0 via @etsy
475318,@Mattsit21,0,2020-01-19 00:00:30+00:00,@Mattsit21_2020-01-19T00:00:30.000Z,474134,! 3d printing parts to a shield cell!!! #6wattson .
...,...,...,...,...,...,...
4365097,@Ticketmaster_GR,0,2015-11-04 14:36:59+00:00,@Ticketmaster_GR_2015-11-04T14:36:59.000Z,4362321,"󾮍 η έκθεση που θα σας πάρει το μυαλό! \npac-man, super mario, tetris, ψηφιακή τεχνολογία και τέχνη, 3d printing ..."
1667383,@dEngzHue,0,2016-07-21 10:12:31+00:00,@dEngzHue_2016-07-21T10:12:31.000Z,1666199,"󾮜 @p280 only !!\n󾬏new arrival u.s. style 3d print silk combined cotton blouse󾬏\n󾔏cotton fabric, right thickness,..."
759531,@dEngzHue,0,2016-06-30 04:10:50+00:00,@dEngzHue_2016-06-30T04:10:50.000Z,758347,"󾮜@p380 only !!\n󾬏new arrival u.s. style 3d print chiffon batwing sleeve polo shirt󾬏\n󾔏chiffon fabric, right..."
1667130,@dEngzHue,0,2016-07-21 09:50:22+00:00,@dEngzHue_2016-07-21T09:50:22.000Z,1665946,󾮜@p380 only !!\n󾬏new arrival u.s. style vivid 3d print spandex-chiffon like jumpsuit󾬏\n󾔏spandex-chiffon like...


## 6. Investigate and Drop Tweets Authored Spam/Bot Accounts Before Dropping Duplicates

In [16]:
duplicate_counts = (
    df_selected.groupby(['Author ID', 'Normalized Text'])
    .size()
    .reset_index(name='tweet_repeat_count')
)

# Keep only where tweet is repeated
repeated_tweets = duplicate_counts[duplicate_counts['tweet_repeat_count'] > 1]
user_repeat_stats = (
    repeated_tweets.groupby('Author ID')
    .size()
    .reset_index(name='num_repeated_texts')
)

# users with more than N repeated messages
likely_bots = user_repeat_stats[user_repeat_stats['num_repeated_texts'] >= 500]
likely_bots = likely_bots.sort_values(by='num_repeated_texts', ascending=False)
likely_bots

Unnamed: 0,Author ID,num_repeated_texts
44144,@bmine3rz,9728
44773,@buy3d,6900
10017,@Darkspiral_94,5663
14826,@GaryWeston11,5447
16593,@Hertz_ie,4765
908,@3d_printing_jp,3708
549,@3DPrintBoard,2966
25216,@My3DPrinting,2956
635,@3DPrintingFans,2919
565,@3DPrintGirl,2501


#### Tweets Per Over Certain Tresshold (e.g. 500) Per Year

In [17]:
# Ensure Date is datetime
df_selected['Date'] = pd.to_datetime(df_selected['Date'])

# Extract year
df_selected['Year'] = df_selected['Date'].dt.year

# Group by Author ID, Normalized Text, and Year
duplicate_counts = (
    df_selected.groupby(['Author ID', 'Normalized Text', 'Year'])
    .size()
    .reset_index(name='tweet_repeat_count')
)

# Keep only repeated tweets within the same year
repeated_tweets = duplicate_counts[duplicate_counts['tweet_repeat_count'] > 1]

# Count how many *distinct* repeated tweets each user has per year
user_repeat_stats = (
    repeated_tweets.groupby(['Author ID', 'Year'])
    .size()
    .reset_index(name='num_repeated_texts')
)

# ilter for users who repeat >=50 messages in a single year
likely_bots = user_repeat_stats[user_repeat_stats['num_repeated_texts'] >= 1100]

# Sort by most suspicious activity
likely_bots = likely_bots.sort_values(by='num_repeated_texts', ascending=False)

likely_bots


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Date'] = pd.to_datetime(df_selected['Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Year'] = df_selected['Date'].dt.year


Unnamed: 0,Author ID,Year,num_repeated_texts
57240,@buy3d,2014.0,6422
21789,@Hertz_ie,2017.0,3440
1534,@3d_printing_jp,2013.0,2236
21788,@Hertz_ie,2016.0,2066
56449,@bmine3rz,2016.0,1982
56447,@bmine3rz,2014.0,1937
56450,@bmine3rz,2017.0,1937
13327,@Darkspiral_94,2018.0,1872
13326,@Darkspiral_94,2017.0,1767
19570,@GaryWeston11,2018.0,1734


In [18]:
df_selected[df_selected['Author ID'] == '@Hertz_ie']

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year
750434,@Hertz_ie,0,2016-10-06 09:57:29+00:00,@Hertz_ie_2016-10-06T09:57:29.000Z,749250,#te 3d printed light tube by u rok design #interiordesign #lightinguk #giftideasuk,2016.0
846830,@Hertz_ie,0,2017-03-26 05:51:54+00:00,@Hertz_ie_2017-03-26T05:51:54.000Z,845646,#te 3d printed light tube by u rok design #interiordesign #lightinguk #giftideasuk,2017.0
1558823,@Hertz_ie,0,2016-11-19 23:44:06+00:00,@Hertz_ie_2016-11-19T23:44:06.000Z,1557639,#te 3d printed light tube by u rok design #interiorideas #homeshopuk #lightuk,2016.0
2174575,@Hertz_ie,0,2017-05-01 01:20:46+00:00,@Hertz_ie_2017-05-01T01:20:46.000Z,2173391,#te 3d printed light tube by u rok design #interiorideas #homeshopuk #lightuk,2017.0
1691512,@Hertz_ie,0,2017-07-20 20:19:59+00:00,@Hertz_ie_2017-07-20T20:19:59.000Z,1690328,#te 3d printed light tube by u rok design #interiorideas #homeshopuk #lightuk,2017.0
...,...,...,...,...,...,...,...
747409,@Hertz_ie,0,2016-10-10 06:03:38+00:00,@Hertz_ie_2016-10-10T06:03:38.000Z,746225,❧❧ 3d printed light tube by u rok design #interiorideas #homeshopuk #lightuk,2016.0
1758319,@Hertz_ie,0,2017-08-23 05:20:10+00:00,@Hertz_ie_2017-08-23T05:20:10.000Z,1757135,❧❧ 3d printed wood light by u rok design #interiordesign #3dprinting #saleuk,2017.0
1158822,@Hertz_ie,0,2017-09-15 08:56:29+00:00,@Hertz_ie_2017-09-15T08:56:29.000Z,1157638,❧❧ 3d printed wood light by u rok design #interiordesign #3dprinting #saleuk,2017.0
1609046,@Hertz_ie,0,2017-03-12 08:52:17+00:00,@Hertz_ie_2017-03-12T08:52:17.000Z,1607862,❧❧ 3d printed wood light by u rok design #interiordesign #3dprinting #saleuk,2017.0


In [19]:
filtered_users = ['@bmine3rz', '@Darkspiral_94', '@GaryWeston11', '@Hertz_ie']
df_filtered = df_selected[df_selected['Author ID'].isin(filtered_users)]

df_filtered

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year
4887135,@bmine3rz,0,2014-08-03 21:06:29+00:00,@bmine3rz_2014-08-03T21:06:29.000Z,4884359,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-2/ ... #3dprinting,2014.0
4887424,@bmine3rz,0,2014-08-03 21:18:24+00:00,@bmine3rz_2014-08-03T21:18:24.000Z,4884648,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-3/ ... #3dprinting,2014.0
4887297,@bmine3rz,0,2014-08-03 21:34:49+00:00,@bmine3rz_2014-08-03T21:34:49.000Z,4884521,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-4/ ... #3dprinting,2014.0
4886962,@bmine3rz,0,2014-08-03 21:35:54+00:00,@bmine3rz_2014-08-03T21:35:54.000Z,4884186,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-5/ ... #3dprinting,2014.0
4887124,@bmine3rz,0,2014-08-03 21:50:22+00:00,@bmine3rz_2014-08-03T21:50:22.000Z,4884348,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-6/ ... #3dprinting,2014.0
...,...,...,...,...,...,...,...
747409,@Hertz_ie,0,2016-10-10 06:03:38+00:00,@Hertz_ie_2016-10-10T06:03:38.000Z,746225,❧❧ 3d printed light tube by u rok design #interiorideas #homeshopuk #lightuk,2016.0
1758319,@Hertz_ie,0,2017-08-23 05:20:10+00:00,@Hertz_ie_2017-08-23T05:20:10.000Z,1757135,❧❧ 3d printed wood light by u rok design #interiordesign #3dprinting #saleuk,2017.0
1158822,@Hertz_ie,0,2017-09-15 08:56:29+00:00,@Hertz_ie_2017-09-15T08:56:29.000Z,1157638,❧❧ 3d printed wood light by u rok design #interiordesign #3dprinting #saleuk,2017.0
1609046,@Hertz_ie,0,2017-03-12 08:52:17+00:00,@Hertz_ie_2017-03-12T08:52:17.000Z,1607862,❧❧ 3d printed wood light by u rok design #interiordesign #3dprinting #saleuk,2017.0


In [20]:
# Group by Author ID and sum total engagement
engagement_per_user = (
    df_filtered.groupby("Author ID")["Total Engagement"]
    .sum()
    .reset_index()
    .rename(columns={"Total Engagement": "Total Engagement (Sum)"})
    .sort_values("Total Engagement (Sum)", ascending=False)
)

engagement_per_user


Unnamed: 0,Author ID,Total Engagement (Sum)
3,@bmine3rz,6401
2,@Hertz_ie,1470
0,@Darkspiral_94,447
1,@GaryWeston11,398


In [21]:
# Get all unique bot Author IDs
bot_ids = likely_bots['Author ID'].unique()

# Filter df_selected to include only tweets from those bot IDs
df_bots_only = df_selected[df_selected['Author ID'].isin(bot_ids)]

df_bots_only


Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year
4887135,@bmine3rz,0,2014-08-03 21:06:29+00:00,@bmine3rz_2014-08-03T21:06:29.000Z,4884359,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-2/ ... #3dprinting,2014.0
4887424,@bmine3rz,0,2014-08-03 21:18:24+00:00,@bmine3rz_2014-08-03T21:18:24.000Z,4884648,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-3/ ... #3dprinting,2014.0
4887297,@bmine3rz,0,2014-08-03 21:34:49+00:00,@bmine3rz_2014-08-03T21:34:49.000Z,4884521,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-4/ ... #3dprinting,2014.0
4886962,@bmine3rz,0,2014-08-03 21:35:54+00:00,@bmine3rz_2014-08-03T21:35:54.000Z,4884186,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-5/ ... #3dprinting,2014.0
4887124,@bmine3rz,0,2014-08-03 21:50:22+00:00,@bmine3rz_2014-08-03T21:50:22.000Z,4884348,!brand new in the box printrbot metal simple 3d printer ... - nd-new-in-the-box-printrbot-metal-simple-3d-printer-for-home-office-printer-6/ ... #3dprinting,2014.0
...,...,...,...,...,...,...,...
5957342,@3d_printing_jp,0,2013-12-30 14:08:56+00:00,@3d_printing_jp_2013-12-30T14:08:56.000Z,5954566,jump!!!!!,2013.0
4836352,@3d_printing_jp,0,2014-01-10 15:10:00+00:00,@3d_printing_jp_2014-01-10T15:10:00.000Z,4833576,jump!!!!! ...続く,2014.0
5100150,@3d_printing_jp,0,2014-01-04 19:10:13+00:00,@3d_printing_jp_2014-01-04T19:10:13.000Z,5097374,jump!!!!! ...続く,2014.0
5508384,@3d_printing_jp,0,2014-01-14 13:09:17+00:00,@3d_printing_jp_2014-01-14T13:09:17.000Z,5505608,nato軍ストラップタイプのベルト(替えバンド16mm)、ダークグリーンレッド,2014.0


In [22]:
confirmed = ["@buy3d", "@Hertz_ie", "@3d_printing_jp"]

#### Remove Tweets Suspected of being authored by bots or spam accounts
For now, remove only tweets from suspected bot accounts *if* they have zero total engagement.
This helps reduce noise from automated or spammy accounts without discarding possibly valuable high-engagement posts.

In [23]:
# Create a mask for bot accounts with 0 engagement
bot_zero_engagement = (
    df_selected["Author ID"].isin(likely_bots["Author ID"]) &
    (df_selected["Total Engagement"] == 0)
)

# Remove only those rows
df_cleaned = df_selected[~bot_zero_engagement]

# Assign back NOTE: anti-best practice
df_selected = df_cleaned


In [24]:
df_selected.shape

(6409310, 7)

## 7. Handle Known Spammers/Bots 
During data collection, some accounts were identified as potential spam or bot accounts. Tweets of known authors whose posts/tweets are mostly either adverts, give-aways or outright-spams

In [25]:
df_selected = df_selected[~df_selected['Author ID'].str.strip().isin(KNOWN_POSSIBLE_BOT_OR_SPAM_ACCOUNTS)]
df_selected.shape

(6226521, 7)

In [26]:
df_spammers = df_selected[df_selected['Author ID'].str.strip().isin(KNOWN_POSSIBLE_BOT_OR_SPAM_ACCOUNTS)]
df_spammers

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year


## 8. Drop Absolute Duplicates (!!!! This engagement based)

In [27]:
# Create a DataFrame of the duplicate rows that would be dropped
duplicates = df_selected[df_selected.duplicated(subset=["Normalized Text"], keep="first")]
print("Rows that would be dropped based on 'Normalized Text' duplicates:")
duplicates

Rows that would be dropped based on 'Normalized Text' duplicates:


Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year
3961804,@scoiel984,0,2015-03-28 22:11:51+00:00,@scoiel984_2015-03-28T22:11:51.000Z,3959028,! 3d-printed tumor replicas to better measure doses of cancer-fighting drugs on,2015.0
4237536,@adaul989,0,2015-02-02 20:21:14+00:00,@adaul989_2015-02-02T20:21:14.000Z,4234760,! 3d-printed tumor replicas to better measure doses of cancer-fighting drugs on,2015.0
5495818,@RobertW79886085,0,2014-02-21 08:47:04+00:00,@RobertW79886085_2014-02-21T08:47:04.000Z,5493042,"! 3d printer creates creepy sculpture of your face in dark, milk or white chocolate ?id_thread=64583 ...",2014.0
5495945,@Barbara07089370,0,2014-02-21 08:26:32+00:00,@Barbara07089370_2014-02-21T08:26:32.000Z,5493169,! 3d-printed eeg headset from openbci is customizable and open-source ?id_thread=64401 ...,2014.0
5376818,@Barbara23772984,0,2014-02-22 13:00:00+00:00,@Barbara23772984_2014-02-22T13:00:00.000Z,5374042,! 3d-printed hip implant lets teenager walk again ?id_thread=67437 ...,2014.0
...,...,...,...,...,...,...,...
4235590,@All_In_Comm,0,2015-12-06 07:27:45+00:00,@All_In_Comm_2015-12-06T07:27:45.000Z,4232814,"󾬐high quality 3d print soft case󾬐\n\n󾭚rm25 each shipping fee : rm6 , sabah&sarawak rm12\n\nor visit us via shopee...",2015.0
4235573,@All_In_Comm,0,2015-12-06 07:27:46+00:00,@All_In_Comm_2015-12-06T07:27:46.000Z,4232797,"󾬐high quality 3d print soft case󾬐\n\n󾭚rm25 each shipping fee : rm6 , sabah&sarawak rm12\n\nor visit us via shopee...",2015.0
4235526,@All_In_Comm,0,2015-12-06 07:27:48+00:00,@All_In_Comm_2015-12-06T07:27:48.000Z,4232750,"󾬐high quality 3d print soft case󾬐\n\n󾭚rm25 each shipping fee : rm6 , sabah&sarawak rm12\n\nor visit us via shopee...",2015.0
4471107,@All_In_Comm,0,2015-12-06 07:27:49+00:00,@All_In_Comm_2015-12-06T07:27:49.000Z,4468331,"󾬐high quality 3d print soft case󾬐\n\n󾭚rm25 each shipping fee : rm6 , sabah&sarawak rm12\n\nor visit us via shopee...",2015.0


In [28]:
duplicates.shape

(1961373, 7)

In [29]:
# Sort by descending total engagement
df_selected = df_selected.sort_values("Total Engagement", ascending=False)

# Drop duplicates while keeping the rows with the highest engagement
df_selected = df_selected.drop_duplicates(subset=["Normalized Text"], keep="first")
df_selected

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year
1400664,@Bill_Gross,439944,2017-10-17 21:48:10+00:00,@Bill_Gross_2017-10-17T21:48:10.000Z,1399480,"in the ""i'm getting old"" department.., a kid saw this and said, ""oh, you 3d-printed the 'save' icon.""",2017.0
2705145,@rustbeltlady,383384,2019-03-08 04:41:13+00:00,@rustbeltlady_2019-03-08T04:41:13.000Z,2703961,who gave my little brother a 3d printer,2019.0
2889185,@McJesse,348608,2021-12-31 00:34:14+00:00,@McJesse_2021-12-31T00:34:14.000Z,2887953,"got a 3d printer for christmas, realized i can use it to print any new year’s glasses i want.",2021.0
2964346,@olivelorraine_,283017,2021-08-15 20:47:32+00:00,@olivelorraine__2021-08-15T20:47:32.000Z,2963114,the vagina is the original 3d printer,2021.0
2448708,@rveenewman,213595,2019-01-09 13:25:07+00:00,@rveenewman_2019-01-09T13:25:07.000Z,2447524,a 3d printed light projected animation. proof that there's always new ways to animate everything. #3dprint #animation,2019.0
...,...,...,...,...,...,...,...
506874,@cokreeate,0,2020-02-04 05:34:50+00:00,@cokreeate_2020-02-04T05:34:50.000Z,505690,congrats to our january winner @willow408 \nstay tuned for our february give away category for a chance to win a 3d print of yourself or a 6x4 photo.\n.\n. #3dprinted #3dprinting #3dscanned #3dscanner #3dminime ... ?igshid=z3vl889r6waz ...,2020.0
6398807,@SoCalERC,0,2023-05-30 17:31:24+00:00,@SoCalERC_2023-05-30T17:31:24.000Z,6396031,congrats to our ih graduate students who presented at the @aiha conference!\namelia chen & dorothy nguyen: particle characteristics of dust-induced pulmonary toxicity in mines.\nnatalie ireland: nanoparticle penetration through lab coats.\nconnor krause: emissions during 3d printing,2023.0
2597255,@CommunityFinale,0,2019-04-24 07:30:49+00:00,@CommunityFinale_2019-04-24T07:30:49.000Z,2596071,congrats to our friends at & on a 3d printed this rad skateboard for party tonight.,2019.0
2176436,@gravitytankinc,0,2016-05-23 22:43:27+00:00,@gravitytankinc_2016-05-23T22:43:27.000Z,2175252,congrats to our friends at @pii_inc for launching their 3d printing business! pii-launches-new-3d-printing-business-unit-300271492.html ...,2016.0


In [30]:
df_selected.shape

(4265148, 7)

## 9. Remove Tweets with Fewer Than N Alphabetic Characters and Zero Total Engagement
identifying and dropping tweets that are likely non-informative spam or noise (i.e. tweets with very few alphabetic characters and no engagement).

In [31]:
# Create a mask for tweets with fewer than 5 alphabetic characters
mask_few_letters = df_selected["Normalized Text"].apply(lambda text: sum(1 for c in text if c.isalpha()) < 10)

# Create a mask for tweets with Total Engagement equal to 0
mask_zero_engagement = df_selected["Total Engagement"] == 0

# Combine both conditions
combined_mask = mask_few_letters & mask_zero_engagement

# Show tweets that satisfy the condition
tweets_to_drop = df_selected[combined_mask]
print("Tweets that will be dropped:")
tweets_to_drop

# Now drop these tweets from df_selected
df_selected = df_selected[~combined_mask]


Tweets that will be dropped:


In [32]:
df_selected

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year
1400664,@Bill_Gross,439944,2017-10-17 21:48:10+00:00,@Bill_Gross_2017-10-17T21:48:10.000Z,1399480,"in the ""i'm getting old"" department.., a kid saw this and said, ""oh, you 3d-printed the 'save' icon.""",2017.0
2705145,@rustbeltlady,383384,2019-03-08 04:41:13+00:00,@rustbeltlady_2019-03-08T04:41:13.000Z,2703961,who gave my little brother a 3d printer,2019.0
2889185,@McJesse,348608,2021-12-31 00:34:14+00:00,@McJesse_2021-12-31T00:34:14.000Z,2887953,"got a 3d printer for christmas, realized i can use it to print any new year’s glasses i want.",2021.0
2964346,@olivelorraine_,283017,2021-08-15 20:47:32+00:00,@olivelorraine__2021-08-15T20:47:32.000Z,2963114,the vagina is the original 3d printer,2021.0
2448708,@rveenewman,213595,2019-01-09 13:25:07+00:00,@rveenewman_2019-01-09T13:25:07.000Z,2447524,a 3d printed light projected animation. proof that there's always new ways to animate everything. #3dprint #animation,2019.0
...,...,...,...,...,...,...,...
506874,@cokreeate,0,2020-02-04 05:34:50+00:00,@cokreeate_2020-02-04T05:34:50.000Z,505690,congrats to our january winner @willow408 \nstay tuned for our february give away category for a chance to win a 3d print of yourself or a 6x4 photo.\n.\n. #3dprinted #3dprinting #3dscanned #3dscanner #3dminime ... ?igshid=z3vl889r6waz ...,2020.0
6398807,@SoCalERC,0,2023-05-30 17:31:24+00:00,@SoCalERC_2023-05-30T17:31:24.000Z,6396031,congrats to our ih graduate students who presented at the @aiha conference!\namelia chen & dorothy nguyen: particle characteristics of dust-induced pulmonary toxicity in mines.\nnatalie ireland: nanoparticle penetration through lab coats.\nconnor krause: emissions during 3d printing,2023.0
2597255,@CommunityFinale,0,2019-04-24 07:30:49+00:00,@CommunityFinale_2019-04-24T07:30:49.000Z,2596071,congrats to our friends at & on a 3d printed this rad skateboard for party tonight.,2019.0
2176436,@gravitytankinc,0,2016-05-23 22:43:27+00:00,@gravitytankinc_2016-05-23T22:43:27.000Z,2175252,congrats to our friends at @pii_inc for launching their 3d printing business! pii-launches-new-3d-printing-business-unit-300271492.html ...,2016.0


## 10. Pre-Identify Non-English Lang Tweets

In [33]:

# === SETTINGS ===
CHUNK_SIZE = 100_000
MAX_WORKERS = 6

# === Detection logic ===
def detect_language_langdetect(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# === Process a chunk ===
def process_chunk(chunk_df):
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        results = list(tqdm(
            executor.map(detect_language_langdetect, chunk_df["Normalized Text"]),
            total=len(chunk_df),
            desc="LangDetect"
        ))
    chunk_df["langdetect_is_english"] = results
    return chunk_df

# === CHUNK IN MEMORY ===
def chunk_dataframe(df, chunk_size=100_000):
    for i in range(0, len(df), chunk_size):
        yield df.iloc[i:i + chunk_size]

# === MAIN ===
processed_chunks = []
for i, chunk in enumerate(chunk_dataframe(df_selected, chunk_size=CHUNK_SIZE)):
    print(f"Processing chunk {i + 1}")
    processed_chunk = process_chunk(chunk.copy())
    processed_chunks.append(processed_chunk)

# Combine all processed chunks into one final DataFrame
df_all = pd.concat(processed_chunks, ignore_index=True)

# # Optional filter: keep only non-English
# df_filtered = df_all[~df_all["langdetect_is_english"]]

# # Save both if needed
# df_all.to_csv("tweets_with_langdetect.csv", index=False)
# df_filtered.to_csv("tweets_filtered_non_english.csv", index=False)


Processing chunk 1


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:11<00:00, 1403.82it/s]


Processing chunk 2


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:08<00:00, 1451.92it/s]


Processing chunk 3


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:15<00:00, 1326.90it/s]


Processing chunk 4


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:23<00:00, 1198.64it/s]


Processing chunk 5


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:22<00:00, 1210.40it/s]


Processing chunk 6


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:14<00:00, 1345.16it/s]


Processing chunk 7


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:22<00:00, 1212.98it/s]


Processing chunk 8


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:21<00:00, 1224.40it/s]


Processing chunk 9


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:17<00:00, 1283.48it/s]


Processing chunk 10


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:25<00:00, 1168.61it/s]


Processing chunk 11


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:30<00:00, 1110.36it/s]


Processing chunk 12


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:35<00:00, 1042.56it/s]


Processing chunk 13


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:39<00:00, 1006.73it/s]


Processing chunk 14


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:30<00:00, 1102.37it/s]


Processing chunk 15


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:36<00:00, 1031.67it/s]


Processing chunk 16


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:37<00:00, 1029.76it/s]


Processing chunk 17


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:38<00:00, 1018.22it/s]


Processing chunk 18


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:11<00:00, 1391.95it/s]


Processing chunk 19


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:25<00:00, 1166.65it/s]


Processing chunk 20


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:13<00:00, 1354.84it/s]


Processing chunk 21


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:14<00:00, 1333.75it/s]


Processing chunk 22


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:28<00:00, 1132.19it/s]


Processing chunk 23


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:17<00:00, 1292.71it/s]


Processing chunk 24


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:06<00:00, 1498.25it/s]


Processing chunk 25


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:26<00:00, 1158.53it/s]


Processing chunk 26


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:25<00:00, 1168.40it/s]


Processing chunk 27


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:31<00:00, 1097.31it/s]


Processing chunk 28


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:27<00:00, 1149.24it/s]


Processing chunk 29


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:27<00:00, 1141.02it/s]


Processing chunk 30


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:21<00:00, 1231.20it/s]


Processing chunk 31


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:33<00:00, 1072.28it/s]


Processing chunk 32


LangDetect: 100%|██████████████████████████████████████████████████████████████████| 100000/100000 [01:42<00:00, 977.85it/s]


Processing chunk 33


LangDetect: 100%|██████████████████████████████████████████████████████████████████| 100000/100000 [01:40<00:00, 999.12it/s]


Processing chunk 34


LangDetect: 100%|██████████████████████████████████████████████████████████████████| 100000/100000 [01:40<00:00, 998.04it/s]


Processing chunk 35


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:37<00:00, 1021.92it/s]


Processing chunk 36


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:32<00:00, 1076.37it/s]


Processing chunk 37


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:27<00:00, 1138.40it/s]


Processing chunk 38


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:12<00:00, 1371.88it/s]


Processing chunk 39


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:27<00:00, 1146.13it/s]


Processing chunk 40


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:29<00:00, 1112.50it/s]


Processing chunk 41


LangDetect: 100%|██████████████████████████████████████████████████████████████████| 100000/100000 [01:43<00:00, 963.94it/s]


Processing chunk 42


LangDetect: 100%|█████████████████████████████████████████████████████████████████| 100000/100000 [01:31<00:00, 1088.67it/s]


Processing chunk 43


LangDetect: 100%|███████████████████████████████████████████████████████████████████| 61493/61493 [00:55<00:00, 1107.46it/s]


In [34]:
df_all

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year,langdetect_is_english
0,@Bill_Gross,439944,2017-10-17 21:48:10+00:00,@Bill_Gross_2017-10-17T21:48:10.000Z,1399480,"in the ""i'm getting old"" department.., a kid saw this and said, ""oh, you 3d-printed the 'save' icon.""",2017.0,True
1,@rustbeltlady,383384,2019-03-08 04:41:13+00:00,@rustbeltlady_2019-03-08T04:41:13.000Z,2703961,who gave my little brother a 3d printer,2019.0,True
2,@McJesse,348608,2021-12-31 00:34:14+00:00,@McJesse_2021-12-31T00:34:14.000Z,2887953,"got a 3d printer for christmas, realized i can use it to print any new year’s glasses i want.",2021.0,True
3,@olivelorraine_,283017,2021-08-15 20:47:32+00:00,@olivelorraine__2021-08-15T20:47:32.000Z,2963114,the vagina is the original 3d printer,2021.0,True
4,@rveenewman,213595,2019-01-09 13:25:07+00:00,@rveenewman_2019-01-09T13:25:07.000Z,2447524,a 3d printed light projected animation. proof that there's always new ways to animate everything. #3dprint #animation,2019.0,True
...,...,...,...,...,...,...,...,...
4261488,@cokreeate,0,2020-02-04 05:34:50+00:00,@cokreeate_2020-02-04T05:34:50.000Z,505690,congrats to our january winner @willow408 \nstay tuned for our february give away category for a chance to win a 3d print of yourself or a 6x4 photo.\n.\n. #3dprinted #3dprinting #3dscanned #3dscanner #3dminime ... ?igshid=z3vl889r6waz ...,2020.0,True
4261489,@SoCalERC,0,2023-05-30 17:31:24+00:00,@SoCalERC_2023-05-30T17:31:24.000Z,6396031,congrats to our ih graduate students who presented at the @aiha conference!\namelia chen & dorothy nguyen: particle characteristics of dust-induced pulmonary toxicity in mines.\nnatalie ireland: nanoparticle penetration through lab coats.\nconnor krause: emissions during 3d printing,2023.0,True
4261490,@CommunityFinale,0,2019-04-24 07:30:49+00:00,@CommunityFinale_2019-04-24T07:30:49.000Z,2596071,congrats to our friends at & on a 3d printed this rad skateboard for party tonight.,2019.0,True
4261491,@gravitytankinc,0,2016-05-23 22:43:27+00:00,@gravitytankinc_2016-05-23T22:43:27.000Z,2175252,congrats to our friends at @pii_inc for launching their 3d printing business! pii-launches-new-3d-printing-business-unit-300271492.html ...,2016.0,True


In [35]:
df_filtered = df_all[df_all['langdetect_is_english'] == False]
df_filtered 

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year,langdetect_is_english
35,@oemonur,37315,2022-09-19 18:30:35+00:00,@oemonur_2022-09-19T18:30:35.000Z,6574681,işten kovulmadan önce son maaşımla 3d printer almıştım. yaptığım en mantıklı yatırım olabilir. son bi haftadır deliler gibi dantel basıyorum.,2022.0,False
48,@dr_chaku,26629,2020-03-25 15:06:14+00:00,@dr_chaku_2020-03-25T15:06:14.000Z,452635,terima kasih geng 3d printing malaysia sebab beri face shield 3d print secara percuma di hospital dan klinik seluruh malaysia. kami hargai!,2020.0,False
61,@canitti,21371,2020-03-16 19:48:52+00:00,@canitti_2020-03-16T19:48:52.000Z,322795,"yardımlaşmayı öğreneceğiz...\n\ni̇talya'nın brescia bölgesinde bir hastane, yoğun bakımdaki 250 korona virüs hastası için solunum cihazına bağlayıcı vana tükenince 3d printer üreticisi bir firma, 2 gün uyumayıp birim maliyeti 1 euro'dan az şekilde 100 tane üretip hastaneye vermiş.",2020.0,False
68,@printingguns,19855,2022-07-08 20:34:45+00:00,@printingguns_2022-07-08T20:34:45.000Z,6558904,i 3d printing,2022.0,False
110,@Puchiluh,12782,2020-08-21 10:23:03+00:00,@Puchiluh_2020-08-21T10:23:03.000Z,604431,"en la plaza de la luna de madrid hay un establecimiento donde el dueño enseña una palabra en chino cada día en una pizarrita os parecerá una chorrada pero que todos los días el dueño se ponga a pensar con ilusión qué va a poner, es muy dulce",2020.0,False
...,...,...,...,...,...,...,...,...
4261122,@RPES12,0,2012-02-27 14:27:38+00:00,@RPES12_2012-02-27 14:27:38+00:00,203148,congrats :-)“ @ttranpham : just joined objet geometries (largest private 3d printer company) | tuan.tranpham@objet.com | ”,2012.0,False
4261321,@3DPBelgian,0,2020-12-14 20:11:10+00:00,@3DPBelgian_2020-12-14T20:11:10.000Z,582464,congrats. atus/1338577100584968197 ...,2020.0,False
4261334,@elsajohnny1,0,2020-07-05 05:54:15+00:00,@elsajohnny1_2020-07-05T05:54:15.000Z,487333,congratulations,2020.0,False
4261374,@metaversalis,0,2012-02-27 18:14:16+00:00,@metaversalis_2012-02-27 18:14:16+00:00,203199,congrats! rt @ttranpham : just joined objet geometries (largest private 3d printer company) tuan.tranpham@objet.com ...,2012.0,False


In [39]:
df_filtered_for_processing = df_filtered[["Normalized Text", "row_num"]]
df_filtered_for_processing
df_filtered_for_processing.to_csv("non_english_tweets.csv", index=False)

In [40]:
df_selected = df_all #NOTE: anti-best practice

## 11. Tweets That Doesnt Contain Keywords and Has 0 Engagement

In [41]:
##----
# This pattern covers:
#pattern = r"(?i)(#?additive manufacturing|#?3d[-\s]?print(?:ed|ing|er)?)"
#pattern = r"(?i)(#?additive(?:\s*manufacturing|maufacturing)|#?3d[-\s]?print(?:ed|ing|er)?)"
#pattern = r"(?i)(#?additive(?:\s*manufacturing|maufacturing)?|#?3d[-\s]?print(?:ed|ing|er)?|#?metal)"
#pattern = r"(?i)(#?additive(?:\s*manufacturing|maufacturing)?|#?3[-\s]?d[-\s]?print(?:ed|ing|er)?|#?metal)"
# Updated regex pattern to capture multiple spaces or hyphens between words:
# - Matches "additive manufacturing" (with optional misspelling "maufacturing") with any spacing.
# - Matches 3D printing variants including forms like "3d  printed", "3-d printed", etc.
# - Also matches "metal".
#pattern = r"(?i)(#?additive(?:\s*manufacturing|maufacturing)?|#?3[-\s]*d[-\s]*print(?:ed|ing|er)?|#?metal)"
pattern = r"(?i)(#?additive(?:\s*manufacturing|maufacturing)?|#?3[-\s]*d|#?3[-\s]*d[-\s]*print(?:ed|ing|er)?|#?printing|#?metal)"
# Create a mask for tweets that do NOT contain any of the keywords
mask_no_keywords = ~df_selected["Normalized Text"].str.contains(pattern, regex=True)

# Create a mask for tweets with Total Engagement equal to 0
mask_zero_engagement = df_selected["Total Engagement"] == 0

# Ensure langdetect flag is False (i.e., not English)
mask_english = df_selected["langdetect_is_english"] == True

# Combine all three conditions:
# - No keywords
# - Zero engagement
# - Not detected as English
mask_combined = mask_no_keywords & mask_zero_engagement & mask_english

# Filter the DataFrame accordingly
tweets_without_keywords_and_zero_engagement = df_selected[mask_combined]

tweets_without_keywords_and_zero_engagement

  mask_no_keywords = ~df_selected["Normalized Text"].str.contains(pattern, regex=True)


Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year,langdetect_is_english
1063773,@delukart,0,2021-07-08 23:42:27+00:00,@delukart_2021-07-08T23:42:27.000Z,2947940,picture mosaic is the glowing in the dark vape holder! the is only two of the of their kind @delukart #etsyshop #etsy #delukart #vapeholder #vape #glowinthedark #glowinthedarkvape #vaping #vapeholder #resin #glowinthedarkresin ...,2021.0,True
1063778,@3DPrintedPhone,0,2015-01-28 18:06:44+00:00,@3DPrintedPhone_2015-01-28T18:06:44.000Z,4205724,picture this: technology tightens focus on who's watching women,2015.0,True
1063841,@3d_monkey,0,2023-04-27 12:01:48+00:00,@3d_monkey_2023-04-27T12:01:48.000Z,6353304,new! anycubic photon mono 4k 6.23 inch 4k monochrome lcd screen! \n\nshop now!,2023.0,True
1063862,@3d_monkey,0,2023-04-29 12:00:35+00:00,@3d_monkey_2023-04-29T12:00:35.000Z,6428789,new! anycubic photon m3 max 13.6 inch 7k monochrome lcd screen \n\nshop today!,2023.0,True
1063898,@DscheyH,0,2013-06-15 07:39:53+00:00,@DscheyH_2013-06-15T07:39:53.000Z,5662384,"pictures of this years #tedx event in zug, switzerland. thanks @tedxzug tml ...",2013.0,True
...,...,...,...,...,...,...,...,...
4261353,@joeltelling,0,2015-12-15 20:58:45+00:00,@joeltelling_2015-12-15T20:58:45.000Z,4262985,congrats! keep the momentum going! tus/676866126568136709 ...,2015.0,True
4261383,@PrintingDDD,0,2018-06-25 18:18:56+00:00,@PrintingDDD_2018-06-25T18:18:56.000Z,3446026,congrats to our friends @madeinspace for winning nasa contract for next-gen ‘vulcan’ manufacturing system grats-to-our-friends-madeinspace-for-winning-nasa-contract-for-next-gen-vulcan-manufacturing-system/ ...,2018.0,True
4261459,@AxisProto,0,2012-11-26 14:51:41+00:00,@AxisProto_2012-11-26 14:51:41+00:00,30548,congrats to the #argos on the #greycup win. perhaps the #torontomapleleafs can learn from this...,2012.0,True
4261467,@Io3DP,0,2017-06-30 15:14:45+00:00,@Io3DP_2017-06-30T15:14:45.000Z,1854041,"congrats to simon, gavin & john, you've each won a £10 #rigidink voucher for taking part in our recent survey! we'll be in touch soon :)",2017.0,True


#### Writeout to csv

In [42]:
##----
# Write it out to csv for review
filename = f"no_keywords_tweets_{CURRENT_DATE_TIME}.csv"
tweets_without_keywords_and_zero_engagement.to_csv(filename, index=False)
# spammer_tweets.to_csv(filename, index=False)
# print(f"CSV file '{filename}' has been written.")
print(df_selected.shape)

(4261493, 8)


In [43]:
##--
### Remove them from df_selected
df_selected = df_selected[~mask_combined]
df_selected

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,Year,langdetect_is_english
0,@Bill_Gross,439944,2017-10-17 21:48:10+00:00,@Bill_Gross_2017-10-17T21:48:10.000Z,1399480,"in the ""i'm getting old"" department.., a kid saw this and said, ""oh, you 3d-printed the 'save' icon.""",2017.0,True
1,@rustbeltlady,383384,2019-03-08 04:41:13+00:00,@rustbeltlady_2019-03-08T04:41:13.000Z,2703961,who gave my little brother a 3d printer,2019.0,True
2,@McJesse,348608,2021-12-31 00:34:14+00:00,@McJesse_2021-12-31T00:34:14.000Z,2887953,"got a 3d printer for christmas, realized i can use it to print any new year’s glasses i want.",2021.0,True
3,@olivelorraine_,283017,2021-08-15 20:47:32+00:00,@olivelorraine__2021-08-15T20:47:32.000Z,2963114,the vagina is the original 3d printer,2021.0,True
4,@rveenewman,213595,2019-01-09 13:25:07+00:00,@rveenewman_2019-01-09T13:25:07.000Z,2447524,a 3d printed light projected animation. proof that there's always new ways to animate everything. #3dprint #animation,2019.0,True
...,...,...,...,...,...,...,...,...
4261488,@cokreeate,0,2020-02-04 05:34:50+00:00,@cokreeate_2020-02-04T05:34:50.000Z,505690,congrats to our january winner @willow408 \nstay tuned for our february give away category for a chance to win a 3d print of yourself or a 6x4 photo.\n.\n. #3dprinted #3dprinting #3dscanned #3dscanner #3dminime ... ?igshid=z3vl889r6waz ...,2020.0,True
4261489,@SoCalERC,0,2023-05-30 17:31:24+00:00,@SoCalERC_2023-05-30T17:31:24.000Z,6396031,congrats to our ih graduate students who presented at the @aiha conference!\namelia chen & dorothy nguyen: particle characteristics of dust-induced pulmonary toxicity in mines.\nnatalie ireland: nanoparticle penetration through lab coats.\nconnor krause: emissions during 3d printing,2023.0,True
4261490,@CommunityFinale,0,2019-04-24 07:30:49+00:00,@CommunityFinale_2019-04-24T07:30:49.000Z,2596071,congrats to our friends at & on a 3d printed this rad skateboard for party tonight.,2019.0,True
4261491,@gravitytankinc,0,2016-05-23 22:43:27+00:00,@gravitytankinc_2016-05-23T22:43:27.000Z,2175252,congrats to our friends at @pii_inc for launching their 3d printing business! pii-launches-new-3d-printing-business-unit-300271492.html ...,2016.0,True


## Write the cleaned dataframe to csv

In [45]:
# Create filename
filename = f"{CURRENT_DATE_TIME}_cleaned_data_pre_near_duplicates_handled.csv"

# # Filter duplicates
# df_selected = df_selected[ddf_selected.duplicated(subset="Normalized Texts", keep=False)]
# print("Duplicates found:")
# print(duplicates)

# Save to CSV
df_selected.to_csv(filename, index=False)


In [None]:
#amir = df_selected[df_selected["Author ID"] == "@IAmr45"]
amir = df_all[df_selected["Author ID"] == "@IAmr45"]
amir

In [None]:
df_selected

In [None]:
# Filter out rows where both detectors say the tweet is English
# df_filtered = df_selected[~(
#     df_selected["fasttext_is_english"] & df_selected["langdetect_is_english"]
# )]
df_selected