In [4]:
import pandas as pd
import math
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [3]:
sampled_per_year = pd.read_csv("random_sample_cleaned_dataset_20250520_144016.csv")
sampled_per_year['Date'] = pd.to_datetime(sampled_per_year['Date'], utc=True)

In [5]:
import math
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Assume `sampled_per_year` is your existing DataFrame
df = sampled_per_year.copy()

# 1. Ensure your Date column is datetime and extract year
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df['year'] = df['Date'].dt.year

# 2. Initialize the NER pipeline
MODEL = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model     = AutoModelForTokenClassification.from_pretrained(MODEL)
ner_pipe  = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    batch_size=64,
    device=-1
)

# 3. Define a simple chunker to avoid OOM on millions of rows
def chunker(iterable, size):
    it = iter(iterable)
    while True:
        batch = []
        try:
            for _ in range(size):
                batch.append(next(it))
        except StopIteration:
            if batch:
                yield batch
            break
        yield batch

# 4. Run NER in chunks with a progress bar and collect results
all_texts   = df['Normalized Text'].tolist()
batch_size  = 500
n_batches   = math.ceil(len(all_texts) / batch_size)
all_entities = []

for batch in tqdm(chunker(all_texts, batch_size),
                  total=n_batches,
                  desc="Running NER"):
    ents = ner_pipe(batch)
    all_entities.extend(ents)

# 5. Attach the extracted entities back to the DataFrame
df['entities'] = all_entities

# 6. Explode entities so that each entity is its own row
exploded = (
    df
    .explode('entities')
    .dropna(subset=['entities'])
    .reset_index(drop=True)
)

# 7. Pull out the pieces of each entity
exploded['entity_text']  = exploded['entities'].apply(lambda e: e['word'])
exploded['entity_type']  = exploded['entities'].apply(lambda e: e['entity_group'])
exploded['entity_score'] = exploded['entities'].apply(lambda e: e['score'])

# 8. Now group by year to get overall counts and top entities
# 8a. Total entity mentions per year
yearly_counts = (
    exploded
    .groupby('year')
    .size()
    .reset_index(name='total_entity_mentions')
)

# 8b. Top 10 entities per year
top_entities = (
    exploded
    .groupby(['year','entity_text'])
    .size()
    .reset_index(name='count')
    .sort_values(['year','count'], ascending=[True, False])
    .groupby('year')
    .head(10)
)

# 9. Inspect results
print(yearly_counts)
print(top_entities)


  df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Running NER:   0%|          | 0/1 [00:00<?, ?it/s]

   year  total_entity_mentions
0  2014                      1
1  2020                      2
2  2021                      1
   year entity_text  count
0  2014          at      1
1  2020       ##ish      1
2  2020          um      1
3  2021          al      1


In [6]:
top_entities

Unnamed: 0,year,entity_text,count
0,2014,at,1
1,2020,##ish,1
2,2020,um,1
3,2021,al,1


In [7]:
df

Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,langdetect_is_english,Year,year,entities
0,@donttrythis,207,2012-11-14 18:57:16+00:00,@donttrythis_2012-11-14 18:57:16+00:00,42639,"yay! my makerbot replicator 2 is packed, shipp...",True,2012,2012,[]
1,@themarginalian,21,2012-06-02 17:00:50+00:00,@themarginalian_2012-06-02 17:00:50+00:00,150412,ooh! mit alums nervous system's gorgeous 3d-p...,True,2012,2012,[]
2,@EFF,64,2012-12-14 19:51:11+00:00,@EFF_2012-12-14 19:51:11+00:00,10264,. @eff is targeting dangerous 3d printing p...,True,2012,2012,[]
3,@paleofuture,21,2012-08-02 03:44:33+00:00,@paleofuture_2012-08-02 03:44:33+00:00,119345,a 3d printer that only prints 3d printers,True,2012,2012,[]
4,@medialab,23,2012-08-15 20:20:19+00:00,@medialab_2012-08-15 20:20:19+00:00,105660,another look at neri oxman’s incredible #3d ...,True,2012,2012,[]
...,...,...,...,...,...,...,...,...,...,...
115,@HashinJohnson,128,2023-01-16 00:50:32+00:00,@HashinJohnson_2023-01-16T00:50:32.000Z,6347230,tested my hash bucket s9 immersion cooled spac...,True,2023,2023,[]
116,@ravinash_kk,57,2023-12-05 19:34:52+00:00,@ravinash_kk_2023-12-05T19:34:52.000Z,6356457,1x pdra & 1 x phd position available @imperia...,True,2023,2023,[]
117,@loyalmoses,76,2023-10-07 20:14:34+00:00,@loyalmoses_2023-10-07T20:14:34.000Z,6381695,3d printer companies that are the last to inc...,True,2023,2023,[]
118,@dddpworld,33,2023-02-06 12:14:21+00:00,@dddpworld_2023-02-06T12:14:21.000Z,6394435,the impossible gear fidget on @thangs3d . thi...,True,2023,2023,[]


In [15]:
import pandas as pd
import pprint

# 1. Ensure pandas will show full contents
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# 2. Filter to rows where the 'entities' list is non-empty
df_nonnull = df[df['entities'].apply(lambda ents: bool(ents))]

# 3a. Simply print the DataFrame (full rows, full colwidth)
df_nonnull



Unnamed: 0,Author ID,Total Engagement,Date,tweet_id,row_num,Normalized Text,langdetect_is_english,Year,year,entities
22,@kosikyou2,60,2014-11-09 16:24:39+00:00,@kosikyou2_2014-11-09T16:24:39.000Z,5280046,atlanta hawks logo wood pattern 3d printed iphone 6 plus case nta-hawks-logo-wood-pattern-3d-printed-iphone-6-plus-case.html ...,True,2014,2014,"[{'entity_group': 'ORG', 'score': 0.6056939, 'word': 'at', 'start': 0, 'end': 2}]"
89,@AUSTNigeria,62,2020-05-04 16:15:35+00:00,@AUSTNigeria_2020-05-04T16:15:35.000Z,493450,"earlier today: \n\nthe materials science and engineering department, donated 3d-printed reusable medical face shields to dr. aisha umar; director of clinical services & c-mac, national hospital, abuja.",True,2020,2020,"[{'entity_group': 'PER', 'score': 0.9735368, 'word': '##ish', 'start': 126, 'end': 129}, {'entity_group': 'PER', 'score': 0.6936282, 'word': 'um', 'start': 131, 'end': 133}]"
92,@dangerousladies,79,2021-10-27 17:13:09+00:00,@dangerousladies_2021-10-27T17:13:09.000Z,3021625,3d print files for alisaie and alphinaud's ponytail holders from final fantasy xiv! comes in wholes and halves for easy printing. 9705845/alisaie-and-alphinaud-leveilleurs ...,True,2021,2021,"[{'entity_group': 'PER', 'score': 0.9227886, 'word': 'al', 'start': 32, 'end': 34}]"
