# BERTrend quickstart
The purpose of this notebook is to complement the existing demos available in the directory `bertrend/demos` with some code examples that explain how to integrate BERTrend with your application code.

In [1]:
%load_ext autoreload
%autoreload 2

## BERTrend installation

In [2]:
from pathlib import Path
import pandas as pd
from pandas import Timestamp
from IPython.display import display
from loguru import logger
import os

from bertrend import DATA_PATH
from bertrend.BERTrend import BERTrend
from bertrend import MODELS_DIR
from bertrend.utils.data_loading import load_data, split_data, TEXT_COLUMN
from bertrend.services.embedding_service import EmbeddingService
from bertrend.BERTopicModel import BERTopicModel
from bertrend.topic_analysis.topic_description import generate_topic_description
from bertrend.trend_analysis.weak_signals import analyze_signal


In [3]:
#!pip install bertrend

### Configuration of topic models

In [4]:
# Topic model with default parameters - each parameter of BERTopic can be modified from the constructor or can be read from a configuration file
# overrides the default config to use English
config = '''
# Default configuration file to be used for topic model

# Global parameters
[global]
language = "English"

# BERTopic parameters: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.__init__
[bertopic_model]
top_n_words = 10
verbose = true
representation_model = ["MaximalMarginalRelevance"] # KeyBERTInspired, OpenAI
zeroshot_topic_list = []
zeroshot_min_similarity = 0

# UMAP parameters: https://umap-learn.readthedocs.io/en/latest/api.html
[umap_model]
n_neighbors = 5
n_components = 5
min_dist = 0.0
metric = "cosine"
random_state = 42

# HDBSCAN parameters: https://hdbscan.readthedocs.io/en/latest/api.html
[hdbscan_model]
min_cluster_size = 5
min_samples = 5
metric = "euclidean"
cluster_selection_method = "eom"
prediction_data = true

# CountVectorizer: https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
[vectorizer_model]
ngram_range = [1, 1]
stop_words = true # If true, will check `language` parameter and load associated stopwords file
min_df = 2

# ClassTfidfTransformer: https://maartengr.github.io/BERTopic/api/ctfidf.html
[ctfidf_model]
bm25_weighting = false
reduce_frequent_words = true

# MaximalMarginalRelevance: https://maartengr.github.io/BERTopic/api/representation/mmr.html
[mmr_model]
diversity = 0.3

# Reduce outliers: https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.reduce_outliers
[reduce_outliers]
strategy = "c-tf-idf"
'''

topic_model = BERTopicModel(config)

In [5]:
# The TopicModel class is mainly a wrapper around BERTopic and can be used as-is, for example for a first analysis of data (without considering evolving trends, but this is not mandatory at all)


## Using BERTrend for retrospective analysis

### Instantiation of BERTrend


In the case of a **retrospective trend analysis** task, the goal is to identify and evaluate patterns or changes over time within a dataset, allowing for insights into historical performance, behaviors, or events that can inform future decision-making and strategy development.

In this context, the general principle consists in splitting the past data into different time slices. Then each dataset is used to train a separate topic models. Each topic model description corresponding to the older data slice is merged with the next one and decay factors are applied. This allows to have a vision of topic evolution over time

In [6]:
# Basic creation of the object and parametrization
# BERTrend uses several topic models; therefore, it is necessary to pass a topic_model object as a reference
bertrend = BERTrend(topic_model=topic_model)

### 1. Gather historical data to be analyzed


In [7]:
# Here some Trump tweets from: https://github.com/MarkHershey/CompleteTrumpTweetsArchive/blob/master/data/realDonaldTrump_in_office.csv
# !wget "https://raw.githubusercontent.com/MarkHershey/CompleteTrumpTweetsArchive/refs/heads/master/data/realDonaldTrump_in_office.csv"
df = pd.read_csv("../data/bertopic/trump_tweets.csv",  sep=',',quotechar='"', skipinitialspace=True)
# BERTrend expects specific data format
df = df.rename(columns={'Time': 'timestamp', 'Tweet URL': 'url', "Tweet Text": "text"})

In [8]:
print(df.columns.tolist())

['Unnamed: 0', 'id', 'text', 'isRetweet', 'isDeleted', 'device', 'favorites', 'retweets', 'timestamp', 'isFlagged']


In [9]:
df = (
    df                   # <- start with your frame
    .drop(columns=["Unnamed: 0"])              # optional: tidy up
    .rename(columns={"id": "ID"})              # give it the name your old code expects
)

df["source"] = df["ID"]                        # 1️⃣ identifier for source
df["url"] = "https://twitter.com/i/web/status/" + df["ID"].astype(str)  # 2️⃣ tweet URL
df["document_id"] = df.index                   # 3️⃣ unique document id

df.reset_index(inplace=True, drop=True)
df.head(5)

Unnamed: 0,ID,text,isRetweet,isDeleted,device,favorites,retweets,timestamp,isFlagged,source,url,document_id
0,98454970654916608,republicans and democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f,98454970654916608,https://twitter.com/i/web/status/9845497065491...,0
1,1234653427789070336,i was thrilled to be back in the great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f,1234653427789070336,https://twitter.com/i/web/status/1234653427789...,1
2,1304875170860015617,the unsolicited mail in ballot scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f,1304875170860015617,https://twitter.com/i/web/status/1304875170860...,2
3,1223640662689689602,getting a little exercise this morning,f,f,Twitter for iPhone,285863,30209,2020-02-01 16:14:02,f,1223640662689689602,https://twitter.com/i/web/status/1223640662689...,3
4,1215247978966986752,thank you elise,f,f,Twitter for iPhone,48510,11608,2020-01-09 12:24:31,f,1215247978966986752,https://twitter.com/i/web/status/1215247978966...,4


In [10]:
df.index

RangeIndex(start=0, stop=45355, step=1)

In [11]:
# Selection of a subset of data
df = df.head(1000)

### 2. Embed data

In [12]:
client_secret = "bd76aa472dd91aed4a56bf1935dbb802583c119824380d8567086579c0ef3324"

# --- 1. Patch the two discovery methods ---------------------------------
from bertrend.services import embedding_client              # <-- import *module*, not just the class
EmbeddingAPIClient = embedding_client.EmbeddingAPIClient    # handy alias

def _lmstudio_model_name(self):
    # return whatever model LM Studio has loaded
    return "text-embedding-multilingual-e5-large-instruct"

def _lmstudio_num_workers(self):
    # LM Studio answers requests serially, so 1 is fine
    return 1

EmbeddingAPIClient.get_api_model_name = _lmstudio_model_name
EmbeddingAPIClient.get_num_workers    = _lmstudio_num_workers


# --- 2. Configure and build the service (NO num_workers kwarg) ----------
embedding_service = EmbeddingService(
    local=False,
    url="http://127.0.0.1:1234",
    model_name="text-embedding-multilingual-e5-large-instruct",
)

embeddings, *_ = embedding_service.embed(texts=df["text"])
print(len(embeddings), "embeddings OK ✔︎")

# --- 3. Use it -----------------------------------------------------------
embeddings, token_strings, token_embeddings = embedding_service.embed(texts=df["text"])


[32m2025-05-15 10:34:42.437[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36m__init__[0m:[36m47[0m - [34m[1mEmbeddingAPIClient(model_name='text-embedding-bge-base-en-v1.5', num_workers=4)[0m
[32m2025-05-15 10:34:42.438[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m260[0m - [34m[1mComputing embeddings...[0m
[32m2025-05-15 10:34:42.438[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36membed_documents[0m:[36m118[0m - [34m[1mCalling EmbeddingAPI using model: text-embedding-bge-base-en-v1.5[0m
[32m2025-05-15 10:34:42.439[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36membed_documents[0m:[36m122[0m - [34m[1mComputing embeddings on 1000 documents using (1) batches...[0m
[32m2025-05-15 10:34:52.188[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_service[0m:[36m_remote_embed_documents[0m:[36m260

1000 embeddings OK ✔︎


In [13]:
from bertrend.services.embedding_client import EmbeddingAPIClient
client = EmbeddingAPIClient("http://127.0.0.1:1234", "text-embedding-multilingual-e5-large-instruct")
vecs = client.embed_documents(["alpha", "beta", "gamma"])
print(len(vecs), "vectors OK")



[32m2025-05-15 10:35:01.616[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36m__init__[0m:[36m47[0m - [34m[1mEmbeddingAPIClient(model_name='text-embedding-bge-base-en-v1.5', num_workers=4)[0m
[32m2025-05-15 10:35:01.616[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36membed_documents[0m:[36m118[0m - [34m[1mCalling EmbeddingAPI using model: text-embedding-bge-base-en-v1.5[0m
[32m2025-05-15 10:35:01.617[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36membed_documents[0m:[36m122[0m - [34m[1mComputing embeddings on 3 documents using (1) batches...[0m


3 vectors OK


In [14]:
from bertrend.services.embedding_client import EmbeddingAPIClient
client = EmbeddingAPIClient("http://127.0.0.1:1234", "text-embedding-multilingual-e5-large-instruct")

# single-thread call
print(client.embed_documents(["one", "two"]))

# process backend call (if you chose option B)
from joblib import Parallel, delayed
Parallel(n_jobs=2)(delayed(client.embed_query)(w) for w in ["a", "b"])


[32m2025-05-15 10:35:01.707[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36m__init__[0m:[36m47[0m - [34m[1mEmbeddingAPIClient(model_name='text-embedding-bge-base-en-v1.5', num_workers=4)[0m
[32m2025-05-15 10:35:01.708[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36membed_documents[0m:[36m118[0m - [34m[1mCalling EmbeddingAPI using model: text-embedding-bge-base-en-v1.5[0m
[32m2025-05-15 10:35:01.708[0m | [34m[1mDEBUG   [0m | [36mbertrend.services.embedding_client[0m:[36membed_documents[0m:[36m122[0m - [34m[1mComputing embeddings on 2 documents using (1) batches...[0m


[[-0.0254928320646286, 0.02283906377851963, -0.03054661676287651, -0.06402581185102463, 0.04796752706170082, 0.014829971827566624, 0.04840866103768349, 0.01880030333995819, -0.021542904898524284, -0.07306718081235886, 0.013749617151916027, -0.038711968809366226, -0.054425884038209915, -0.0044035594910383224, -0.011418592184782028, 0.00541445380076766, 0.04740703105926514, -0.01243699062615633, -0.03242269158363342, -0.013793851248919964, 0.023723222315311432, 0.05899536609649658, -0.004368129651993513, 0.0010357870487496257, -0.0006234457832761109, -0.0015822353307157755, 0.012268541380763054, -0.04606388881802559, -0.07926363497972488, 0.01065139565616846, 0.06731739640235901, 0.04565108194947243, -0.0028805106412619352, -0.02104395627975464, 0.005085420794785023, 0.014398405328392982, 0.05362216383218765, -0.033050112426280975, 0.0034936992451548576, -0.00681263767182827, -0.042198602110147476, -0.0070241414941847324, 0.0011705330107361078, -0.0058874995447695255, -0.0462381690740585

2025-05-15 10:35:04.083 | DEBUG    | bertrend.services.embedding_client:embed_query:81 - POST http://127.0.0.1:1234/v1/embeddings [1 query]
2025-05-15 10:35:04.083 | DEBUG    | bertrend.services.embedding_client:embed_query:81 - POST http://127.0.0.1:1234/v1/embeddings [1 query]


[[0.0010414344724267721,
  0.04018814489245415,
  -0.022314293310046196,
  -0.02049190178513527,
  0.04621792212128639,
  0.021751003339886665,
  0.01844034343957901,
  -0.02910696342587471,
  -0.03024175576865673,
  -0.04458827152848244,
  -0.02504551038146019,
  0.011120066978037357,
  -0.09143443405628204,
  -0.006770624313503504,
  0.01668928749859333,
  0.04918559268116951,
  0.03294914960861206,
  0.004454770125448704,
  0.008042257279157639,
  -0.016272692009806633,
  -0.0003186255635228008,
  0.03271683305501938,
  0.028828030452132225,
  0.040923040360212326,
  0.039220698177814484,
  -0.046480823308229446,
  0.03508533164858818,
  -0.008057663217186928,
  -0.016612770035862923,
  -0.00370211573317647,
  0.015506848692893982,
  0.025460612028837204,
  0.012782738544046879,
  -0.03145822882652283,
  0.026704054325819016,
  -0.03587443754076958,
  0.0036219931207597256,
  -0.03224756568670273,
  -0.008498276583850384,
  0.00861964002251625,
  -0.03630916774272919,
  -0.005369517

In [15]:
from bertrend.services.embedding_client import EmbeddingAPIClient
print(EmbeddingAPIClient.mro())   # SecureAPIClient should NOT appear here


[<class 'bertrend.services.embedding_client.EmbeddingAPIClient'>, <class 'langchain_core.embeddings.embeddings.Embeddings'>, <class 'abc.ABC'>, <class 'object'>]


### 3. Split the data into time slices

This can be done manually for some reason or can be done automatically based on a specified time granularity

In [16]:
from bertrend.utils.data_loading import group_by_days, load_data

day_granularity = 30
grouped_data = group_by_days(df=df, day_granularity=day_granularity)

In [17]:
# Number of sliced data
len(grouped_data)

115

In [18]:
for p, g in grouped_data.items():
    sz = len(g)
    if sz < 3:
        print(f"{p}: {sz} doc → will skip")
    elif sz <= 6:
        print(f"{p}: {sz} doc → will down-tune UMAP (comp={min(5, sz-2)})")


2011-08-02 00:00:00: 1 doc → will skip
2011-09-01 00:00:00: 0 doc → will skip
2011-10-01 00:00:00: 0 doc → will skip
2011-10-31 00:00:00: 0 doc → will skip
2011-11-30 00:00:00: 0 doc → will skip
2011-12-30 00:00:00: 0 doc → will skip
2012-01-29 00:00:00: 0 doc → will skip
2012-02-28 00:00:00: 0 doc → will skip
2012-03-29 00:00:00: 0 doc → will skip
2012-04-28 00:00:00: 0 doc → will skip
2012-05-28 00:00:00: 0 doc → will skip
2012-06-27 00:00:00: 0 doc → will skip
2012-07-27 00:00:00: 0 doc → will skip
2012-08-26 00:00:00: 0 doc → will skip
2012-09-25 00:00:00: 0 doc → will skip
2012-10-25 00:00:00: 2 doc → will skip
2012-11-24 00:00:00: 4 doc → will down-tune UMAP (comp=2)
2012-12-24 00:00:00: 1 doc → will skip
2013-01-23 00:00:00: 1 doc → will skip
2013-02-22 00:00:00: 0 doc → will skip
2013-03-24 00:00:00: 0 doc → will skip
2013-04-23 00:00:00: 1 doc → will skip
2013-05-23 00:00:00: 0 doc → will skip
2013-06-22 00:00:00: 0 doc → will skip
2013-07-22 00:00:00: 0 doc → will skip
2013-0

### 4. Train topic models

In [19]:
bertrend.train_topic_models(grouped_data=grouped_data, embedding_model="text-embedding-bge-base-en-v1.5", embeddings=embeddings)

[32m2025-05-15 10:35:04.356[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36mtrain_topic_models[0m:[36m473[0m - [1mTraining topic model 1/24...[0m
[32m2025-05-15 10:35:04.357[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m242[0m - [34m[1mProcessing period: 2011-08-02 00:00:00[0m
[32m2025-05-15 10:35:04.357[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m243[0m - [34m[1mNumber of documents: 1[0m
[32m2025-05-15 10:35:04.358[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36mtrain_topic_models[0m:[36m473[0m - [1mTraining topic model 2/24...[0m
[32m2025-05-15 10:35:04.359[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m242[0m - [34m[1mProcessing period: 2012-10-25 00:00:00[0m
[32m2025-05-15 10:35:04.359[0m | [34m[1mDEBUG   [0m | [36mbertrend.BERTrend[0m:[36m_train_by_period[0m:[36m243[0m - [34m[1mNumber of documents: 

### 5. (Optional) Save trained_models

In [20]:
bertrend.save_model()

[32m2025-05-15 10:35:10.689[0m | [1mINFO    [0m | [36mbertrend.BERTrend[0m:[36msave_model[0m:[36m951[0m - [1mBERTrend model saved to: /Users/cnm13ryan/git/cache/models[0m


### 7. Calculate signal popularity

In [21]:
bertrend.calculate_signal_popularity()

In [31]:
window_size = 50

# List of strong and weak signals over time
for ts in bertrend.doc_groups.keys():
    print(ts)
    noise_topics_df, weak_signal_topics_df, strong_signal_topics_df = bertrend.classify_signals(window_size, ts)
    if not weak_signal_topics_df.empty:
        print("Weak signals")
        display(weak_signal_topics_df[["Topic","Representation"]].head(10))
    if not strong_signal_topics_df.empty:
        print("Strong signals")
        display(strong_signal_topics_df[["Topic","Representation"]].head(10))
    print()


2011-08-02 00:00:00

2012-10-25 00:00:00

2012-11-24 00:00:00

2012-12-24 00:00:00

2013-01-23 00:00:00

2013-04-23 00:00:00

2013-09-20 00:00:00

2013-10-20 00:00:00

2019-04-22 00:00:00

2019-06-21 00:00:00

2019-07-21 00:00:00

2019-12-18 00:00:00

2020-01-17 00:00:00

2020-02-16 00:00:00

2020-03-17 00:00:00

2020-04-16 00:00:00

2020-05-16 00:00:00

2020-06-15 00:00:00

2020-07-15 00:00:00
Strong signals


Unnamed: 0,Topic,Representation
0,0,fake_news_about_corrupt_but_zach_the_new_trump...
1,1,to_your_vote_the_are_in_and_not_if_amp



2020-08-14 00:00:00
Strong signals


Unnamed: 0,Topic,Representation
0,0,great_world_watch_tonight_thank_harry_enjoy_bo...



2020-09-13 00:00:00
Strong signals


Unnamed: 0,Topic,Representation
0,0,volunteer_watcher_sign_poll_today_video_pollwa...
1,1,pelosi_puppet_nd_weak_parnell_opponent_sean_he...



2020-10-13 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,1,dump_receives_atlanta_milwaukee_philadelphia_p...


Strong signals


Unnamed: 0,Topic,Representation
0,0,morocco_western_sovereignty_sahara_breakthroug...



2020-11-12 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,1,court_supreme_united_president_than_merits_its...
1,2,many_are_ballots_out_voter_rigged_eyes_he_that...



2020-12-12 00:00:00
Weak signals


Unnamed: 0,Topic,Representation
0,1,court_supreme_united_president_than_merits_its...
1,2,many_are_ballots_out_voter_rigged_eyes_he_that...





In [32]:
# selection of one particular timestamp to look at
selected_timestamp = Timestamp('2017-04-20 00:00:00')
selected_topic_model = bertrend.restore_topic_model(selected_timestamp)


### Get topic description


In [33]:
desc = generate_topic_description(topic_model=selected_topic_model, topic_number=5, filtered_docs=df, language_code="en")


[32m2025-05-15 10:51:43.511[0m | [31m[1mERROR   [0m | [36mbertrend.topic_analysis.topic_description[0m:[36mgenerate_topic_description[0m:[36m51[0m - [31m[1mgenerate_topic_description: topic_model is None[0m


In [26]:
desc.title

AttributeError: 'NoneType' object has no attribute 'title'

In [27]:
desc.description

AttributeError: 'NoneType' object has no attribute 'description'

### Get topic analysis

In [28]:
summary, analysis, formatted_html = analyze_signal(bertrend, 1, selected_timestamp)

[32m2025-05-15 10:38:38.834[0m | [31m[1mERROR   [0m | [36mbertrend.trend_analysis.weak_signals[0m:[36manalyze_signal[0m:[36m424[0m - [31m[1mNo data available for topic 1 within the specified date range. Please enter a valid topic number.[0m


ValueError: not enough values to unpack (expected 3, got 2)

In [29]:
from IPython.display import display, HTML
display(HTML(formatted_html))

NameError: name 'formatted_html' is not defined

## Using BERTrend for prospective analysis

In the case of a **prospective trend analysis task**, the goal is to **forecast future** developments or outcomes based on current data and trends, enabling organizations to make informed decisions, allocate resources effectively, and strategize for upcoming challenges or opportunities.


In this example, we are going to simulate a prospective task:
- we simulate new data coming in
- for each new data, we will compute the new topic model, merge it to previous one and detect at each iteration strong and weak signals


In [None]:
MY_DATA_DIR = Path("/DSIA/nlp/bertrend/data") / "feeds/feed_sobriete"

input_data = [
    MY_DATA_DIR / "2024-12-30_feed_sobriete.jsonl",
    MY_DATA_DIR / "2025-01-06_feed_sobriete.jsonl",
    MY_DATA_DIR / "2025-01-20_feed_sobriete.jsonl",
]

window_size = 7

In [None]:
embedding_service_cfg = {"local": False, "url":"https://10.132.5.44:6464", "client_secret":client_secret}

embedding_service = EmbeddingService(**embedding_service_cfg)
embedding_model_name = embedding_service.embedding_model_name

In [None]:
BERTREND_MODELS_PATH = MODELS_DIR / "sobriete_models"

In [None]:
def process_new_data(data_slice_path: Path, timestamp: pd.Timestamp):
    logger.debug(f"Processing new data: {data_slice_path}")

    # Restore previous models
    try:
        bertrend = BERTrend.restore_model(BERTREND_MODELS_PATH)
    except:
        logger.warning("Cannot restore previous models, creating new one")
        bertrend = BERTrend(topic_model=BERTopicModel())

    # Read data
    df = load_data(data_slice_path, language="French")
    df = split_data(df)
    text = df[TEXT_COLUMN]

    # Embed new data
    embeddings, token_strings, token_embeddings = embedding_service.embed(
                texts=text,
    )

    # Create topic model for new data
    bertrend.train_topic_models({timestamp: df}, embeddings=embeddings, embedding_model=embedding_model_name)
    
    logger.info(f"BERTrend processed {len(bertrend.doc_groups)} time periods")
    
    # Save models
    bertrend.save_model(models_path=BERTREND_MODELS_PATH)

    
    if len(bertrend.doc_groups)<2:
        return None
        
    # Compute popularities
    bertrend.calculate_signal_popularity()
    
    # classify last signals
    noise_topics_df, weak_signal_topics_df, strong_signal_topics_df = bertrend.classify_signals(window_size, timestamp)
    # TODO: save dfs

    if weak_signal_topics_df.empty:
        return None
        
    wt = weak_signal_topics_df['Topic']
    logger.info(f"Weak topics: {wt}")
    wt_list = []
    for topic in wt:
        topic_model = bertrend.restore_topic_model(timestamp)
        desc = generate_topic_description(topic_model=topic_model, topic_number=topic, filtered_docs=df, language_code="fr")
        wt_list.append({"timestamp": timestamp, "topic": topic, "title": desc.title, "description": desc.description})

    return pd.DataFrame(wt_list)


In [None]:
for data_file in input_data:
    timestamp = pd.Timestamp(data_file.name.split('_')[0])
    display(process_new_data(data_file, timestamp))