In [3]:
import pandas as pd
from glob import glob

def merge_csv_files(tweet_input_folder, output_file):
    csv_files = glob(os.path.join(tweet_input_folder, '*.csv'))

    df_list = []
    for file in csv_files:
        if os.path.isfile(file) and file.endswith('.csv') and os.path.getsize(file) > 10:
            df_list.append(pd.read_csv(file, low_memory=False))

    merged_df = pd.concat(df_list, ignore_index=True)
    merged_df.to_csv(output_file, index=False)
    return merged_df


merged_df = merge_csv_files("../data/tweets", "../data/merged_tweets.csv")
merged_df.head()


Unnamed: 0,id,geo,lang,text,user,place,id_str,source,entities,favorited,...,withheld_scope,withheld_copyright,withheld_in_countries,deleted,retrieved,legacy_imported,urls,hashtags,user_mentions,media
0,1273769788767117315,,en,@groo_wonderer @ADHD_Capital @rationalwalk The...,"{'id': 381289719, 'url': 'https://t.co/oTK7zoT...",,1273769788767117315,"<a href=""http://twitter.com/download/iphone"" r...","{'urls': [], 'symbols': [], 'hashtags': [], 'u...",False,...,,,,,,,,,,
1,1273769207839285255,,en,RT @nntaleb: @ADHD_Capital @rationalwalk You h...,"{'id': 381289719, 'url': 'https://t.co/oTK7zoT...",,1273769207839285255,"<a href=""http://twitter.com/download/iphone"" r...","{'urls': [], 'symbols': [], 'hashtags': [], 'u...",False,...,,,,,,,,,,
2,1273769135349084168,,en,@ADHD_Capital @rationalwalk You have the under...,"{'id': 381289719, 'url': 'https://t.co/oTK7zoT...",,1273769135349084168,"<a href=""http://twitter.com/download/iphone"" r...","{'urls': [{'url': 'https://t.co/xeX76EvJde', '...",False,...,,,,,,,,,,
3,1273758103754944512,,en,@QifaNabki A7ikar preceded Aesop with his fables.,"{'id': 381289719, 'url': 'https://t.co/oTK7zoT...",,1273758103754944512,"<a href=""http://twitter.com/download/iphone"" r...","{'urls': [], 'symbols': [], 'hashtags': [], 'u...",False,...,,,,,,,,,,
4,1273756881501200387,,en,@QifaNabki A few years behind my tweet! Got to...,"{'id': 381289719, 'url': 'https://t.co/oTK7zoT...",,1273756881501200387,"<a href=""http://twitter.com/download/iphone"" r...","{'urls': [], 'symbols': [], 'hashtags': [], 'u...",False,...,,,,,,,,,,


In [4]:
import ast

def preprocess_tweets(merged_csv_path):
    # Load merged tweets
    df = pd.read_csv(merged_csv_path, low_memory=False)

    # Extract user_id and handle from nested 'user' column
    def extract_user_info(user_str):
        try:
            user_dict = ast.literal_eval(user_str)
            return user_dict.get("id"), user_dict.get("screen_name", None)
        except:
            return None, None

    df[['user_id', 'handle']] = df['user'].apply(lambda x: pd.Series(extract_user_info(x)))

    # Convert to datetime
    if 'timestamp_ms' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp_ms'], unit='ms', errors='coerce')
    elif 'created_at' in df.columns:
        df['timestamp'] = pd.to_datetime(df['created_at'], errors='coerce')
    else:
        df['timestamp'] = pd.NaT

    # Normalize to UTC
    df['timestamp'] = df['timestamp'].dt.tz_localize('UTC', nonexistent='NaT', ambiguous='NaT')

    # Clean tweet text
    def clean_text(text):
        if pd.isna(text):
            return ""
        return text.replace('\n', ' ').replace('\r', '').strip()

    df['clean_text'] = df['text'].apply(clean_text)

    # Final cleaned DataFrame
    columns_to_keep = ['id', 'user_id', 'handle', 'clean_text', 'lang', 'timestamp']
    clean_df = df[columns_to_keep].dropna(subset=['clean_text', 'timestamp'])

    return clean_df


In [5]:
tweets_df = preprocess_tweets("../data/merged_tweets.csv")
tweets_df.head()

Unnamed: 0,id,user_id,handle,clean_text,lang,timestamp
17,1273603287275708416,381289719.0,nntaleb,@DrWissam @ElieLB92 @HsenAndil https://t.co/E6...,und,2020-06-18 13:07:40.748000+00:00
32,1273346018155933701,381289719.0,nntaleb,"@kenosplethon Kalam is from kalamos, Greek, be...",en,2020-06-17 20:05:23.010000+00:00
41,1273249835483893763,381289719.0,nntaleb,"OK, OK, in modern Greek Δύναμη από τους νέους...",el,2020-06-17 13:43:11.274000+00:00
42,1273249593405382661,381289719.0,nntaleb,@StavrosZenios In modern Greek? Δύναμη από το...,el,2020-06-17 13:42:13.558000+00:00
43,1273245786478915584,381289719.0,nntaleb,RT @nntaleb: @MccluskyMark No branch of tradit...,en,2020-06-17 13:27:05.916000+00:00


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
from tqdm import tqdm

def run_finbert(text):
    try:
        result = finbert(text[:512])[0]  # Truncate to 512 tokens
        return pd.Series([result['label'], result['score']])
    except:
        return pd.Series(["ERROR", np.nan])

tqdm.pandas(desc="FinBERT sentiment")
tweets_df[['sentiment_label', 'sentiment_score']] = tweets_df['clean_text'].progress_apply(run_finbert)


FinBERT sentiment: 100%|██████████| 5059/5059 [15:49<00:00,  5.33it/s]


In [None]:
tweets_df[['timestamp', 'handle', 'clean_text', 'sentiment_label', 'sentiment_score']].sample(10)

Unnamed: 0,timestamp,handle,clean_text,sentiment_label,sentiment_score
7096,2020-12-14 03:47:03.081000+00:00,realDonaldTrump,RT @realDonaldTrump: https://t.co/dZ56TExPWf,Neutral,0.998673
57569,2021-07-13 15:09:37.625000+00:00,paulkrugman,"Of course, being anti-vax is even worse: it's ...",Negative,0.927461
23414,2022-11-06 10:33:41.453000+00:00,nntaleb,Another shoe dropped.,Negative,0.998173
102359,2021-04-14 18:41:17.206000+00:00,POTUS,It is time to end America’s longest war. It ...,Neutral,0.999907
30106,2021-07-30 01:37:51.299000+00:00,nntaleb,RT @NachoOliveras: Trying to capture some pear...,Neutral,0.98447
29670,2021-08-12 11:13:20.050000+00:00,nntaleb,@KhalenDwyer Hihi public works!,Neutral,0.998899
99500,2022-06-17 12:32:46.669000+00:00,POTUS,Tune in as I host the Major Economies Forum on...,Neutral,0.999374
84143,2021-02-21 13:09:58.846000+00:00,elonmusk,@RationalEtienne 🤣🤣,Neutral,0.995796
30179,2021-07-27 14:30:11.117000+00:00,nntaleb,RT @rorysutherland: @nntaleb @paulg The Beatle...,Neutral,0.994124
115850,2019-04-23 10:35:04.712000+00:00,richardbranson,"In Finding My Virginity, I shared 75 close sha...",Neutral,0.999969


In [None]:
tweets_df.to_csv("../data/tweets_with_sentiment.csv", index=False)

In [None]:
from datetime import timedelta

tweets_df['window_start'] = tweets_df['timestamp'] - timedelta(minutes=10)
tweets_df['window_end'] = tweets_df['timestamp'] + timedelta(minutes=10)

NameError: name 'tweets_df' is not defined