In [14]:
import pandas as pd
import re
import torch
from transformers import BertTokenizer
import numpy as np

# Load the CSV file
df = pd.read_csv('./data/tweets.csv')

# Clean the text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    text = re.sub(r'\s+', ' ', text)     # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    return text.strip()

df['cleaned_text'] = df['text'].apply(clean_text)

# Tokenize the text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )


In [15]:
import transformers
print(transformers.__version__)

4.50.3


In [16]:
df.head()

Unnamed: 0,created_at,user,text,cleaned_text
0,created_at,author_id,text,text
1,2025-03-31 04:10:53+00:00,3186601492,RT @moh_ai_r: The Eiffel Tower #mohair #mohair...,RT The Eiffel Tower
2,2025-03-31 04:10:53+00:00,2213637488,"RT @ThebilalSEO: Pinterest + ChatGPT = $19,845...",RT Pinterest ChatGPT 19845 Monthly And I ca...
3,2025-03-31 04:10:53+00:00,2962828127,I use deep research instead of google. Cross r...,I use deep research instead of google Cross re...
4,2025-03-31 04:10:52+00:00,3382,Spent the day writing. It felt like a year of ...,Spent the day writing It felt like a year of t...


In [17]:

df['tokens'] = df['cleaned_text'].apply(tokenize_text)

# Prepare input tensors
input_ids = torch.cat([item['input_ids'] for item in df['tokens']], dim=0)
attention_masks = torch.cat([item['attention_mask'] for item in df['tokens']], dim=0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
df.head()

Unnamed: 0,created_at,user,text,cleaned_text,tokens
0,created_at,author_id,text,text,"[input_ids, token_type_ids, attention_mask]"
1,2025-03-31 04:10:53+00:00,3186601492,RT @moh_ai_r: The Eiffel Tower #mohair #mohair...,RT The Eiffel Tower,"[input_ids, token_type_ids, attention_mask]"
2,2025-03-31 04:10:53+00:00,2213637488,"RT @ThebilalSEO: Pinterest + ChatGPT = $19,845...",RT Pinterest ChatGPT 19845 Monthly And I ca...,"[input_ids, token_type_ids, attention_mask]"
3,2025-03-31 04:10:53+00:00,2962828127,I use deep research instead of google. Cross r...,I use deep research instead of google Cross re...,"[input_ids, token_type_ids, attention_mask]"
4,2025-03-31 04:10:52+00:00,3382,Spent the day writing. It felt like a year of ...,Spent the day writing It felt like a year of t...,"[input_ids, token_type_ids, attention_mask]"


In [None]:
from transformers import pipeline
# Load the sentiment analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [19]:
# Classify sentiment
df['sentiment'] = df['cleaned_text'].apply(lambda x: sentiment_pipeline(x)[0]['label'])

# Display the results
print(df[['cleaned_text', 'sentiment']].head())

                                        cleaned_text sentiment
0                                               text  POSITIVE
1                               RT  The Eiffel Tower  POSITIVE
2  RT  Pinterest  ChatGPT  19845 Monthly And I ca...  NEGATIVE
3  I use deep research instead of google Cross re...  NEGATIVE
4  Spent the day writing It felt like a year of t...  NEGATIVE


In [20]:
# Save the results to a new CSV file
df.to_csv('./data/tweets_with_sentiment.csv', index=False)