In [3]:
import pandas as pd

# Read the tweet export into a DataFrame, then report its dimensions and
# preview the first rows as a quick sanity check.
csv_path = 'tweets-data.csv'
df = pd.read_csv(csv_path)
n_rows_cols = df.shape
print("Data shape:", n_rows_cols)
print(df.head())


Data shape: (3010, 6)
   Unnamed: 0               Date Created  Number of Likes  Source of Tweet  \
0           0  2023-06-25 19:16:20+00:00                0              NaN   
1           1  2023-06-25 19:16:18+00:00                0              NaN   
2           2  2023-06-25 19:16:07+00:00                0              NaN   
3           3  2023-06-25 19:15:56+00:00                0              NaN   
4           4  2023-06-25 19:15:54+00:00                0              NaN   

                                              Tweets hashtag  
0  @jacksonhinklle #wagner with 6.2 billion dolla...  wagner  
1  Pobrecito es discapacitado\n#Reddetuiterosdemo...  wagner  
2  News from the EIR Daily Alert\n\nâ€œ#Putin Addre...  wagner  
3  It's Messi day #Messið“ƒµ #Messi36 #Russia #bigst...  wagner  
4  Il passaggio chiave di Machiavelli era questo ...  wagner  


In [4]:
import re

def clean_text(text):
    """Normalize a raw tweet into lowercase a-z words separated by single spaces.

    Steps, in order: lowercase; drop URLs (any token starting with ``http``
    or with a ``www.`` host); drop ``@mentions``; strip the ``#`` marker while
    keeping the hashtag word; delete every character that is not ``a-z`` or
    whitespace; collapse whitespace runs and trim both ends.

    Args:
        text: The raw tweet. Non-string input (e.g. ``None`` or a float NaN
            coming from a missing cell) is treated as an empty tweet.

    Returns:
        The cleaned string, or ``''`` when nothing survives cleaning.
    """
    # Missing values have no .lower(); treat them as empty instead of raising.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # The dot after "www" is escaped and anchored on a word boundary. The
    # unescaped form matched "www" followed by ANY character, so a tweet like
    # "awww so cute" lost "www so".
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)  # remove URLs
    text = re.sub(r'@\w+', '', text)                # remove mentions
    text = re.sub(r'#', '', text)                   # remove hashtag symbol, keep the word
    # Deletes digits and punctuation, and also every accented or non-Latin
    # letter, since only plain a-z and whitespace are kept.
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()        # collapse runs of whitespace
    return text

# Replace missing tweets with '' before the str cast: astype(str) on its own
# can turn a missing value into the literal text "nan", which would then be
# cleaned and analysed as if it were a real tweet.
df['cleaned_text'] = df['Tweets'].fillna('').astype(str).apply(clean_text)
print(df[['Tweets', 'cleaned_text']].head())


                                              Tweets  \
0  @jacksonhinklle #wagner with 6.2 billion dolla...   
1  Pobrecito es discapacitado\n#Reddetuiterosdemo...   
2  News from the EIR Daily Alert\n\nâ€œ#Putin Addre...   
3  It's Messi day #Messið“ƒµ #Messi36 #Russia #bigst...   
4  Il passaggio chiave di Machiavelli era questo ...   

                                        cleaned_text  
0                         wagner with billion dollar  
1  pobrecito es discapacitado reddetuiterosdemocr...  
2  news from the eir daily alert putin addressed ...  
3  its messi day messi messi russia bigstage wagn...  
4  il passaggio chiave di machiavelli era questo ...  


In [5]:
!pip install transformers




In [11]:
from transformers import pipeline

# Build the sentiment classifier. One checkpoint name is used for both the
# model and the tokenizer; truncation is enabled with a 512-token cap.
sst2_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
pipeline_options = dict(
    model=sst2_checkpoint,
    tokenizer=sst2_checkpoint,
    truncation=True,
    max_length=512,
)
sentiment_pipeline = pipeline("sentiment-analysis", **pipeline_options)


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [12]:
# NOTE(review): df_sample is not defined in the cells visible here; confirm
# the cell that creates it still exists and runs before this one.
# NOTE(review): the checkpoint name says "english", while the sample output
# contains French and German tweets; confirm the labels are meaningful there.

# Classify each cleaned tweet on its own; the pipeline hands back a list per
# call, and its first element is the prediction dict for that tweet.
cleaned_tweets = df_sample['cleaned_text']
results = cleaned_tweets.map(lambda tweet: sentiment_pipeline(tweet)[0])

# Split each prediction dict into a label column and a score column.
df_sample['sentiment_label'] = results.map(lambda prediction: prediction['label'])
df_sample['sentiment_score'] = results.map(lambda prediction: prediction['score'])

preview_columns = ['cleaned_text', 'sentiment_label', 'sentiment_score']
print(df_sample[preview_columns].head())


                                        cleaned_text sentiment_label  \
0  le dessindepresse de sanaga ls sont morts comm...        NEGATIVE   
1                       russia wagner russiacivilwar        NEGATIVE   
2  exclusive content cosplay japan titan titanics...        NEGATIVE   
3  auch heute geht die politische nachricht des t...        NEGATIVE   
4  same type that would take a homemade playstati...        NEGATIVE   

   sentiment_score  
0         0.981537  
1         0.962062  
2         0.961531  
3         0.975570  
4         0.994473  
