In [1]:
import pandas as pd
import numpy as np
import time
from IPython.display import display

import torch
import sentencepiece
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import pipeline
from huggingface_hub import notebook_login
from scipy.special import softmax

from sklearn.decomposition import LatentDirichletAllocation as LDA

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the installed PyTorch build and the CUDA version it was compiled against
torch_version = torch.__version__
torch_cuda_build = torch.version.cuda
print("PyTorch version:", torch_version)
print("CUDA version supported by PyTorch:", torch_cuda_build)

PyTorch version: 2.4.1+cu124
CUDA version supported by PyTorch: 12.4


In [3]:
# Shell escape: print the NVIDIA driver/GPU status table (GPU model, driver version, memory usage)
!nvidia-smi

Wed Nov 13 16:52:20 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070      WDDM  |   00000000:06:00.0  On |                  N/A |
|  0%   43C    P8             17W /  220W |    1150MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# Check whether PyTorch can see a CUDA-capable GPU
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

True


In [5]:
# Checkpoint identifier passed to every from_pretrained call below
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Pick the device here instead of hard-coding CUDA, so this cell also runs on a
# CPU-only machine rather than failing before any fallback can apply.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

# Build the pipeline from the already-loaded model and tokenizer so the weights
# are loaded (and placed on the device) only once instead of twice.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

In [6]:
# Use the GPU when one is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
# Load the English tweets from the local CSV file ('utf-8-sig' also strips a leading BOM if present)
english_tweets = pd.read_csv('data/english_tweets.csv', encoding = 'utf-8-sig')

In [8]:
# Numeric code for each sentiment label name: positive -> 1, negative -> -1, neutral -> 0
label_mapping = dict(positive=1, negative=-1, neutral=0)

In [9]:
def get_highest_sentiment(text):
    """
    Score one text with the sentiment model and return its most likely label.

    The text is tokenized (truncated to 512 tokens), run through the
    module-level ``model`` on ``device``, and the resulting scores are
    converted to probabilities with softmax. The label with the highest
    probability is translated to an integer through ``config.id2label`` and
    ``label_mapping``.

    Args:
        text (str): Input text for sentiment analysis.

    Returns:
        tuple: (highest_sentiment_value, highest_sentiment_score)
               The sentiment label's integer value and its probability score.
               If anything fails while processing the text, the error is
               printed and ``(np.nan, np.nan)`` is returned instead, so
               callers must be prepared for missing values.
    """
    try:
        encoded_input = tokenizer(text, return_tensors='pt', max_length=512, truncation=True).to(device)
        # Inference only: disable autograd so no computation graph is built
        # (and kept alive) for every one of the many texts being scored.
        with torch.no_grad():
            output = model(**encoded_input)
        # First row of the first output element; presumably the class logits
        # for the single input text (usual sequence-classification layout).
        scores = output[0][0].detach().cpu().numpy()
        scores = softmax(scores)
        max_index = int(np.argmax(scores))
        highest_sentiment_value = int(label_mapping[config.id2label[max_index]])
        highest_sentiment_score = float(scores[max_index])
        return highest_sentiment_value, highest_sentiment_score
    except Exception as e:
        # Best-effort by design: one bad row must not abort a long batch run.
        print(f"Error processing text: {text} | Exception: {e}")
        return np.nan, np.nan

In [10]:
# Smoke test: score only a small sample first to verify the function works and
# to estimate throughput before committing to the full dataset.
N_TEST_ROWS = 1000

# Remove 'sentiment' / 'confidence' left over from a previous run so the cell
# can be re-executed safely (errors='ignore' makes this a no-op when absent).
english_tweets = english_tweets.drop(columns=['sentiment', 'confidence'], errors='ignore')

# Start timing
start_time = time.time()

# Apply the function only on the first N_TEST_ROWS rows
test_results = english_tweets['clean_tweet'].iloc[:N_TEST_ROWS].apply(get_highest_sentiment)

# Convert the result into a DataFrame with two columns (one for value, one for score);
# rows outside the sample are not in the index and therefore end up as NaN.
english_tweets[['sentiment', 'confidence']] = pd.DataFrame(test_results.tolist(), index=test_results.index)

# End timing
end_time = time.time()

# Calculate the time taken
execution_time = end_time - start_time

# Print the result (report the number of rows actually processed)
print(f"Time taken to process the first {len(test_results)} rows: {execution_time:.4f} seconds")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Time taken to process the first 1000 rows: 10.8627 seconds


In [11]:
# Sanity checks after the sample run: distinct sentiment values among the first
# 1000 rows, number of missing clean tweets, and number of distinct clean tweets
sample_sentiments = english_tweets.head(1000)['sentiment'].unique()
print(sample_sentiments)
missing_clean_tweets = english_tweets['clean_tweet'].isnull().sum()
print(missing_clean_tweets)
distinct_clean_tweets = english_tweets['clean_tweet'].nunique()
print(distinct_clean_tweets)

[-1.  0.  1.]
0
252563


In [12]:
# Summarize the frame's columns, non-null counts and dtypes after the sample run
english_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252563 entries, 0 to 252562
Data columns (total 30 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   created_at            252563 non-null  object 
 1   tweet_id              252563 non-null  float64
 2   tweet                 252563 non-null  object 
 3   likes                 252563 non-null  int64  
 4   retweet_count         252563 non-null  int64  
 5   source                252547 non-null  object 
 6   user_id               252563 non-null  float64
 7   user_name             252562 non-null  object 
 8   user_screen_name      252563 non-null  object 
 9   user_description      238194 non-null  object 
 10  user_join_date        252563 non-null  object 
 11  user_followers_count  252563 non-null  int64  
 12  user_location         252563 non-null  object 
 13  lat                   252563 non-null  float64
 14  long                  252563 non-null  float64
 15  

In [13]:
# Start from a clean slate: drop any 'sentiment' / 'confidence' columns that an
# earlier run (e.g. the timed sample) left on the frame
for stale_column in ('sentiment', 'confidence'):
    if stale_column in english_tweets.columns:
        english_tweets.drop(stale_column, axis=1, inplace=True)

# Score every clean tweet; each element of `results` is the tuple returned by
# get_highest_sentiment for that row
results = english_tweets['clean_tweet'].apply(get_highest_sentiment)

In [14]:
# Convert the result into a DataFrame with two columns (one for value, one for score)
english_tweets[['sentiment', 'confidence']] = pd.DataFrame(results.tolist(), index=results.index)

# Store 'sentiment' as integer and 'confidence' as float.
# get_highest_sentiment returns (nan, nan) for rows it failed on, and a plain
# astype(int) raises on NaN, which would throw away the whole scoring run at
# this point. Keep the plain int dtype when every row succeeded, and fall back
# to pandas' nullable 'Int64' only when missing values are present.
sentiment_values = english_tweets['sentiment']
if sentiment_values.notna().all():
    english_tweets['sentiment'] = sentiment_values.astype(int)
else:
    english_tweets['sentiment'] = sentiment_values.astype('Int64')
english_tweets['confidence'] = english_tweets['confidence'].astype(float)

In [15]:
# Confirm that the 'sentiment' and 'confidence' columns are present, and inspect their dtypes and non-null counts
english_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252563 entries, 0 to 252562
Data columns (total 30 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   created_at            252563 non-null  object 
 1   tweet_id              252563 non-null  float64
 2   tweet                 252563 non-null  object 
 3   likes                 252563 non-null  int64  
 4   retweet_count         252563 non-null  int64  
 5   source                252547 non-null  object 
 6   user_id               252563 non-null  float64
 7   user_name             252562 non-null  object 
 8   user_screen_name      252563 non-null  object 
 9   user_description      238194 non-null  object 
 10  user_join_date        252563 non-null  object 
 11  user_followers_count  252563 non-null  int64  
 12  user_location         252563 non-null  object 
 13  lat                   252563 non-null  float64
 14  long                  252563 non-null  float64
 15  

In [16]:
# Preview the first 10 rows of the text alongside its predicted label and score
preview_columns = ['clean_tweet', 'sentiment', 'confidence']
display(english_tweets[preview_columns].head(10))

Unnamed: 0,clean_tweet,sentiment,confidence
0,"#Trump: As a student I used to hear for years,...",-1,0.492327
1,You get a tie! And you get a tie! #Trump ‘s ra...,0,0.642429
2,@user Her 15 minutes were over long time ago. ...,-1,0.882416
3,@user @user @user There won’t be many of them....,-1,0.908475
4,One of the single most effective remedies to e...,1,0.501533
5,#Trump #PresidentTrump #Trump2020LandslideVict...,0,0.56666
6,"@user #Trump owes #RicardoAguirre $730,000 to ...",-1,0.85471
7,"#Trump: Nobody likes to tell you this, but som...",-1,0.605701
8,@user @user @user You right @user Censorship s...,-1,0.818584
9,@user @user Grab @user by the balls &amp; chuc...,-1,0.735748


In [17]:
# Final checks before saving: count of missing clean tweets, then the number of rows per hashtag group
missing_clean_tweets = english_tweets['clean_tweet'].isnull().sum()
print(missing_clean_tweets)
hashtag_counts = english_tweets['hashtag'].value_counts()
print(hashtag_counts)

0
hashtag
trump    127989
biden     99234
both      25340
Name: count, dtype: int64


In [19]:
# Save the scored tweets to CSV without the index column; 'utf-8-sig' writes a BOM at the start of the file
english_tweets.to_csv('data/results_xlm.csv', index=False, encoding = 'utf-8-sig')