In [11]:
from transformers import pipeline
    
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
)

text1 = "This is a good cup of coffee."
text2 = "This cup of coffee is good but not the best I've had."

prediction = sentiment_analyzer(text1)[0]
prediction2 = sentiment_analyzer(text2)[0]


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [12]:
print(prediction)
print(prediction2)

{'label': 'positive', 'score': 0.9566965103149414}
{'label': 'negative', 'score': 0.6155491471290588}


In [23]:
for r in exposure:
    print(r)

In [45]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

def perform_sentiment_analysis(df, text_column):
    """
    Perform sentiment analysis on the given DataFrame using twitter-roberta-base-sentiment-latest
    """
    # Load tokenizer and model
    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
    
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = model.to(device)
    
    # Initialize lists to store results
    sentiment_labels = []
    sentiment_scores = []
    
    # Process texts in batches
    batch_size = 32
    texts = df[text_column].tolist()
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize texts
        encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        encoded = {k: v.to(device) for k, v in encoded.items()}
        
        # Get model predictions
        with torch.no_grad():
            outputs = model(**encoded)
            scores = torch.nn.functional.softmax(outputs.logits, dim=1)
            scores = scores.cpu().numpy()
        
        # Process each prediction in the batch
        for score in scores:
            # Get label (0: negative, 1: neutral, 2: positive)
            label_id = score.argmax()
            label_map = {0: "negative", 1: "neutral", 2: "positive"}
            label = label_map[label_id]
            
            # Calculate sentiment score (-1 to 1)
            # Negative score is -1 * negative_probability
            # Positive score is 1 * positive_probability
            # Neutral score is 0 * neutral_probability
            sentiment_score = score[2] - score[0]  # positive_prob - negative_prob
            
            sentiment_labels.append(label)
            sentiment_scores.append(round(float(sentiment_score), 3))
    
    return sentiment_labels, sentiment_scores

def main():
    # Read the input CSV file
    input_file = "risk_contexts.csv"  # Change this to your input file name
    print(f"Reading input file: {input_file}")
    df = pd.read_csv(input_file)
    
    # Print column names to verify the text column name
    print("Columns in the DataFrame:", df.columns)
    
    # Perform sentiment analysis
    print("Performing sentiment analysis...")
    sentiment_labels, sentiment_scores = perform_sentiment_analysis(df, "context")  # Change "context" to your text column name
    
    # Add results to DataFrame
    df["sentiment"] = sentiment_labels
    df["sentiment_score"] = sentiment_scores
    
    # Save results to new CSV file
    output_file = "risk_contexts_with_sentiment.csv"
    df.to_csv(output_file, index=False)
    print(f"\nResults saved to {output_file}")
    
    # Print summary statistics
    print("\nSentiment Distribution:")
    print(pd.Series(sentiment_labels).value_counts())
    print("\nSentiment Score Statistics:")
    print(pd.Series(sentiment_scores).describe())

if __name__ == "__main__":
    main()

Reading input file: risk_contexts.csv
Columns in the DataFrame: Index(['Table 1', 'context', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6'],
      dtype='object')
Performing sentiment analysis...
Loading tokenizer and model...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using device: cpu


100%|██████████| 1/1 [00:00<00:00,  1.21it/s]



Results saved to risk_contexts_with_sentiment.csv

Sentiment Distribution:
neutral     2
negative    2
positive    2
Name: count, dtype: int64

Sentiment Score Statistics:
count    6.000000
mean     0.102333
std      0.585018
min     -0.596000
25%     -0.316750
50%      0.059000
75%      0.481250
max      0.912000
dtype: float64


Sand box

In [4]:
#pip install scipy


Collecting scipy
  Downloading scipy-1.15.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
Downloading scipy-1.15.1-cp311-cp311-macosx_14_0_arm64.whl (24.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.8/24.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.15.1
Note: you may need to restart the kernel to use updated packages.


In [9]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

In [None]:
# Preprocessing of texts

In [10]:
# Model "cardiffnlp/twitter-roberta-base-sentiment-latest"
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# scoring and sentiment
text = "Trump is putting 25% tariff on Mexican and Canadian goods"
# text = preprocess(text) # This is the function that needed to implement the extraction of risk words interval
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

In [12]:
# Labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) neutral 0.6107
2) negative 0.3366
3) positive 0.0527


In [13]:
def analyze_sentiment_twitter(text, model_path):
    """
    Analyze sentiment of a given text using a pre-trained model from Hugging Face.

    Parameters:
    - text (str): The input text to analyze.
    - model_path (str): The path to the pre-trained model.

    Returns:
    - label (str): The sentiment label of the text.
    - scores (dict): The sentiment scores for each class.
    """
    # Load the model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt")

    # Perform the forward pass
    outputs = model(**inputs)

    # Get the predicted class label
    predicted_class_idx = outputs.logits.argmax().item()
    label = model.config.id2label[predicted_class_idx]

    #
    
    

In [None]:
# CSV file reading
def analyze_csv_sentiments(input_csv, output_csv):
    # Load the input CSV file
    df=pd.read_csv(input_csv)
    # Analyze sentiment for each row using our model aand add result to new column
    df[f'twitter-roberta-base-sentiment-latest_sentiment'] = df['text'].apply(lambda text: analyze_sentiment_twitter(text, "cardiffnlp/twitter-roberta-base-sentiment"))
    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_csv, index=False)

    return df


In [None]:
import warnings
warnings.filterwarnings("ignore")
input_csv = "test_file.csv"  # Path to your input CSV file
output_csv = "output_with_sentiments.csv"  # Path to save the output CSV file with sentiments
result_df = analyze_csv_sentiments(input_csv, output_csv)
print(result_df.head())  

In [42]:
# Final function 
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

def perform_sentiment_analysis(df, text_column):
    """
    Perform sentiment analysis on the given DataFrame using twitter-roberta-base-sentiment-latest
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
    model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
    
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # Initialize lists to store results
    sentiment_labels = []
    sentiment_scores = []
    
    # Process texts in batches
    batch_size = 32
    texts = df[text_column].tolist()
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize texts
        encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        encoded = {k: v.to(device) for k, v in encoded.items()}
        
        # Get model predictions
        with torch.no_grad():
            outputs = model(**encoded)
            scores = torch.nn.functional.softmax(outputs.logits, dim=1)
            scores = scores.cpu().numpy()
        
        # Process each prediction in the batch
        for score in scores:
            # Get label (0: negative, 1: neutral, 2: positive)
            label_id = score.argmax()
            label_map = {0: "negative", 1: "neutral", 2: "positive"}
            label = label_map[label_id]
            
            # Calculate sentiment score (-1 to 1)
            # Negative score is -1 * negative_probability
            # Positive score is 1 * positive_probability
            # Neutral score is 0 * neutral_probability
            sentiment_score = score[2] - score[0]  # positive_prob - negative_prob
            
            sentiment_labels.append(label)
            sentiment_scores.append(round(float(sentiment_score), 3))
    
    return sentiment_labels, sentiment_scores

def main():
    # Read the input CSV file
    input_file = "risk_contexts.csv"  # Change this to your input file name
    df = pd.read_csv(input_file)
    
    # Perform sentiment analysis
    print("Performing sentiment analysis...")
    sentiment_labels, sentiment_scores = perform_sentiment_analysis(df, "context")  # Change "context" to your text column name
    
    # Add results to DataFrame
    df["sentiment"] = sentiment_labels
    df["sentiment_score"] = sentiment_scores
    
    # Save results to new CSV file
    output_file = "risk_contexts_with_sentiment.csv"
    df.to_csv(output_file, index=False)
    print(f"\nResults saved to {output_file}")
    
    # Print summary statistics
    print("\nSentiment Distribution:")
    print(pd.Series(sentiment_labels).value_counts())
    print("\nSentiment Score Statistics:")
    print(pd.Series(sentiment_scores).describe())

if __name__ == "__main__":
    main()

Performing sentiment analysis...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyError: 'context'

In [1]:
from functions import extract_exposure2

print(extract_company_info("/Users/efang/Desktop/coding/research/src/data/earnings_calls/ex1.xml"))

['Ryder System Inc', 'R', '02-02-2016', 'MIAMI']
