In [None]:
!pip install transformers
!pip install pandas



In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import pandas as pd
import torch

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the input and output paths
# used by this notebook are located under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Location of the preprocessed text file on the mounted Drive.
input_path = "/content/drive/My Drive/Colab Notebooks/preprocessed_output.txt"

# Read one entry per line, stripping trailing whitespace (newline included).
# The encoding is given explicitly so non-ASCII characters decode the same way
# regardless of the runtime's locale default.
# NOTE(review): assumes the file was written as UTF-8 — confirm against the
# preprocessing step that produced it.
with open(input_path, 'r', encoding='utf-8') as file:
    text = [line.rstrip() for line in file]

# One row per input line. Blank lines are kept (as empty strings) so that row
# positions keep matching line positions in the input file.
result_df = pd.DataFrame({'Text': text})

# Preview the first rows
print(result_df.head())

                            Text
0                 yeah sad liked
1            sad hear rest peace
2              sad one even care
3                       perioddd
4  someone hated twitter account


In [None]:
import os

# Directory on the mounted Drive that receives one CSV file per chunk of rows.
output_dir = "/content/drive/My Drive/Colab Notebooks/"

# Load the pretrained sentiment classifier and its matching tokenizer.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)
model.eval()  # inference only: keep dropout disabled

# Class names, indexed by the position of the corresponding output logit.
labels = ['Negative', 'Neutral', 'Positive']
batch_size = 5000  # Rows written per CSV file (each row is still scored on its own)
max_tokens = 512  # Longest input passed to the model; RoBERTa-base has 512 usable positions
num_rows = len(result_df)  # Total number of rows
start_index = 0  # Start index for the first chunk

while start_index < num_rows:
    end_index = min(start_index + batch_size, num_rows)  # Exclusive end of the current chunk

    # Work on a copy so the score columns are not added to result_df itself.
    batch_df = result_df.iloc[start_index:end_index].copy()

    # Per-row results for the current chunk, in row order.
    batch_predicted_sentiments = []
    batch_sentiment_scores = {label: [] for label in labels}

    # No gradients are needed for prediction; disabling autograd avoids
    # building a computation graph for every forward pass.
    with torch.no_grad():
        for preprocessed_tweet in batch_df['Text']:
            # Truncate over-long inputs instead of letting the model fail on
            # them and abort the whole run.
            encoded_tweet = tokenizer(
                preprocessed_tweet,
                return_tensors='pt',
                truncation=True,
                max_length=max_tokens,
            )

            output = model(**encoded_tweet)

            # logits has shape (1, num_labels) because a single text is scored.
            predicted_sentiment = labels[torch.argmax(output.logits, dim=1).item()]
            batch_predicted_sentiments.append(predicted_sentiment)

            # Softmax turns the logits into per-class probabilities.
            scores = torch.nn.functional.softmax(output.logits, dim=1).numpy()[0]
            for label, score in zip(labels, scores):
                batch_sentiment_scores[label].append(score)

    # Add sentiment scores and predicted sentiment next to the original text.
    batch_df['Negative Score'] = batch_sentiment_scores['Negative']
    batch_df['Neutral Score'] = batch_sentiment_scores['Neutral']
    batch_df['Positive Score'] = batch_sentiment_scores['Positive']
    batch_df['Predicted Sentiment'] = batch_predicted_sentiments

    # Save the chunk; the file name records the half-open row range [start, end).
    output_csv_path = os.path.join(output_dir, f"predicted_sentiments_{start_index}_{end_index}.csv")
    batch_df.to_csv(output_csv_path, index=False)
    print(f"CSV file saved successfully: {output_csv_path}")

    start_index = end_index  # Continue with the next chunk
