## Data Processing: Sentiment Analysis w/ Cookie Banner Text

Recent studies show that pretrained BERT models achieve the highest accuracy on sentiment analysis tasks ([Source](https://typeset.io/questions/what-are-the-best-models-for-sentiment-analysis-2rho2gnvpu#)). BERT's accuracy is further enhanced when it is combined with BiLSTM and BiGRU layers.

In [None]:
# Dependency install command for this notebook. It is wrapped in a string
# literal, so it does not run; remove the surrounding quotes to execute it
# in a notebook cell.
'''
!pip install torch transformers pandas
'''

In [None]:
# Load data into DF

In [None]:
# Normalize text, remove special characters

In [None]:
from transformers import pipeline, BertTokenizer
import numpy as np

# Load tokenizer and sentiment analysis pipeline.
# `tokenizer` is used by chunk_text and analyze_sentiment_separate to split
# text into tokens and to count them; `sentiment_pipeline` produces the
# label/score pairs that analyze_sentiment_separate aggregates.
# NOTE(review): no model is passed to pipeline(), so it presumably falls back
# to the library's default sentiment checkpoint -- confirm that its tokenization
# and length limit are compatible with the 'bert-base-uncased' tokenizer used
# here for chunking, and consider pinning the model name explicitly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentiment_pipeline = pipeline("sentiment-analysis")

In [None]:
def chunk_text(text, max_length=508):
    '''
    Second chunk method: keep only the head and the tail of a long text.

    The text is split into tokens with the module-level ``tokenizer``. When it
    has more than ``max_length`` tokens, only the first and the last
    ``max_length // 2`` tokens are kept (254 each with the default), giving two
    chunks; otherwise all tokens are kept as a single chunk. Each chunk is
    converted back to a string with ``tokenizer.convert_tokens_to_string``.

    Timing note from the original author: 8 min for 1 batch out of 31.

    Args:
        text: Text to chunk. Anything that is not a non-blank string
            (``None``, NaN, ``''``, whitespace only) produces no chunks.
        max_length: Maximum total number of tokens to keep. Must be at
            least 2 so that each half keeps at least one token.

    Returns:
        A list of strings: ``[]`` when there is nothing to analyze, one chunk
        when the text fits in ``max_length`` tokens, otherwise the head chunk
        followed by the tail chunk.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    '''
    if max_length < 2:
        # With a half size of 0, ``tokens[-0:]`` would select the whole token
        # list instead of an empty tail, silently returning the full text.
        raise ValueError(f"max_length must be at least 2, got {max_length}")

    # Missing values (e.g. NaN read from a DataFrame) and blank strings have
    # nothing to tokenize; an empty list lets callers skip the model entirely.
    if not isinstance(text, str) or not text.strip():
        return []

    # Tokenize the entire text
    tokens = tokenizer.tokenize(text)
    if not tokens:
        return []

    # Short enough: keep everything as a single chunk
    if len(tokens) <= max_length:
        return [tokenizer.convert_tokens_to_string(tokens)]

    # Too long: keep the same number of tokens from the beginning and the end
    half_max_length = max_length // 2
    beginning_chunk = tokenizer.convert_tokens_to_string(tokens[:half_max_length])
    end_chunk = tokenizer.convert_tokens_to_string(tokens[-half_max_length:])

    # Omit the end chunk if it turned out empty
    return [beginning_chunk, end_chunk] if end_chunk else [beginning_chunk]

In [None]:
def analyze_sentiment_separate(text_chunks):
    '''
    Do separate sentiment analysis with chunks and combine the scores.

    Each chunk is scored by the module-level ``sentiment_pipeline``. The scores
    are combined into two sums in which every chunk is weighted by its share of
    the total token count (counted with the module-level ``tokenizer``).

    Args:
        text_chunks: List of text chunks (strings) belonging to one record.

    Returns:
        A tuple ``(average_positive, average_negative)`` of weighted sums, or
        ``(None, None)`` when there are no chunks or the chunks contain no
        tokens at all.
    '''
    if not text_chunks:
        return None, None

    # Tokenize every chunk once; the lengths are needed both for the total and
    # for the per-chunk weights.
    chunk_lengths = [len(tokenizer.tokenize(chunk)) for chunk in text_chunks]
    total_length = sum(chunk_lengths)

    # Without tokens no weight can be computed, so skip the model call.
    if total_length == 0:
        return None, None

    results = sentiment_pipeline(text_chunks)

    positive_weighted_sum = 0
    negative_weighted_sum = 0

    # Accumulate each chunk's score into the bucket matching its label
    for chunk_length, result in zip(chunk_lengths, results):
        weight = chunk_length / total_length
        if result['label'] == 'POSITIVE':
            positive_weighted_sum += result['score'] * weight
        else:
            # NOTE(review): a non-positive result contributes (1 - score), so a
            # confidently negative chunk adds a value close to 0 here. Kept
            # as-is to preserve existing results -- confirm this is intended.
            negative_weighted_sum += (1 - result['score']) * weight

    return positive_weighted_sum, negative_weighted_sum

In [None]:
def process_lyrics_sentiment_separate(row):
    '''
    Run the chunked sentiment analysis on a single record.

    The text is read from ``row['lyrics']``, split with ``chunk_text`` and the
    resulting chunks are scored with ``analyze_sentiment_separate``. When
    chunking yields nothing, ``(None, None)`` is returned instead.
    '''
    chunks = chunk_text(row['lyrics'])
    return analyze_sentiment_separate(chunks) if chunks else (None, None)

In [None]:
def process_in_batches(df, start_batch, batch_size=1000):
    '''
    Do batch processing for the sentiment analysis to avoid data loss due to breakdown.

    Rows are processed ``batch_size`` at a time with
    ``process_lyrics_sentiment_separate``. After each batch the scores are
    written into the ``average_positive`` / ``average_negative`` columns of
    ``df`` (created if missing) and the batch is saved to
    ``lyrics_sentiment_batch_<n>.csv``, so an interrupted run can be resumed.

    Args:
        df: DataFrame with one record per row; it is modified in place.
        start_batch: 1-based number of the first batch to process. Values
            below 1 are treated as 1.
        batch_size: Number of rows per batch; must be at least 1.

    Returns:
        The same ``df`` object with the result columns filled in for the
        processed batches.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    result_columns = ['average_positive', 'average_negative']

    # Number of batches (ceiling division)
    num_batches = (len(df) + batch_size - 1) // batch_size

    # A start_batch of 0 would otherwise yield batch index -1 and a negative
    # row range; clamp it to the first batch.
    first_batch = max(start_batch, 1) - 1

    for i in range(first_batch, num_batches):
        start_idx = i * batch_size
        # Clamp so the last, shorter batch is reported with its real end
        end_idx = min(start_idx + batch_size, len(df))
        print(f"Processing batch {i+1}/{num_batches} (records {start_idx} to {end_idx})")

        # Apply the sentiment analysis function to the batch
        batch_results = df.iloc[start_idx:end_idx].apply(
            process_lyrics_sentiment_separate, axis=1, result_type='expand'
        )

        # Make sure the target columns exist before writing into them
        for column in result_columns:
            if column not in df.columns:
                df[column] = float('nan')

        # Write by position, matching the positional read above, so the rows
        # line up even when df does not have a default 0..n-1 index.
        column_positions = df.columns.get_indexer(result_columns)
        df.iloc[start_idx:end_idx, column_positions] = batch_results.to_numpy(dtype=float)

        # Save the batch results to a CSV file
        df.iloc[start_idx:end_idx].to_csv(f'lyrics_sentiment_batch_{i+1}.csv', index=False)

        print(f"Saved batch {i+1} to CSV")

    return df

In [None]:
# Process in batches to avoid data loss. The first run crashed partway through, so start_batch was changed from 0 to 11 to resume from batch 11.
# df = process_in_batches(df, 11)

In [None]:
# Save df