# Sentiment Analysis of Titles

In [2]:
# Import relevant libraries
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
# UTF-8 encoding setup for console output and pandas display
import locale
import os
import sys

# Set UTF-8 encoding
if sys.platform.startswith('win'):
    # For Windows.
    # NOTE: PYTHONIOENCODING is read when an interpreter starts, so setting it
    # here only influences Python subprocesses launched from this kernel.
    os.environ['PYTHONIOENCODING'] = 'utf-8'
else:
    # For macOS/Linux: prefer en_US.UTF-8, but fall back instead of crashing
    # the notebook when that locale is not installed on the host.
    for locale_name in ('en_US.UTF-8', 'C.UTF-8'):
        try:
            locale.setlocale(locale.LC_ALL, locale_name)
            break
        except locale.Error:
            continue
    else:
        print("Warning: no UTF-8 locale available; keeping the current locale")

# Ensure pandas displays UTF-8 characters properly (wide/ambiguous-width
# characters are measured as two columns when aligning table output)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.unicode.ambiguous_as_wide', True)

In [4]:
# Load the seven cleaned datasets (files 0-6), one dataframe per file
(
    df0_cleaned,
    df1_cleaned,
    df2_cleaned,
    df3_cleaned,
    df4_cleaned,
    df5_cleaned,
    df6_cleaned,
) = (pd.read_parquet(f'data/cleaned_data_{file_no}.parquet') for file_no in range(7))

# Preview the first rows of df0_cleaned
df0_cleaned.head()


Unnamed: 0,ID,publishedAt,instances,source-name,location_code,location,category,year,month,new_title
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",Albidda.net,ae,United Arab Emirates,general,2020,8,A doctor warns of new symptoms of “Corona” tha...
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",Middle East Online,ae,United Arab Emirates,general,2020,8,Foldable phones lead Samsung to climb the top ...
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",Alanba.com.kw,ae,United Arab Emirates,general,2020,8,Explosive stars create calcium in - Kuwait New...
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",Al-ain.com,ae,United Arab Emirates,general,2020,8,Twitter secures the accounts of governments an...
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",محليات,ae,United Arab Emirates,general,2020,8,Corona patients without symptoms carry a viral...


In [5]:
# Initialize VADER sentiment analyzer (shared by the scoring cells below)
analyzer = SentimentIntensityAnalyzer()

# Test with a sample text containing a non-ASCII character (an emoji) to
# ensure UTF-8 text survives the round trip through print and the analyzer
sample_text = "This is a great day! 😊"
print(f"Sample text: {sample_text}")
print(f"Sentiment scores: {analyzer.polarity_scores(sample_text)}")

Sample text: This is a great day! 😊
Sentiment scores: {'neg': 0.0, 'neu': 0.402, 'pos': 0.598, 'compound': 0.8858}


In [7]:
# Add sentiment scores to each dataframe based on the 'new_title' column

def add_sentiment_scores(df, text_column='new_title', sentiment_analyzer=None):
    """Return a new dataframe with sentiment score columns appended.

    Each value of ``text_column`` is scored with ``polarity_scores`` and the
    resulting ``neg``, ``neu``, ``pos`` and ``compound`` values are added as
    float columns. The input dataframe is not modified; the result has a
    fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text to score.
    text_column : str, default 'new_title'
        Name of the column holding the text.
    sentiment_analyzer : object, optional
        Any object exposing ``polarity_scores(text) -> dict`` with the four
        score keys. Defaults to the notebook-level ``analyzer``.

    Returns
    -------
    pandas.DataFrame
        ``df`` (index reset) plus the four score columns. The score columns
        are present even when ``df`` is empty.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    """
    score_columns = ['neg', 'neu', 'pos', 'compound']
    neutral_scores = {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
    scorer = analyzer if sentiment_analyzer is None else sentiment_analyzer

    def safe_sentiment_analysis(text):
        # Missing (None/NaN) or empty text is treated as neutral
        if pd.isna(text) or text == '':
            return dict(neutral_scores)
        try:
            return scorer.polarity_scores(str(text))
        except Exception as e:
            # Best effort: report and fall back to neutral. str() keeps the
            # preview from failing when the value is not a string.
            print(f"Error processing text: {str(text)[:50]}... Error: {e}")
            return dict(neutral_scores)

    # reset_index returns a new frame, so the caller's dataframe stays untouched
    base = df.reset_index(drop=True)
    sentiments = base[text_column].apply(safe_sentiment_analysis)
    # Explicit columns guarantee the score columns exist even for empty input
    sentiment_df = pd.DataFrame(list(sentiments), columns=score_columns).astype(float)
    return pd.concat([base, sentiment_df], axis=1)

# Score every cleaned dataframe; each result keeps its own numbered name
(
    df0_sentiment,
    df1_sentiment,
    df2_sentiment,
    df3_sentiment,
    df4_sentiment,
    df5_sentiment,
    df6_sentiment,
) = (
    add_sentiment_scores(cleaned)
    for cleaned in (
        df0_cleaned,
        df1_cleaned,
        df2_cleaned,
        df3_cleaned,
        df4_cleaned,
        df5_cleaned,
        df6_cleaned,
    )
)

# Preview the first rows of df0_sentiment
df0_sentiment.head()

Unnamed: 0,ID,publishedAt,instances,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",Albidda.net,ae,United Arab Emirates,general,2020,8,A doctor warns of new symptoms of “Corona” tha...,0.149,0.851,0.0,-0.2023
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",Middle East Online,ae,United Arab Emirates,general,2020,8,Foldable phones lead Samsung to climb the top ...,0.0,0.893,0.107,0.2023
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",Alanba.com.kw,ae,United Arab Emirates,general,2020,8,Explosive stars create calcium in - Kuwait New...,0.0,0.769,0.231,0.2732
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",Al-ain.com,ae,United Arab Emirates,general,2020,8,Twitter secures the accounts of governments an...,0.0,0.881,0.119,0.3182
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",محليات,ae,United Arab Emirates,general,2020,8,Corona patients without symptoms carry a viral...,0.176,0.824,0.0,-0.4939


In [8]:
# Add a sentiment category based on the compound score where:
# positive sentiment: compound score >= 0.05
# neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
# negative sentiment: compound score <= -0.05

def categorize_sentiment(compound_score, threshold=0.05):
    """Map a compound sentiment score to 'positive', 'negative' or 'neutral'.

    Parameters
    ----------
    compound_score : float
        Compound score to classify.
    threshold : float, default 0.05
        Non-negative cut-off. Scores >= ``threshold`` are positive, scores
        <= ``-threshold`` are negative, everything in between is neutral.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'. A NaN score fails both
        comparisons and is therefore reported as 'neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative (the bands would overlap).
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if compound_score >= threshold:
        return 'positive'
    if compound_score <= -threshold:
        return 'negative'
    return 'neutral'
    
# Label every scored dataframe with a category derived from its compound score
for scored_frame in (
    df0_sentiment,
    df1_sentiment,
    df2_sentiment,
    df3_sentiment,
    df4_sentiment,
    df5_sentiment,
    df6_sentiment,
):
    scored_frame['sentiment_category'] = scored_frame['compound'].apply(categorize_sentiment)

# Preview the first rows of df0_sentiment, now including the category
df0_sentiment.head()

Unnamed: 0,ID,publishedAt,instances,source-name,location_code,location,category,year,month,new_title,neg,neu,pos,compound,sentiment_category
0,12436,2020-08-06T09:21:27Z,"[{'category': 'general', 'collectedAt': '2020-...",Albidda.net,ae,United Arab Emirates,general,2020,8,A doctor warns of new symptoms of “Corona” tha...,0.149,0.851,0.0,-0.2023,negative
1,12541,2020-08-06T15:45:39Z,"[{'category': 'general', 'collectedAt': '2020-...",Middle East Online,ae,United Arab Emirates,general,2020,8,Foldable phones lead Samsung to climb the top ...,0.0,0.893,0.107,0.2023,positive
2,12568,2020-08-06T16:43:05Z,"[{'category': 'general', 'collectedAt': '2020-...",Alanba.com.kw,ae,United Arab Emirates,general,2020,8,Explosive stars create calcium in - Kuwait New...,0.0,0.769,0.231,0.2732,positive
3,12795,2020-08-07T07:08:19Z,"[{'category': 'general', 'collectedAt': '2020-...",Al-ain.com,ae,United Arab Emirates,general,2020,8,Twitter secures the accounts of governments an...,0.0,0.881,0.119,0.3182,positive
4,13129,2020-08-07T13:33:00Z,"[{'category': 'general', 'collectedAt': '2020-...",محليات,ae,United Arab Emirates,general,2020,8,Corona patients without symptoms carry a viral...,0.176,0.824,0.0,-0.4939,negative


In [9]:
# Write each scored dataframe to its own numbered parquet file (index omitted)
for file_no, scored_frame in enumerate((
    df0_sentiment,
    df1_sentiment,
    df2_sentiment,
    df3_sentiment,
    df4_sentiment,
    df5_sentiment,
    df6_sentiment,
)):
    scored_frame.to_parquet(f"data/sentiment_data_{file_no}.parquet", index=False)