In [12]:
# Standard library imports (note: great_tables and pattern.en below are third-party)
import os
import re
import sys
import json
from datetime import datetime
from typing import Optional
from great_tables import GT
from pattern.en import sentiment


# Third-party library imports
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import polars as pl

# Dagster imports
from dagster import (
    AssetExecutionContext,
    MaterializeResult,
    asset
)

# Add project root to sys.path (presumably so the local `utils` and `assets`
# packages imported further down can be resolved).
# os.path.abspath('') resolves to the current working directory, so the root
# is taken to be two directory levels above wherever the kernel is running.
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(''), '../..')))

# Local project utility imports
from utils.azure_blob_utils import (
    create_blob_client_with_connection_string, 
    read_all_parquets_from_container, 
    write_blob_to_container, 
    read_blob_from_container, 
    merge_dataframes_on_id
)
from utils.common_helpers import generate_hash

# Import the bronze_scrappe_epl_news asset
# so that it can be used as a dependency
from assets.bronze_assets.scrappe_epl_news import bronze_scrappe_epl_news


load_dotenv()

# Get path of the config file.
# The project root is recomputed with the same expression that was appended to
# sys.path earlier rather than read back via sys.path[-1]: any import executed
# after that append is free to add its own entries to sys.path, in which case
# the last entry would no longer be the project root.
_project_root = os.path.abspath(os.path.join(os.path.abspath(''), '../..'))
scrapper_config_path = os.path.join(_project_root, 'scrapper_config.json')

In [13]:
# Load the JSON file.
# The encoding is passed explicitly so that reading the config does not depend
# on the platform's default locale encoding (JSON text is UTF-8).
with open(scrapper_config_path, 'r', encoding='utf-8') as file:
    scrapper_config = json.load(file)

# Load environment variables
connection_string = os.environ.get("CONN_STRING_AZURE_STORAGE")
# Reject a variable that is defined but empty (e.g. `CONN_STRING_AZURE_STORAGE=`
# in a .env file) as well as one that is missing, so the failure is reported
# here instead of surfacing later inside the blob client.
if not connection_string:
    raise EnvironmentError("Azure storage connection string not found in environment variables.")

# Create a blob client for Azure Blob Storage
blob_service_client = create_blob_client_with_connection_string(connection_string)

In [15]:
# Container and folder that hold the gold-layer parquet files
gold_container_name = scrapper_config['gold_container_name']
folder_name = scrapper_config['folder_name']

# Read the four gold tables one after another, in this order, and bind each
# result to its own DataFrame variable.
df_date, df_article, df_team, df_reaction = (
    read_blob_from_container(gold_container_name, blob_path, blob_service_client)
    for blob_path in (
        f"{folder_name}/dim_date.parquet",
        f"{folder_name}/dim_article.parquet",
        f"{folder_name}/dim_team.parquet",
        f"{folder_name}/reaction.parquet",
    )
)

Successfully read blob from gold/epl_news/dim_date.parquet
Successfully read blob from gold/epl_news/dim_article.parquet
Successfully read blob from gold/epl_news/dim_team.parquet
Successfully read blob from gold/epl_news/reaction.parquet


In [47]:
def get_sentiment(polarity: float, threshold: float) -> str:
    """
    Classify a polarity score as 'negative', 'positive' or 'neutral'.

    :param polarity: The polarity score, expected to lie in the range [-1, 1].
    :param threshold: Half-width of the neutral band centred on zero.
    :return: 'negative' when polarity lies in [-1, -threshold),
        'positive' when polarity lies in (threshold, 1],
        and 'neutral' for every other value. That covers the closed band
        [-threshold, threshold] and also any score outside [-1, 1].
    """
    # Negative side: bounded below by -1 (inclusive), above by -threshold (exclusive)
    if -1 <= polarity < -threshold:
        return 'negative'

    # Positive side: bounded below by threshold (exclusive), above by 1 (inclusive)
    if threshold < polarity <= 1:
        return 'positive'

    # Anything that matched neither side is reported as neutral
    return 'neutral'

def is_subjectivity(subjectivity: float) -> str:
    """
    Label a subjectivity score as 'subjective' or 'objective'.

    :param subjectivity: The subjectivity score, expected to lie in the range [0, 1].
    :return: 'subjective' when the score lies in [0.5, 1]; 'objective' for any
        other value (scores below 0.5 and scores above 1 alike).
    """
    # Both bounds are inclusive; everything outside the band is objective
    return 'subjective' if 0.5 <= subjectivity <= 1 else 'objective'

def extract_sentiment(reaction_id: str, content: str, threshold_polarity: float) -> tuple:
    """
    Score a piece of text and return both the raw scores and their labels.

    :param reaction_id: The unique identifier of the reaction being scored.
    :param content: The text handed to ``sentiment`` for analysis.
    :param threshold_polarity: The neutral-band threshold forwarded to ``get_sentiment``.
    :return: A 5-tuple, in this order:
        - the reaction id with the suffix "_sentiment" appended (str)
        - polarity_value: the raw polarity score (first item of the ``sentiment`` result)
        - polarity: the label from ``get_sentiment`` ('negative', 'positive' or 'neutral')
        - subjectivity_value: the raw subjectivity score (second item of the ``sentiment`` result)
        - subjectivity: the label from ``is_subjectivity`` ('subjective' or 'objective')
    """
    # Only the first two items of the analysis result are used:
    # polarity first, subjectivity second.
    scores = sentiment(content)
    polarity_value, subjectivity_value = scores[0], scores[1]

    # Raw values and their classified labels are interleaved in the output tuple
    return (
        reaction_id + "_sentiment",
        polarity_value,
        get_sentiment(polarity_value, threshold_polarity),
        subjectivity_value,
        is_subjectivity(subjectivity_value),
    )

In [51]:
def update_sentiment(df_reaction: pl.DataFrame, threshold: float) -> pl.DataFrame:
    """
    Build a sentiment table with one row per reaction in the input frame.

    Each row of ``df_reaction`` is passed through ``extract_sentiment`` and the
    resulting tuples are assembled into a new Polars DataFrame.

    :param df_reaction: A Polars DataFrame that provides the columns 'reaction_id' and 'content'.
    :param threshold: The neutral-band threshold forwarded to ``extract_sentiment``.
    :return: A new Polars DataFrame with the columns 'reaction_id', 'polarity_value',
        'polarity', 'subjectivity_value' and 'subjectivity'.
    """
    # One result tuple per input row, produced in the frame's row order
    sentiment_records = [
        extract_sentiment(reaction['reaction_id'], reaction['content'], threshold)
        for reaction in df_reaction.iter_rows(named=True)
    ]

    # Each tuple is one row, hence orient="row"
    return pl.DataFrame(
        sentiment_records,
        schema=["reaction_id", "polarity_value", "polarity", "subjectivity_value", "subjectivity"],
        orient="row",
    )


In [52]:
# Score every reaction; polarity within +/-0.2 of zero is labelled neutral
df = update_sentiment(df_reaction, threshold=0.2)

In [53]:
# Show the sentiment table as this cell's output
df

reaction_id,polarity_value,polarity,subjectivity_value,subjectivity
str,f64,str,f64,str
"""f9537e31501382e9_fan_1_sentime…",0.33,"""positive""",0.366667,"""objective"""
"""f9537e31501382e9_fan_2_sentime…",0.166667,"""neutral""",0.316667,"""objective"""
"""f9537e31501382e9_fan_3_sentime…",0.105556,"""neutral""",0.333333,"""objective"""
"""f9537e31501382e9_fan_4_sentime…",0.575,"""positive""",0.645833,"""subjective"""
"""f9537e31501382e9_fan_5_sentime…",0.086111,"""neutral""",0.432937,"""objective"""
…,…,…,…,…
"""438ecf071ed80d0a_pro_sentiment""",0.196154,"""neutral""",0.684615,"""subjective"""
"""8de8c844fd21e63f_pro_sentiment""",0.118034,"""neutral""",0.225556,"""objective"""
"""b8f47ad44b06a0bf_pro_sentiment""",-0.015549,"""neutral""",0.426996,"""objective"""
"""c6eccc66a251d8a4_pro_sentiment""",0.141396,"""neutral""",0.477706,"""objective"""
