In [13]:
# Standard library imports
import os
import sys
import json
from datetime import datetime
import hashlib


# Third-party library imports
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pattern.en import sentiment
import polars as pl

# Resolve the project root once (two levels above the current working
# directory) so the exact same value is used for sys.path and for locating the
# config file. Relying on sys.path[-1] is fragile: any import executed after
# the append may add its own entries to sys.path.
project_root = os.path.abspath(os.path.join(os.path.abspath(''), '../..'))

# Add project root to sys.path (only once, so re-running this cell does not
# keep appending duplicate entries)
if project_root not in sys.path:
    sys.path.append(project_root)

# Local project utility imports
from utils.azure_blob_utils import (
    create_blob_client_with_connection_string, 
    read_blob_from_container,
    read_all_parquets_from_container
)

# Import the bronze asset scrappe_epl_news
# so that it can be used as a dependency
from assets.bronze_assets.scrappe_epl_news import scrappe_epl_news


# Load variables defined in a .env file into the process environment
load_dotenv()

# Path of the scrapper config file, which lives in the project root
scrapper_config_path = os.path.join(project_root, 'scrapper_config.json')

In [14]:
# Load the scrapper configuration from its JSON file.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(scrapper_config_path, 'r', encoding='utf-8') as file:
    scrapper_config = json.load(file)

# Read the Azure storage connection string from the environment
connection_string = os.environ.get("CONN_STRING_AZURE_STORAGE")
# An empty value is as unusable as a missing one, so both fail fast here
# instead of surfacing later as an opaque client error.
if not connection_string:
    raise EnvironmentError("Azure storage connection string not found in environment variables.")

# Create a blob client for Azure Blob Storage
blob_service_client = create_blob_client_with_connection_string(connection_string)

In [15]:
# Folder name taken from the scrapper config; the cells below use it to
# address the blobs inside each container.
folder_name = scrapper_config['folder_name']

### BRONZE

In [16]:
# Bronze layer: the container name comes from the scrapper config
bronze_container_name = scrapper_config['bronze_container_name']

# Read the parquet data stored under `folder_name` in the bronze container
df_epl_news = read_all_parquets_from_container(
    bronze_container_name,
    folder_name,
    blob_service_client,
)

# Preview the first rows
df_epl_news.head()

Successfully read parquet file from bronze/epl_news/epl_news_2024_10_14.parquet


_hashedId,_extractedDate,teamName,page,html
str,datetime[μs],str,i8,str
"""9d4702a75dda24a4457b469e9e2577…",2024-10-14 20:50:02,"""AFC Bournemouth""",1,"""<!DOCTYPE html><html lang=""en-…"
"""63cd46598c3f0164ccc77b4ba38090…",2024-10-14 20:50:02,"""AFC Bournemouth""",2,"""<!DOCTYPE html><html lang=""en-…"
"""4b44c64ebb2c9e14de13eb15292330…",2024-10-14 20:50:02,"""Arsenal""",1,"""<!DOCTYPE html><html lang=""en-…"
"""18bdeafc8ca33ccfa530d07470a354…",2024-10-14 20:50:02,"""Arsenal""",2,"""<!DOCTYPE html><html lang=""en-…"
"""3cef8d9424daf9ff6a38c1c2a325d7…",2024-10-14 20:50:02,"""Aston Villa""",1,"""<!DOCTYPE html><html lang=""en-…"


### SILVER

In [17]:
# Silver layer: container name from the config, blob path built from the folder
silver_container_name = scrapper_config['silver_container_name']
silver_blob_path = f"{folder_name}/processed_data.parquet"

# NOTE: this rebinds df_epl_news, which the bronze cell also assigns; after
# this cell the name refers to the silver data.
df_epl_news = read_blob_from_container(
    silver_container_name,
    silver_blob_path,
    blob_service_client,
)

# Preview the first rows
df_epl_news.head()

Successfully read blob from silver/epl_news/processed_data.parquet


teamName,publishedDate,title,content,id
str,date,str,str,str
"""Everton""",2024-10-01,"""McNeil 'only deals in goal-of-…","""Ten minutes in and the overwhe…","""f7d04530e69d8f68"""
"""Nottingham Forest""",2024-10-06,"""Are Forest the most advanced d…","""Nottingham Forest have taken a…","""cdd2fbfbceedcd95"""
"""Liverpool""",2024-10-10,"""Can managers openly admit ment…","""This video can not be played …","""c78b9e565ad3a1ad"""
"""Brentford""",2024-10-01,"""Fast starts... thin bench""","""Don't say I didn't warn you! I…","""d62c96c9f56a4946"""
"""Aston Villa""",2024-10-03,"""'The atmosphere was something …","""Former Aston Villa midfielder …","""2915c60e807fad38"""


### GOLD

In [18]:
# Gold layer: container and folder names come from the scrapper config
gold_container_name = scrapper_config['gold_container_name']
folder_name = scrapper_config['folder_name']


def _read_gold_table(file_name):
    """Read `<folder_name>/<file_name>` from the gold container."""
    return read_blob_from_container(
        gold_container_name,
        f"{folder_name}/{file_name}",
        blob_service_client,
    )


# One frame per gold table, read in the same order as before
df_date = _read_gold_table("dim_date.parquet")
# NOTE(review): the output recorded for this cell reports "dim_article.parquet",
# while this call requests "article.parquet" - confirm which blob name is current.
df_article = _read_gold_table("article.parquet")
df_team = _read_gold_table("dim_team.parquet")
df_reaction = _read_gold_table("reaction.parquet")
df_sentiment = _read_gold_table("dim_sentiment.parquet")
df_type = _read_gold_table("dim_type.parquet")
df_fact_reaction = _read_gold_table("fact_reaction.parquet")


Successfully read blob from gold/epl_news/dim_date.parquet
Successfully read blob from gold/epl_news/dim_article.parquet
Successfully read blob from gold/epl_news/dim_team.parquet
Successfully read blob from gold/epl_news/reaction.parquet
Successfully read blob from gold/epl_news/dim_sentiment.parquet
Successfully read blob from gold/epl_news/dim_type.parquet
Successfully read blob from gold/epl_news/fact_reaction.parquet


In [19]:
def get_sentiment(polarity: float, threshold: float) -> str:
    """
    Classify a polarity score as 'negative', 'positive' or 'neutral'.

    :param polarity: The polarity score, expected in the range [-1, 1].
    :param threshold: Half-width of the neutral band around 0.
    :return: 'negative' when -1 <= polarity < -threshold,
        'positive' when threshold < polarity <= 1,
        'neutral' for everything else. "Everything else" covers the closed band
        [-threshold, threshold] and also any value outside [-1, 1].
    """
    negative_cutoff = -threshold
    positive_cutoff = threshold

    # Strictly below the neutral band, but not below the documented minimum
    if -1 <= polarity < negative_cutoff:
        return 'negative'

    # Strictly above the neutral band, but not above the documented maximum
    if positive_cutoff < polarity <= 1:
        return 'positive'

    # Both band edges, and any out-of-range value, end up here
    return 'neutral'

def is_subjectivity(subjectivity: float) -> str:
    """
    Classify a subjectivity score as 'subjective' or 'objective'.

    :param subjectivity: The subjectivity score, expected in the range [0, 1].
    :return: 'subjective' when 0.5 <= subjectivity <= 1, otherwise 'objective'
        (this includes any value outside [0, 1]).
    """
    # Upper half of the scale, both ends inclusive, counts as subjective
    return 'subjective' if 0.5 <= subjectivity <= 1 else 'objective'

def extract_sentiment(reaction_id: str, content: str, threshold_polarity: float) -> tuple:
    """
    Run sentiment analysis on a piece of text and label the resulting scores.

    :param reaction_id: The unique identifier for the reaction; returned unchanged.
    :param content: The text passed to `sentiment` (imported from pattern.en).
    :param threshold_polarity: The neutral-band threshold forwarded to `get_sentiment`.
    :return: A 5-tuple, in this order:
        - reaction_id (str): The ID that was passed in.
        - polarity_value (float): The raw polarity score (expected in [-1, 1]).
        - polarity (str): 'negative', 'positive' or 'neutral'.
        - subjectivity_value (float): The raw subjectivity score (expected in [0, 1]).
        - subjectivity (str): 'subjective' or 'objective'.
    """
    # The first two items of the result are read as polarity and subjectivity
    scores = sentiment(content)
    polarity_value, subjectivity_value = scores[0], scores[1]

    # Raw scores are returned alongside their labels
    return (
        reaction_id,
        polarity_value,
        get_sentiment(polarity_value, threshold_polarity),
        subjectivity_value,
        is_subjectivity(subjectivity_value),
    )


def create_fact_reaction(df_reaction: pl.DataFrame, threshold: float) -> pl.DataFrame:
    """
    Build a Polars DataFrame holding the sentiment analysis of every reaction.

    :param df_reaction: A Polars DataFrame with at least the columns
        'reaction_id' and 'content'.
    :param threshold: The neutral-band threshold passed to `extract_sentiment`.
    :return: A Polars DataFrame with one row per input row and the columns
        reaction_id, sentiment_score, sentiment_label, subjectivity_score
        and is_subjective.
    """
    output_columns = [
        "reaction_id",
        "sentiment_score",
        "sentiment_label",
        "subjectivity_score",
        "is_subjective",
    ]

    # One result tuple per reaction, in the input row order
    analysed_rows = [
        extract_sentiment(record['reaction_id'], record['content'], threshold)
        for record in df_reaction.iter_rows(named=True)
    ]

    # Each tuple is one row, hence orient="row"
    return pl.DataFrame(analysed_rows, schema=output_columns, orient="row")

In [31]:
# Key for each article's title: the article id followed by "_title"
title_key_expr = pl.concat_str(
    [pl.col('article_id'), pl.lit('title')],
    separator='_',
)

# Add (or overwrite) the fk_title_id column on df_article
df_article = df_article.with_columns(title_key_expr.alias('fk_title_id'))

In [32]:
# Display df_article to check the fk_title_id column added in the previous cell
df_article

article_id,fk_team_id,article_title,published_at,fk_title_id
str,i64,str,date,str
"""f7d04530e69d8f68""",8,"""McNeil 'only deals in goal-of-…",2024-10-01,"""f7d04530e69d8f68_title"""
"""cdd2fbfbceedcd95""",15,"""Are Forest the most advanced d…",2024-10-06,"""cdd2fbfbceedcd95_title"""
"""c78b9e565ad3a1ad""",11,"""Can managers openly admit ment…",2024-10-10,"""c78b9e565ad3a1ad_title"""
"""d62c96c9f56a4946""",4,"""Fast starts... thin bench""",2024-10-01,"""d62c96c9f56a4946_title"""
"""2915c60e807fad38""",3,"""'The atmosphere was something …",2024-10-03,"""2915c60e807fad38_title"""
…,…,…,…,…
"""4c464c176e47df86""",4,"""Tottenham 3-1 Brentford: Bees …",2024-09-21,"""4c464c176e47df86_title"""
"""3b9c837ccf9fd114""",13,"""Gossip: Ten Hag's future in th…",2024-10-08,"""3b9c837ccf9fd114_title"""
"""a3bef726179d42a8""",1,"""Liverpool 3-0 Bournemouth: Did…",2024-09-21,"""a3bef726179d42a8_title"""
"""9000289e879afec1""",5,"""Brighton's extreme approach un…",2024-09-28,"""9000289e879afec1_title"""
