In [142]:
# Standard library imports
import os
import sys
import json
from pattern.en import sentiment
from datetime import datetime
import hashlib


# Third-party library imports
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import polars as pl

# Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(''), '../..')))

# Local project utility imports
from utils.azure_blob_utils import (
    create_blob_client_with_connection_string, 
    read_blob_from_container,
    read_all_parquets_from_container
)

# load assets bronze_scrappe_epl_news
# in order to be used as dependency
from assets.bronze_assets.scrappe_epl_news import scrappe_epl_news


load_dotenv()

# Get path of the config file
scrapper_config_path = os.path.join(sys.path[-1], 'scrapper_config.json')

In [143]:
# Load the JSON file
with open(scrapper_config_path, 'r') as file:
    scrapper_config = json.load(file)

# Load environment variables
connection_string = os.environ.get("CONN_STRING_AZURE_STORAGE")
if connection_string is None:
    raise EnvironmentError("Azure storage connection string not found in environment variables.")

# Create a blob client for Azure Blob Storage
blob_service_client = create_blob_client_with_connection_string(connection_string)
# List all blobs in the container

In [144]:
folder_name = scrapper_config['folder_name']

### BRONZE

In [145]:
bronze_container_name = scrapper_config['bronze_container_name']

df_epl_news = read_all_parquets_from_container(bronze_container_name, folder_name, blob_service_client)
df_epl_news.head()

Successfully read parquet file from bronze/epl_news/epl_news_2024_10_14.parquet
Successfully read parquet file from bronze/epl_news/epl_news_2024_10_19.parquet


_hashedId,_extractedDate,teamName,page,html
str,datetime[μs],str,i8,str
"""9d4702a75dda24a4457b469e9e2577…",2024-10-14 20:50:02,"""AFC Bournemouth""",1,"""<!DOCTYPE html><html lang=""en-…"
"""63cd46598c3f0164ccc77b4ba38090…",2024-10-14 20:50:02,"""AFC Bournemouth""",2,"""<!DOCTYPE html><html lang=""en-…"
"""4b44c64ebb2c9e14de13eb15292330…",2024-10-14 20:50:02,"""Arsenal""",1,"""<!DOCTYPE html><html lang=""en-…"
"""18bdeafc8ca33ccfa530d07470a354…",2024-10-14 20:50:02,"""Arsenal""",2,"""<!DOCTYPE html><html lang=""en-…"
"""3cef8d9424daf9ff6a38c1c2a325d7…",2024-10-14 20:50:02,"""Aston Villa""",1,"""<!DOCTYPE html><html lang=""en-…"


### SILVER

In [146]:
silver_container_name = scrapper_config['silver_container_name']

df_epl_news = read_blob_from_container(silver_container_name, f"{folder_name}/processed_data.parquet", blob_service_client)
df_epl_news.head()

Successfully read blob from silver/epl_news/processed_data.parquet


teamName,publishedDate,title,content,id
str,date,str,str,str
"""Nottingham Forest""",2024-10-08,"""Is the new VAR working?""","""A VAR related question was put…","""1a3517a2e392df62"""
"""Ipswich Town""",2024-10-14,"""'I'm sticking with my relegati…","""A relegation-related question …","""7003ce288ae187d6"""
"""Brighton & Hove Albion""",2024-09-28,"""Brighton's extreme approach un…","""There is no shame in losing th…","""9000289e879afec1"""
"""Chelsea""",2024-10-16,"""'Liverpool beware' - Chelsea a…","""I can't wait to get back to th…","""3bd89bd4d4eca995"""
"""AFC Bournemouth""",2024-10-05,"""'We have to be more ruthless' …","""Bournemouth manager Andoni Ira…","""962e77cc0727a069"""


### GOLD

In [147]:
gold_container_name = scrapper_config['gold_container_name']
folder_name = scrapper_config['folder_name']

df_date = read_blob_from_container(gold_container_name, f"{folder_name}/dim_date.parquet", blob_service_client)
df_article = read_blob_from_container(gold_container_name, f"{folder_name}/article.parquet", blob_service_client)
df_team = read_blob_from_container(gold_container_name, f"{folder_name}/dim_team.parquet", blob_service_client)
df_reaction = read_blob_from_container(gold_container_name, f"{folder_name}/reaction.parquet", blob_service_client)
df_sentiment = read_blob_from_container(gold_container_name, f"{folder_name}/dim_sentiment.parquet", blob_service_client)
df_fact_reaction = read_blob_from_container(gold_container_name, f"{folder_name}/fact_reaction.parquet", blob_service_client)
df_fact_title = read_blob_from_container(gold_container_name, f"{folder_name}/fact_title.parquet", blob_service_client)


Successfully read blob from gold/epl_news/dim_date.parquet
Successfully read blob from gold/epl_news/article.parquet
Successfully read blob from gold/epl_news/dim_team.parquet
Successfully read blob from gold/epl_news/reaction.parquet
Successfully read blob from gold/epl_news/dim_sentiment.parquet
Successfully read blob from gold/epl_news/fact_reaction.parquet
Successfully read blob from gold/epl_news/fact_title.parquet


In [177]:
def create_sentiment_trend_table(
        df_fact_reaction: pl.DataFrame,
        df_fact_title: pl.DataFrame,
        df_date: pl.DataFrame) -> pl.DataFrame:

    df_fact_reaction= df_fact_reaction.select(
        [
            'fk_sentiment_id', 'fk_team_id', 'fk_date_id', 'sentiment_score',
            'subjectivity_score', 'is_subjective',
        ]
    )

    df_fact_title= df_fact_title.select(
        [
            'fk_sentiment_id', 'fk_team_id', 'fk_date_id', 'sentiment_score',
            'subjectivity_score', 'is_subjective',
        ]
    )

    df_fact_sentiment_trend = pl.concat([df_fact_reaction, df_fact_title])

    df_date = df_date.select(['date_id', 'year', 'week_of_year'])

    df_fact_sentiment_trend = df_fact_sentiment_trend \
        .join(df_date, left_on='fk_date_id', right_on='date_id', how='left')

    df_fact_sentiment_trend = df_fact_sentiment_trend \
        .join(df_sentiment, left_on='fk_sentiment_id', right_on='sentiment_id', how='left')

    df_fact_sentiment_trend = df_fact_sentiment_trend.with_columns(
        trend_id = pl.concat_str(
            [
                pl.lit('trend'), pl.col('fk_team_id'), pl.col('year'), pl.col('week_of_year')
            ], separator='_'
        )
    )

    # Count the sentiment_label occurrences for each trend_id
    df_fact_sentiment_trend = df_fact_sentiment_trend.group_by("trend_id").agg([
        pl.col("fk_team_id").first().alias("fk_team_id"),
        pl.col("fk_date_id").first().alias("fk_date_id"),
        pl.len().alias("total_articles"),  # Count the total number of articles
        (pl.col("is_subjective").mean() * 100).alias("subjectivity_level"),  # Calculate subjectivity level as a percentage
        (pl.col("sentiment_value").mean() * 100).alias("trend_value"),  # Adding trend value as mean sentiment_value
        (pl.col("sentiment_label") == "positive").sum().alias("total_positive_articles"),
        (pl.col("sentiment_label") == "negative").sum().alias("total_negative_articles"),
        (pl.col("sentiment_label") == "neutral").sum().alias("total_neutral_articles")
    ])
    
    return df_fact_sentiment_trend

In [178]:
create_sentiment_trend_table(df_fact_reaction, df_fact_title, df_date)

InvalidOperationError: arithmetic on string and numeric not allowed, try an explicit cast first

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'group_by' <---
AGGREGATE
	[col("fk_team_id").first().alias("fk_team_id"), col("fk_date_id").first().alias("fk_date_id"), len().alias("total_articles"), [(col("is_subjective").mean()) * (dyn int: 100)].alias("subjectivity_level"), [(col("sentiment_value").mean()) * (100.0)].alias("trend_value")] BY [col("trend_id")] FROM
  DF ["fk_sentiment_id", "fk_team_id", "fk_date_id", "sentiment_score"]; PROJECT */11 COLUMNS; SELECTION: None