In [2]:
# Standard library imports
import os
import re
import sys
import json
from datetime import datetime
from typing import Optional
from great_tables import GT
from pattern.en import sentiment


# Third-party library imports
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import polars as pl

# Dagster imports
from dagster import (
    AssetExecutionContext,
    MaterializeResult,
    asset
)

# Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(''), '../..')))

# Local project utility imports
from utils.azure_blob_utils import (
    create_blob_client_with_connection_string, 
    read_all_parquets_from_container, 
    write_blob_to_container, 
    read_blob_from_container, 
    merge_dataframes_on_id
)
from utils.common_helpers import generate_hash

# Import the bronze_scrappe_epl_news asset so that it can be
# declared as a dependency
from assets.bronze_assets.scrappe_epl_news import bronze_scrappe_epl_news


# Load environment variables (python-dotenv)
load_dotenv()

# Get path of the config file.
# The project root is resolved explicitly instead of being read back from
# sys.path[-1]: any module imported after the sys.path.append() call above may
# append its own entry, in which case sys.path[-1] is no longer the project root.
project_root = os.path.abspath(os.path.join(os.path.abspath(''), '../..'))
scrapper_config_path = os.path.join(project_root, 'scrapper_config.json')

In [3]:
# Parse the scraper settings from the JSON config file
with open(scrapper_config_path, 'r') as config_file:
    scrapper_config = json.load(config_file)

# The Azure storage connection string has to come from the environment;
# fail early with an explicit error when it is missing
connection_string = os.getenv("CONN_STRING_AZURE_STORAGE")
if connection_string is None:
    raise EnvironmentError("Azure storage connection string not found in environment variables.")

# Build the Azure Blob Storage client from the connection string
blob_service_client = create_blob_client_with_connection_string(connection_string)
# List all blobs in the container

In [4]:
# Container and folder to read from, both taken from the scraper config
silver_container_name = scrapper_config['silver_container_name']
folder_name = scrapper_config['folder_name']

# Read the parquet data stored under that container/folder
# (presumably returned as a single polars DataFrame -- TODO confirm in utils.azure_blob_utils)
df = read_all_parquets_from_container(silver_container_name, folder_name, blob_service_client)

Successfully read parquet file from silver/epl_news/processed_data.parquet


In [5]:
# Display the dimensions of the loaded data as a quick sanity check
df.shape

(98, 5)

### Create table 'dim_article' -> ok

In [7]:
def get_dim_article_table(df):
    """Build the 'dim_article' table from the article dataframe.

    The source columns are renamed to snake_case, the result is joined on
    `team_name` with the frame returned by `get_team_table(df)`, and the
    joined `team_id` column is exposed as the foreign key `fk_team_id`.

    Args:
        df: Dataframe with at least the columns `id`, `title`,
            `publishedDate` and `teamName`.

    Returns:
        Dataframe with the columns `article_id`, `fk_team_id`,
        `article_title` and `published_at`.
    """
    # NOTE(review): get_team_table is not defined in this part of the notebook;
    # it is presumably expected to provide `team_name` and `team_id` columns.
    teams = get_team_table(df)

    renamed_columns = {
        "id": "article_id",
        "title": "article_title",
        "publishedDate": "published_at",
        "teamName": "team_name",
    }
    articles = df.rename(renamed_columns)

    # Attach the team key to every article, then expose it as a foreign key
    articles_with_team = articles.join(teams, on="team_name")
    articles_with_team = articles_with_team.rename({"team_id": "fk_team_id"})

    return articles_with_team.select(["article_id", "fk_team_id", "article_title", "published_at"])

### Create 'Reaction' table

In [9]:
def keep_pro_reactions(df):
    """Remove the articles whose title matches one of the unwanted patterns.

    Args:
        df: Dataframe with a `title` column.

    Returns:
        The rows of `df` whose title matches none of the patterns below.
    """
    # The alternatives are joined with "|" into one regex.
    # NOTE(review): because this is a regex, the trailing "?" in the first
    # alternative makes the preceding "w" optional instead of matching a
    # literal question mark -- confirm whether that is intended.
    unwanted_title_pattern = "|".join((
        r"Did you know?",
        r"the fans' verdict",
        r"Gossip",
    ))

    has_unwanted_title = pl.col("title").str.contains(unwanted_title_pattern)
    return df.filter(~has_unwanted_title)

def get_fact_pro_reaction_table(df):
    """Build the reaction rows for the articles kept by `keep_pro_reactions`.

    Each kept article becomes one reaction whose id is the article id
    suffixed with `_pro` and whose `is_fan` flag is False.

    Args:
        df: Dataframe with at least the columns `id`, `title`, `content`
            and `publishedDate`.

    Returns:
        Dataframe with the columns `reaction_id`, `fk_article_id`,
        `content`, `published_at` and `is_fan`.
    """
    pro_articles = keep_pro_reactions(df).rename({
        "id": "fk_article_id",
        "publishedDate": "published_at",
    })

    pro_reactions = pro_articles.with_columns(
        reaction_id = pl.col("fk_article_id") + '_pro',
        is_fan = False,
    )

    output_columns = ["reaction_id", "fk_article_id", "content", "published_at", "is_fan"]
    return pro_reactions.select(output_columns)

# Define a function to extract fan reactions
def extract_reactions(content, publishedDate, article_id):
    """Extract the individual fan reactions from an article's text.

    Only the text after the last "Here are some of your comments:" marker is
    parsed (the whole text when the marker is absent). Reactions are expected
    in the form "Name: reaction text", one after the other.

    Args:
        content: Full article text. `None` or an empty string yields no
            reactions.
        publishedDate: Publication date, copied unchanged into every reaction.
        article_id: Id of the article, used to build the reaction ids.

    Returns:
        A list of `(reaction_id, reaction_text, publishedDate, article_id)`
        tuples, where `reaction_id` is `"<article_id>_fan_<n>"` and `n`
        starts at 1.
    """
    # Missing content (e.g. a null cell) has nothing to extract
    if not content:
        return []

    # Start extracting after "Here are some of your comments:"
    comments_section = content.split("Here are some of your comments:")[-1].strip()

    # A reaction starts at "Name: " and runs up to the next "Name: " or the end
    # of the text. The lookahead requires whitespace after the colon, exactly
    # like the start of a reaction does; otherwise a colon inside a reaction
    # (a "2:1" scoreline, a URL, ...) would cut it short and drop the remainder.
    # NOTE(review): `\w+` captures a single word, so for a multi-word name only
    # the last word is treated as the name -- confirm against the real data.
    pattern = re.compile(r'(\w+):\s+(.+?)(?=\w+:\s|$)', re.DOTALL)

    # Group 2 is the reaction text; the fan name (group 1) is not kept
    return [
        (f"{article_id}_fan_{idx}", match.group(2).strip(), publishedDate, article_id)
        for idx, match in enumerate(pattern.finditer(comments_section), start=1)
    ]

def get_fact_fan_reaction_table(df):
    """Build the reaction rows extracted from the "fans' verdict" articles.

    Every article whose title contains "the fans' verdict" is passed to
    `extract_reactions`; each extracted reaction becomes one row with
    `is_fan` set to True.

    Args:
        df: Dataframe with at least the columns `id`, `title`, `content`
            and `publishedDate`.

    Returns:
        Dataframe with the columns `reaction_id`, `fk_article_id`,
        `content`, `published_at` and `is_fan`. The frame keeps this typed
        schema even when no reaction is found.
    """
    # Keep only the rows whose 'title' column contains the pattern
    title_pattern = r"the fans' verdict"
    df_filtered = df.filter(pl.col("title").str.contains(title_pattern))

    # Collect the reactions of every matching article
    reaction_list = []
    for row in df_filtered.iter_rows(named=True):
        # A null content cell has no reactions to extract
        if row['content'] is None:
            continue
        reaction_list.extend(extract_reactions(row['content'], row['publishedDate'], row['id']))

    # Give the new frame explicit dtypes instead of relying on inference:
    # with an empty reaction list nothing can be inferred, and the date/id
    # columns must keep the dtypes of their source columns so that the result
    # can be concatenated with the pro reaction table.
    reaction_schema = {
        'reaction_id': pl.Utf8,
        'content': pl.Utf8,
        'published_at': df.schema['publishedDate'],
        'fk_article_id': df.schema['id'],
    }
    df_processed = pl.DataFrame(reaction_list, schema=reaction_schema, orient="row")

    df_processed = df_processed.with_columns(
        is_fan = True
    )

    return df_processed.select(['reaction_id', 'fk_article_id', 'content', 'published_at', 'is_fan'])

def create_reaction_table(df):
    """Build the complete reaction table.

    Args:
        df: Article dataframe passed to both reaction builders.

    Returns:
        The fan reaction rows followed by the pro reaction rows, stacked
        into one dataframe.
    """
    reaction_frames = [
        get_fact_fan_reaction_table(df),
        get_fact_pro_reaction_table(df),
    ]
    return pl.concat(reaction_frames)