In [12]:
# Standard library imports
import os
import sys
import json
from datetime import datetime
import hashlib
from textblob import TextBlob


# Third-party library imports
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import polars as pl

# Resolve the project root once (two levels above the current working
# directory) so the import path and the config path share the same value.
project_root = os.path.abspath(os.path.join(os.path.abspath(''), '../..'))

# Make the project's packages importable. Skip the append when the root is
# already registered so re-running this cell does not pile up duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)

# Local project utility imports
from utils.azure_blob_utils import (
    create_blob_client_with_connection_string,
    read_blob_from_container,
    read_all_parquets_from_container
)

# Import the scrappe_epl_news bronze asset
# so that it can be used as a dependency
from assets.bronze_assets.scrappe_epl_news import scrappe_epl_news


# Load variables from a .env file (if any) into the process environment
load_dotenv()

# Path of the scraper config file at the project root.
# Built from project_root rather than sys.path[-1]: the last sys.path entry
# is only the project root as long as nothing imported above appends its own.
scrapper_config_path = os.path.join(project_root, 'scrapper_config.json')

In [13]:
# Load the scraper configuration. Decode explicitly as UTF-8 so the result
# does not depend on the platform's default text encoding.
with open(scrapper_config_path, 'r', encoding='utf-8') as file:
    scrapper_config = json.load(file)

# Read the Azure Storage connection string from the environment.
# An empty value is as unusable as a missing one, so both fail fast here
# instead of surfacing later as an opaque client error.
connection_string = os.environ.get("CONN_STRING_AZURE_STORAGE")
if not connection_string:
    raise EnvironmentError("Azure storage connection string not found in environment variables.")

# Create the blob service client that the read helpers below receive
blob_service_client = create_blob_client_with_connection_string(connection_string)

In [204]:
# Folder name from the scraper config; used below as the path prefix of the blobs to read
folder_name = scrapper_config['folder_name']

### BRONZE

In [205]:
# Bronze layer: the container name comes from the scraper config
bronze_container_name = scrapper_config['bronze_container_name']

# Load the parquet files found under folder_name in the bronze container
# and preview the first rows of the resulting frame
df_epl_news = read_all_parquets_from_container(
    bronze_container_name,
    folder_name,
    blob_service_client,
)
df_epl_news.head()

Successfully read parquet file from bronze/epl_news/epl_news_2024_10_14.parquet
Successfully read parquet file from bronze/epl_news/epl_news_2024_10_19.parquet
Successfully read parquet file from bronze/epl_news/epl_news_2024_10_20.parquet


_hashedId,_extractedDate,teamName,page,html
str,datetime[μs],str,i8,str
"""9d4702a75dda24a4457b469e9e2577…",2024-10-14 20:50:02,"""AFC Bournemouth""",1,"""<!DOCTYPE html><html lang=""en-…"
"""63cd46598c3f0164ccc77b4ba38090…",2024-10-14 20:50:02,"""AFC Bournemouth""",2,"""<!DOCTYPE html><html lang=""en-…"
"""4b44c64ebb2c9e14de13eb15292330…",2024-10-14 20:50:02,"""Arsenal""",1,"""<!DOCTYPE html><html lang=""en-…"
"""18bdeafc8ca33ccfa530d07470a354…",2024-10-14 20:50:02,"""Arsenal""",2,"""<!DOCTYPE html><html lang=""en-…"
"""3cef8d9424daf9ff6a38c1c2a325d7…",2024-10-14 20:50:02,"""Aston Villa""",1,"""<!DOCTYPE html><html lang=""en-…"


In [212]:
# (rows, columns) of the frame currently bound to df_epl_news
df_epl_news.shape

(280, 5)

In [214]:
# Select the rows whose "html" column holds the literal 'HTTP error 500'
# (presumably pages the scraper failed to fetch; confirm against the scraper).
# The result gets its own name instead of overwriting df_epl_news, so the
# frame loaded above stays available and this cell can be re-run safely.
df_epl_news_http_500 = df_epl_news.filter(pl.col('html') == 'HTTP error 500')
df_epl_news_http_500.head()

### SILVER

In [199]:
# Silver layer: the container name comes from the scraper config
silver_container_name = scrapper_config['silver_container_name']

# Blob path of the processed parquet inside the silver container
silver_blob_path = f"{folder_name}/processed_data.parquet"

# NOTE: this rebinds df_epl_news, replacing whatever frame it held before
df_epl_news = read_blob_from_container(
    silver_container_name,
    silver_blob_path,
    blob_service_client,
)
df_epl_news.head()

Successfully read blob from silver/epl_news/processed_data.parquet


teamName,publishedDate,title,content,id
str,date,str,str,str
"""Manchester United""",2024-10-06,"""'We are all on board together'…","""Manchester United manager Erik…","""b5a547d6bd133998"""
"""Brighton & Hove Albion""",2024-09-20,"""Gossip: 'Next Haaland' attract…","""Newcastle are monitoring 18-ye…","""228cc1b3c0579234"""
"""AFC Bournemouth""",2024-10-09,"""'In career-best form for club …","""As Bournemouth reflect on the …","""d7779ed3529fccd4"""
"""Brentford""",2024-09-21,"""Tottenham 3-1 Brentford: Bees …","""It doesn't get much tougher th…","""4c464c176e47df86"""
"""Ipswich Town""",2024-09-21,"""Sutton's predictions: Southamp…","""Sutton is making predictions f…","""d8dee1f07bcad112"""


### GOLD

In [200]:
# Gold layer: container and folder names come from the scraper config
gold_container_name = scrapper_config['gold_container_name']
folder_name = scrapper_config['folder_name']


def _read_gold_table(table_name):
    """Read ``<folder_name>/<table_name>.parquet`` from the gold container.

    Args:
        table_name: File stem of the parquet blob, e.g. ``"dim_date"``.

    Returns:
        Whatever ``read_blob_from_container`` returns for that blob.
    """
    return read_blob_from_container(
        gold_container_name,
        f"{folder_name}/{table_name}.parquet",
        blob_service_client,
    )


# One read per gold table, in the same order as before; only the file stem varies
df_date = _read_gold_table("dim_date")
df_article = _read_gold_table("article")
df_team = _read_gold_table("dim_team")
df_reaction = _read_gold_table("reaction")
df_sentiment = _read_gold_table("dim_sentiment")
df_fact_reaction = _read_gold_table("fact_reaction")
df_fact_title = _read_gold_table("fact_title")


Successfully read blob from gold/epl_news/dim_date.parquet
Successfully read blob from gold/epl_news/article.parquet
Successfully read blob from gold/epl_news/dim_team.parquet
Successfully read blob from gold/epl_news/reaction.parquet
Successfully read blob from gold/epl_news/dim_sentiment.parquet
Successfully read blob from gold/epl_news/fact_reaction.parquet
Successfully read blob from gold/epl_news/fact_title.parquet


In [201]:
# Display the frame bound to df_epl_news.
# NOTE(review): df_epl_news was last assigned from the silver container, so this
# shows the silver data rather than one of the gold tables read above; confirm
# that is intended for the GOLD section.
df_epl_news

teamName,publishedDate,title,content,id
str,date,str,str,str
"""Manchester United""",2024-10-06,"""'We are all on board together'…","""Manchester United manager Erik…","""b5a547d6bd133998"""
"""Brighton & Hove Albion""",2024-09-20,"""Gossip: 'Next Haaland' attract…","""Newcastle are monitoring 18-ye…","""228cc1b3c0579234"""
"""AFC Bournemouth""",2024-10-09,"""'In career-best form for club …","""As Bournemouth reflect on the …","""d7779ed3529fccd4"""
"""Brentford""",2024-09-21,"""Tottenham 3-1 Brentford: Bees …","""It doesn't get much tougher th…","""4c464c176e47df86"""
"""Ipswich Town""",2024-09-21,"""Sutton's predictions: Southamp…","""Sutton is making predictions f…","""d8dee1f07bcad112"""
…,…,…,…,…
"""Leicester City""",2024-10-15,"""'A better team than their posi…","""Former Leicester City winger M…","""7693c9b40a225c22"""
"""Manchester United""",2024-10-10,"""Man Utd need to 'find a spark'…","""One of the misconceptions we d…","""83582d92aca9bdcd"""
"""Nottingham Forest""",2024-08-30,"""'We are confident Ward-Prowse …","""Nottingham Forest's newest rec…","""70b451b76b36be39"""
"""Brentford""",2024-10-01,"""Fast starts... thin bench""","""Don't say I didn't warn you! I…","""d62c96c9f56a4946"""
