In [3]:
# Standard library imports
import os
import sys
import json
from datetime import datetime
from typing import Optional

# Third-party library imports
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import polars as pl

# Dagster imports
from dagster import (
    AssetExecutionContext,
    MaterializeResult,
    asset
)

# Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.path.abspath(''), '../..')))

# Local project utility imports
from utils.azure_blob_utils import (
    create_blob_client_with_connection_string, 
    read_all_parquets_from_container, 
    write_blob_to_container, 
    read_blob_from_container, 
    merge_dataframes_on_id
)
from utils.common_helpers import generate_hash

# load assets bronze_scrappe_epl_news
# in order to be used as dependency
from assets.bronze_assets.scrappe_epl_news import bronze_scrappe_epl_news


load_dotenv()

# Get path of the config file
scrapper_config_path = os.path.join(sys.path[-1], 'scrapper_config.json')

In [4]:
# Load the JSON file
with open(scrapper_config_path, 'r') as file:
    scrapper_config = json.load(file)

# Load environment variables
connection_string = os.environ.get("CONN_STRING_AZURE_STORAGE")
if connection_string is None:
    raise EnvironmentError("Azure storage connection string not found in environment variables.")

# Create a blob client for Azure Blob Storage
blob_service_client = create_blob_client_with_connection_string(connection_string)
# List all blobs in the container

In [5]:
silver_container_name = scrapper_config['silver_container_name']
folder_name = scrapper_config['folder_name']

df = read_all_parquets_from_container(silver_container_name, folder_name, blob_service_client)

Successfully read parquet file from silver/epl_news/processed_data.parquet


In [6]:
from great_tables import GT

In [37]:
#gt = GT(df.filter(pl.col("teamName") == 'Manchester United').head())
#gt = GT(df.filter(pl.col("teamName") == 'Manchester United').head())
#gt = GT(df.filter(pl.col("teamName") == 'Liverpool'))
gt = GT(df)

In [38]:
gt

teamName,publishedDate,title,content,id
West Ham United,2024-09-26 08:28:00,Liverpool 5-1 West Ham - the fans' verdict,"We asked for your thoughts after Wednesday's Carabao Cup game between Liverpool and West Ham. Here are some of your comments: Liverpool fans Daniel: A bit slow and out of shape for the first 20 minutes of the game, which ultimately cost us a goal. But as soon as Diogo Jota scored, we started to find our rhythm again and the second half was much better. More clinical, more consistent and more productive going forward. Need to keep this momentum going as much as we can! Chris: The 5-1 win is fab and going forward we look great. I wish for more from Darwin Nunez but his work rate is great. The downside is that we conceded a sloppy goal, and even Bradley let too many people ghost by him. Caiomhin Kelleher was fab too with lots of good saves and he was very commanding. Callum: I had a feeling Liverpool would be at the races today. There’s a good group there that are dying to break into the first team and results like that are asking Slot the right questions. Come on you Reds. West Ham fans Mark: Deserved losers, but like all our defeats this season, the scoreline flatters the opposition. It's nice we now have a manager who makes substitutions, but he keeps getting the team selection wrong. Michail Antonio should have started. James: Can’t help but feel hard done by. First goal looked like an offside in the buildup, second goal comes right after we should’ve had a penalty, third goal comes from a goal kick that should’ve been a corner to us. Fifth goal huge deflection. The scoreline did not reflect the game. Hoping some luck goes our way on Saturday. Richard: Another horror show. This is starting to get worrying. Going forward, at times it looks slick and promising but the midfield is just not working hard enough to protect the defence. Apart from Crysencio Summerville, the new additions are just not up to speed.",f68ca7a88483ec12
Nottingham Forest,2024-09-20 09:30:00,"'We could fill a 50,000 capacity stadium' - Marinakis","Nottingham Forest owner Evangelos Marinakis has been speaking about his plans for the City Ground on the Shut Up And Show More Football podcast: ""What we need to do is have a bigger stadium. The first priority is to do it at the City Ground - that's where the team belongs and where the tradition is. ""We have a lot of supporters and a huge waiting list for season tickets. I'm sure a 50,000 [capacity] stadium would be full to watch the team. ""This is important for the team, and most importantly for Nottingham. This is something I want to finalise, and have one of the best stadiums in England for years to come. This is my prime target right now."" Marinakis also owns Olympiakos, who became the first Greek side to win a European trophy last season and he is eyeing continental football for Forest: ""Why not? We did it for the first time ever. We were the underdogs. Why not with Nottingham Forest? We have what is needed to go all the way, but the first target is playing well in the Premier League. ""If Europe comes, it's something we will look to perform in."" Listen to the full episode on BBC Sounds",36a74633be35a7f8
Everton,2024-09-24 12:51:00,'The light at the end of the tunnel is almost blinding',"There is never a dull day at Everton Football Club. No sooner were we absorbing Saturday's draw at Leicester City and seeing the return of Jarrad Branthwaite for the Under-21s, were we treated to a Monday that was far from blue and quite simply unbelievable. The saga of selling the football club has been well documented. Two years of false dawns, questionable suitors and scaremongering about bankruptcy have been the theme as Farhad Moshiri has looked to exit the club, seemingly unconcerned by the mess that may be left behind. This all made yesterday’s events even more remarkable. A little over 10 days ago, John Textor publicly declared he saw owning Everton as being comparable to having the keys to The White House. Fast forward to yesterday and The Friedkin Group agree a deal with Moshiri, leaving Textor out in the cold and Evertonians in raptures. The speed of the deal appeared to be as quick as the dazzling feet of Iliman Ndiaye. Murmurings of the Roma owners being back on the scene surfaced last week, but in all honesty, they never went away. Having already loaned the club £200m towards the stadium, many people felt the abrupt end to the proposed deal to buy the club over the summer was all part of the bartering process. A sense of relief greeted yesterday’s update. The Friedkin Group are the preferred suitors for many Evertonians and the much craved stability at the club is within touching distance. The information we’ve seen suggests the club could well move into Bramley Moore almost debt free, a far cry from the financial concerns that have been the back story of the club for what feels like an eternity. Obviously, there is still some work to be done but at last, the light at the end of the tunnel is almost blinding. Find more from Mike Richards at Unholy Trinity, external",28cc78307647dda9
Leicester City,2024-09-25 22:32:00,Carabao Cup fourth-round draw - who is your team facing?,"The draw has been made for the Carabao Cup fourth round, with 12 Premier League clubs having made it through and Newcastle set to play their postponed tie against Wimbledon on 1 October. There are four confirmed all-Premier League fixtures, including Tottenham hosting Manchester City and holders Liverpool travelling to Brighton. Ties are scheduled to take place the week commencing 28 October. All the fixtures featuring top-flight sides are shown below: Brentford v Sheffield Wednesday Southampton v Stoke Tottenham v Manchester City AFC Wimbledon/Newcastle v Chelsea Manchester United v Leicester Brighton v Liverpool Preston North End v Arsenal Aston Villa v Crystal Palace See the full draw",bc7924ee35e4ec09
Chelsea,2024-09-30 07:31:00,Gossip: Chelsea look to offload Chilwell,"Chelsea will encourage January bids for England left-back Ben Chilwell - despite the 27-year-old's return to their first-team squad. (Mirror), external Want more transfer news? Read Monday's full gossip column",49d1574b36b19a2b
Chelsea,2024-09-25 15:37:00,Are Jackson and Palmer enough for Chelsea?,"Chelsea fans got a bit excited at the weekend with their team cruising past West Ham for an easy 3-0 win in East London. Nicolas Jackson was zooming past West Ham’s defence at will without having to do anything particularly complicated. Was it his incredible pace or was it West Ham’s defenders simply being quite slow at this level? Probably a bit of both, but Jackson is seriously rapid, among the quickest in the league. Considering Chelsea’s midfield creativity, with his speed and just average finishing, Jackson could top 20 league goals this season, the accepted bar for a top striker on form. With Cole Palmer capable of doing at least as well after his 22 league goals last season, this would make them a real threat as a top-four team. Chelsea have lacked consistency since the ownership change - and even the consistency of the ownership appears to be under threat right now. But two trustworthy top scorers would be enough to get them to where they need to be. Sign up to read more from Nevin in his Football Extra newsletter here",c6eccc66a251d8a4
Manchester United,2024-09-30 07:29:00,Catch up on the Premier League action,"Highlights and analysis from Sunday's two Premier League fixtures, plus the best of the action from the rest of the weekend. If you missed Match of the Day 2, catch up now on BBC iPlayer. And you can watch Saturday's Match of the Day here. Listen back to the weekend's full match commentaries on BBC Sounds: Arsenal 4-2 Leicester City Wolves 1-2 Liverpool Ipswich Town 2-2 Aston Villa Manchester United 0-3 Tottenham",b6dd1c4e7a49133d
Brentford,2024-09-25 08:01:00,Gossip: Brentford face competition for Wilson,"Brentford and Fulham are interested in a January move for Newcastle's 32-year-old England striker Callum Wilson. (Sun), external Before moving to Al-Ahli, former Brentford striker Ivan Toney was considered by Manchester United, but he did not fit the profile desired by manager Erik ten Hag. (ESPN), external Want more transfer stories? Read Wednesday's full gossip column",2a2d80b7684d7501
Nottingham Forest,2024-09-20 15:08:00,"Nuno on Boly, owner ambitions and attacking play","Nottingham Forest boss Nuno Espirito Santo has been speaking to the media before Sunday's Premier League trip to Brighton (kick-off 14:00 BST). Here are the key lines from his news conference: He was unable to give positive injury updates: ""The same players are out. [Willy] Boly is really close to joining the group. Unfortunately, Ibrahim Sangare and Danilo are out for a while. On Evangelos Marinakis' ambitions for the club: ""It drives everybody. He's the biggest authority in the club, so his ambition is what guides us. He wants to make this club grow and make it stable. If it [the City Ground] has 50,000, great. As long as it's here. That is what drives us. This goes from the owner to everyone at the training ground. Everyone has to commit themselves and be aware they are part of something that wants to grow. We need all on board."" After being told his Forest side have had more 'direct attacks' than any other Premier League side this season: ""I think we are creating a lot of things [but] we still have to improve accuracy. Sometimes the last pass or finish isn't good. The idea reflects the stats - we want to go forward and attack."" On the fixture schedule: ""We have to consider the people that decide the schedule. I know how hard it is to find dates, especially with the international dates. But we cannot forget the game is played by footballers and we have to respect the player. It's very difficult to play a game when you have less than 48 or 72 hours. If you repeat this over and over, then problems appear and the level of the game decreases. Let's try to protect the players."" On Sunday's opponents: ""They are a very good team with good players and a good manager. They have started well. We have to compete very well with them."" More on the Albion: ""Brighton make more offsides for their opponents than any other Premier League team. That shows how hard it is to play them. They are aggressive and have a lot of talent. They play possession football that is really difficult to play against."" Follow all of Friday's Premier League news conferences and the rest of the day's football news Listen to full commentary of Brighton v Nottingham Forest from 14:00 BST on Sunday on BBC Radio 5 Live",5984d046a358b41d
Wolverhampton Wanderers,2024-09-19 12:29:00,'Is there a point where board pressure O'Neil?',"This video can not be played Find more from Dave Azzopardi at Talking Wolves, external",87aab1190e7cc451


In [49]:
df.shape

(115, 5)

In [None]:
# Define regex patterns for the strings you want to filter out
patterns_to_filter = [
    r"Catch up on the Premier League action", # ok (on title)
    r"Follow (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'s (Premier League games|Carabao Cup)",  # ok (on tile)
    r"Follow\s+([A-Za-z\s]+)\s+v\s+([A-Za-z\s]+)",
    r"who is your team facing\?" # ok (on title)
]


In [58]:
# Define regex patterns for the strings you want to filter out
patterns_to_filter = [
    r"Follow\s+([A-Za-z\s]+)\s+v\s+([A-Za-z\s]+)"
]

# Combine the patterns into a single regular expression using | (OR) operator
combined_pattern = "|".join(patterns_to_filter)

# Filter out rows that do not contain the specified patterns
df_filtered = df.filter(pl.col("title").str.contains(combined_pattern))

print(df_filtered.shape)
GT(df_filtered)

(1, 5)


teamName,publishedDate,title,content,id
Tottenham Hotspur,2024-09-26 18:43:00,Follow Spurs v Qarabag,"Tottenham Hotspur get their Europa League campaign under way at 20:00 BST, with Qarabag of Azerbaijan their first opponents of the league stage. Follow all of the action and reaction here",4f7b84ff2463f3bd


In [36]:
# Define regex patterns for the strings you want to filter out
patterns_to_filter = [
    "Catch up on the Premier League action",             # Exact match
    r"Follow (Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'s Premier League games",  # Match "Follow [day]'s Premier League games"
    r"Team \w+ v Team \w+",                             # Match "Team A v Team B"
    r"who is your team facing\?"                         # Match "who is your team facing?"
]

# Combine the patterns into a single regular expression using | (OR) operator
combined_pattern = "|".join(patterns_to_filter)

# Filter out rows that do not contain the specified patterns
df_filtered = df.filter(~pl.col("content").str.contains(combined_pattern))

df_filtered

teamName,publishedDate,title,content,id
str,str,str,str,str
"""West Ham United""","""2024-09-26 08:28:00""","""Liverpool 5-1 West Ham - the f…","""We asked for your thoughts aft…","""f68ca7a88483ec12"""
"""Nottingham Forest""","""2024-09-20 09:30:00""","""'We could fill a 50,000 capaci…","""Nottingham Forest owner Evange…","""36a74633be35a7f8"""
"""Everton""","""2024-09-24 12:51:00""","""'The light at the end of the t…","""There is never a dull day at E…","""28cc78307647dda9"""
"""Leicester City""","""2024-09-25 22:32:00""","""Carabao Cup fourth-round draw …","""The draw has been made for the…","""bc7924ee35e4ec09"""
"""Chelsea""","""2024-09-30 07:31:00""","""Gossip: Chelsea look to offloa…","""Chelsea will encourage January…","""49d1574b36b19a2b"""
…,…,…,…,…
"""Nottingham Forest""","""2024-09-26 22:34:00""","""Nuno still trusts officials de…","""Morgan Gibbs-White will miss t…","""1fe2ef947881dabb"""
"""Arsenal""","""2024-10-01 13:56:00""","""How PSG loan move shaped Artet…","""In January 2001 Mikel Arteta l…","""e11ddacf53adb925"""
"""Manchester City""","""2024-09-27 07:58:00""","""What's with the dark arts?""","""There has been a lot of conver…","""acfe0ee1aedd8dc2"""
"""Aston Villa""","""2024-09-30 15:24:00""","""'Rogers' performances have bee…","""When interim manager Lee Carsl…","""962fee1f3bd60a37"""
