In [42]:
import sys
import os
import polars as pl
from great_tables import GT

# Add the project root (the parent of the current working directory) to
# sys.path so the project-local imports that follow can resolve
# (presumably those modules live there - confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry each time.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.abspath(''), '..'))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from AzureBlobLoader import AzureBlobLoader
from utils.config import CONFIG, CONN_STRING_AZURE_STORAGE

def load_data():
    """Load the project dataframes through ``AzureBlobLoader``.

    Builds an ``AzureBlobLoader`` from ``CONFIG`` and
    ``CONN_STRING_AZURE_STORAGE`` and returns the result of its
    ``load_dataframes()`` call.

    Returns:
        Whatever ``loader.load_dataframes()`` returns. The notebook indexes
        it by name (e.g. ``dataframes['df_team']``).

    Raises:
        Exception: any error raised while creating the loader or loading the
            data is reported (message and traceback) and then re-raised, so
            the real cause stays visible instead of a later failure on a
            ``None`` result.
    """
    # Local import so the block stays self-contained.
    import traceback

    try:
        loader = AzureBlobLoader(CONFIG, CONN_STRING_AZURE_STORAGE)
        dataframes = loader.load_dataframes()
    except Exception as e:
        # `print` has no `exc_info` keyword (that is a `logging` argument);
        # passing it raised a TypeError that masked the original error.
        print(f"Error loading data: {e}")
        traceback.print_exc()
        raise

    print("Data loaded successfully!")
    return dataframes

def count_article_reaction(df_fact_title, df_fact_reaction, df_team):
    """Count distinct articles and distinct reactions per team.

    Args:
        df_fact_title: frame providing ``fk_team_id`` and ``article_id``.
        df_fact_reaction: frame providing ``fk_team_id`` and ``reaction_id``.
        df_team: frame providing ``team_id`` and ``team_name``.

    Returns:
        A frame with one row per team found in the article counts, holding
        ``count_article``, ``team_name``, ``count_reaction`` and ``count``
        (the sum of both counts). Teams with articles but no reactions get
        ``count_reaction == 0`` instead of null, so ``count`` is never null.
    """

    def _unique_count_per_team(df_fact, id_col, count_col):
        # Number of distinct `id_col` values per team, labelled with the
        # team's columns from `df_team` and sorted by that count.
        return (
            df_fact
            .group_by("fk_team_id")
            .agg(pl.col(id_col).n_unique().alias(count_col))
            .join(df_team, left_on="fk_team_id", right_on="team_id", how="inner")
            .drop("fk_team_id")
            .sort(count_col)
        )

    df_nb_articles = _unique_count_per_team(df_fact_title, "article_id", "count_article")
    df_nb_reaction = _unique_count_per_team(df_fact_reaction, "reaction_id", "count_reaction")

    df_count = (
        df_nb_articles
        .join(df_nb_reaction, on="team_name", how="left")
        # The left join leaves count_reaction null for teams without any
        # reaction; null + n is null, so default the count to 0 first.
        .with_columns(pl.col("count_reaction").fill_null(0))
        .with_columns(
            (pl.col("count_article") + pl.col("count_reaction")).alias("count")
        )
    )

    return df_count

In [43]:
# Load every dataframe once; the following cells look them up by name in this mapping.
dataframes = load_data()

Successfully read blob from gold/epl_news/dim_date.parquet
Successfully read blob from gold/epl_news/article.parquet
Successfully read blob from gold/epl_news/dim_team.parquet
Successfully read blob from gold/epl_news/reaction.parquet
Successfully read blob from gold/epl_news/dim_sentiment.parquet
Successfully read blob from gold/epl_news/fact_reaction.parquet
Successfully read blob from gold/epl_news/fact_title.parquet
Data loaded successfully!


In [44]:
# Dimension tables
df_date = dataframes['df_date']
df_team = dataframes['df_team']
df_sentiment = dataframes['df_sentiment']

# Article table
df_article = dataframes['df_article']

# Fact tables
df_fact_title = dataframes['df_fact_title']
df_fact_reaction = dataframes['df_fact_reaction']

### Clean the dataframes

In [45]:
# Keep only rows that have an article_id, then drop exact duplicate rows.
# NOTE(review): this rebinds df_fact_title to dataframes['df_article'], not
# dataframes['df_fact_title'] - confirm that the article table is the intended source.
df_fact_title = (
    dataframes['df_article']
    .filter(pl.col("article_id").is_not_null())
    .unique()
)
df_fact_title.head()

article_id,fk_team_id,article_title,published_at
str,i64,str,date
"""be406f7ac21ceebc""",4,"""Frank on injury difficulties, …",2024-09-20
"""d55846dad4602adc""",9,"""Benfica 0-1 Fulham - fans' ver…",2024-08-05
"""05c7661423faa4a6""",19,"""Hammers scour out-of-contract …",2024-09-04
"""fea18ed8677643bb""",13,"""Will Guardiola take my advice?""",2024-10-09
"""33184f575985025a""",14,"""Evans and Maguire 'United's be…",2024-10-07


In [46]:
# Keep only the reactions that are attached to a team.
has_team = pl.col("fk_team_id").is_not_null()
df_fact_reaction = df_fact_reaction.filter(has_team)
df_fact_reaction.head()

reaction_id,fk_article_id,fk_sentiment_id,fk_team_id,fk_date_id,content,sentiment_score,subjectivity_score,is_subjective,is_fan
str,str,i64,i64,date,str,f64,f64,str,bool
"""aa2f26130f904f8f_fan_1""","""aa2f26130f904f8f""",1,12,2024-10-07,"""For the second game in a row w…",-0.007576,0.487879,"""objective""",True
"""aa2f26130f904f8f_fan_2""","""aa2f26130f904f8f""",3,12,2024-10-07,"""Liverpool were nothing special…",0.464286,0.505357,"""subjective""",True
"""aa2f26130f904f8f_fan_3""","""aa2f26130f904f8f""",1,12,2024-10-07,"""So frustrating. Woeful in the …",0.187778,0.447778,"""objective""",True
"""aa2f26130f904f8f_fan_4""","""aa2f26130f904f8f""",1,12,2024-10-07,"""Tough start to the season for …",0.059503,0.491466,"""objective""",True
"""aa2f26130f904f8f_fan_5""","""aa2f26130f904f8f""",1,12,2024-10-07,"""Hard-fought win and made harde…",0.198333,0.32,"""objective""",True


In [47]:
# Show the sentiment lookup table that is joined on sentiment_id further down.
df_sentiment

sentiment_label,sentiment_id,sentiment_value
str,i64,i64
"""negative""",0,-1
"""neutral""",1,0
"""positive""",3,1


### Teams bringing the most subjectivity (mean subjectivity score per team)

In [48]:
# Mean subjectivity score of the reactions of each team, labelled with the
# team's columns from df_team.
subjectivity_per_team = (
    df_fact_reaction
    .group_by("fk_team_id")
    .agg(pl.col("subjectivity_score").mean().alias("subjectivity_mean"))
    .join(df_team, left_on="fk_team_id", right_on="team_id", how="inner")
)

subjectivity_per_team.head()

fk_team_id,subjectivity_mean,team_name
i64,f64,str
1,0.454066,"""AFC Bournemouth"""
2,0.430597,"""Arsenal"""
3,0.450268,"""Aston Villa"""
4,0.379566,"""Brentford"""
5,0.441804,"""Brighton & Hove Albion"""


### Positive, neutral and negative reactions for each team

In [49]:
#### [team, n_pos, n_neg, n_neu, n, pos_percentage, neg_percentage, neutral_percentage, n_overall_feeling, overall_feeling]

# Labels that the share computation below relies on.
SENTIMENT_LABELS = ("positive", "neutral", "negative")

# Distinct reactions per (team, sentiment), labelled with the sentiment and team names.
reaction_counts = (
    df_fact_reaction
    .group_by('fk_team_id', 'fk_sentiment_id')
    .agg(pl.col("reaction_id").n_unique().alias("count_reaction"))
    .join(df_sentiment, left_on="fk_sentiment_id", right_on="sentiment_id", how="left")
    .join(df_team, left_on="fk_team_id", right_on="team_id", how="inner")
    .drop(["fk_team_id", "fk_sentiment_id", "sentiment_value"])
)

# One row per team, one column per sentiment label; absent combinations count as 0.
sentiment_by_team = (
    reaction_counts
    .pivot("sentiment_label", index="team_name", values="count_reaction")
    .fill_null(0)
    .sort("team_name")
)

# pivot only creates columns for labels that occur in the data; add any
# missing one as an all-zero column so the sums below cannot fail on it.
missing_labels = [
    label for label in SENTIMENT_LABELS if label not in sentiment_by_team.columns
]
if missing_labels:
    sentiment_by_team = sentiment_by_team.with_columns(
        [pl.lit(0, dtype=pl.UInt32).alias(label) for label in missing_labels]
    )

# Total reactions per team, then the share of each sentiment.
result_1 = (
    sentiment_by_team
    .with_columns(
        (pl.col("positive") + pl.col("neutral") + pl.col("negative")).alias("total_count")
    )
    .with_columns(
        (pl.col("positive") / pl.col("total_count")).alias("p_positive"),
        (pl.col("neutral") / pl.col("total_count")).alias("p_neutral"),
        (pl.col("negative") / pl.col("total_count")).alias("p_negative"),
    )
)

result_1.head()

team_name,positive,neutral,negative,total_count,p_positive,p_neutral,p_negative
str,u32,u32,u32,u32,f64,f64,f64
"""AFC Bournemouth""",3,7,0,10,0.3,0.7,0.0
"""Arsenal""",2,7,0,9,0.222222,0.777778,0.0
"""Aston Villa""",4,1,0,5,0.8,0.2,0.0
"""Brentford""",2,6,0,8,0.25,0.75,0.0
"""Brighton & Hove Albion""",4,5,0,9,0.444444,0.555556,0.0


### Sum of positive, negative and neutral reactions

In [53]:
# Overall number of positive, negative and neutral reactions, summed over
# the per-team counts held in result_1.
result_2 = result_1.select(
    pl.col("positive").sum().alias("total_positive"),
    pl.col("negative").sum().alias("total_negative"),
    pl.col("neutral").sum().alias("total_neutral"),
)

result_2

total_positive,total_negative,total_neutral
u32,u32,u32
79,2,118


### Mean sentiment score of each team over the season

In [116]:
# Mean sentiment score of each team's reactions, labelled with the team's
# columns from df_team (the numeric team key is dropped afterwards).
result_2 = (
    df_fact_reaction
    .group_by("fk_team_id")
    .agg(pl.col("sentiment_score").mean().alias("mean_overal_feeling"))
    .join(df_team, left_on="fk_team_id", right_on="team_id", how="inner")
    .drop(["fk_team_id"])
)

result_2.head()

mean_overal_feeling,team_name
f64,str
0.129788,"""AFC Bournemouth"""
0.152835,"""Arsenal"""
0.272458,"""Aston Villa"""
0.146895,"""Brentford"""
0.156211,"""Brighton & Hove Albion"""


### Evolution of sentiment_score through the season

In [145]:
# Sentiment score summed per team and date, ordered by team and then date.
result_1 = (
    df_fact_reaction
    .group_by(["fk_team_id", "fk_date_id"])
    .agg(pl.col("sentiment_score").sum().alias("sentiment_score"))
    .sort(["fk_team_id", "fk_date_id"])
)

# Running total of the summed score within each team, in the row order above.
running_score = pl.col("sentiment_score").cum_sum().over("fk_team_id")

result_2 = result_1.select(
    "fk_date_id",
    "fk_team_id",
    "sentiment_score",
    running_score.alias("Value"),
)

result_2.head()


fk_date_id,fk_team_id,sentiment_score,Value
date,i64,f64,f64
2024-08-09,1,0.0,0.0
2024-08-15,1,0.385714,0.385714
2024-08-27,1,-0.004762,0.380952
2024-08-31,1,0.1,0.480952
2024-09-11,1,0.204937,0.68589


#### Filters

In [151]:
# Team names available for filtering (the team_name column of df_team).
teams_name_list = df_team.get_column("team_name")

In [152]:
# Sentiment labels available for filtering (the sentiment_label column of df_sentiment).
sentiments_list = df_sentiment.get_column("sentiment_label")