In [111]:

import logging
import os
import time
from io import BytesIO

import pandas as pd
from azure.storage.blob import ContainerClient

In [112]:
def download_parquet(dataframes, container_client, blob_name):
    """Read one parquet blob into a DataFrame and append it to ``dataframes``.

    NOTE: a later cell in this notebook defines another ``download_parquet``
    that takes only ``(container_client, blob_name)``; whichever cell ran last
    is the definition in effect.

    Args:
        dataframes: List that receives the new DataFrame (mutated in place).
        container_client: Object exposing ``get_blob_client(blob=...)``.
        blob_name: Name of the blob to download.

    Returns:
        The same ``dataframes`` list that was passed in, one element longer.
    """
    buffer = BytesIO()
    # Drain the whole blob into memory, then let pandas/pyarrow parse it.
    container_client.get_blob_client(blob=blob_name).download_blob().readinto(buffer)
    dataframes.append(pd.read_parquet(buffer, engine='pyarrow'))
    return dataframes


def extract_dataset_bis(con_str=None):
    """Load the four EPL datasets from the ``gold`` blob container.

    Every ``.parquet`` blob found under ``epl/<dataset>/`` is downloaded and
    the frames of each dataset are concatenated (with a fresh index).

    Args:
        con_str: Azure Storage connection string. When omitted, the value of
            the ``AZURE_STORAGE_CONNECTION_STRING`` environment variable is used.

    Returns:
        tuple: ``(team_games, fixtures, game_statistics, poisson_probabilities)``,
        one DataFrame per dataset.

    Raises:
        RuntimeError: No connection string was passed and the environment
            variable is unset or empty.
        ValueError: At least one dataset has no parquet blob to read.
    """
    # NOTE(review): an account key used to be hard-coded in this function;
    # treat that key as compromised and rotate it.
    if con_str is None:
        con_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
    if not con_str:
        raise RuntimeError(
            "No Azure Storage connection string: pass con_str or set "
            "AZURE_STORAGE_CONNECTION_STRING."
        )

    # Container and folder name. Blob names carry no leading slash (the
    # filters below test for 'epl/...'), so the listing prefix must not either.
    container_name = 'gold'
    folder_name = 'epl/'

    container_client = ContainerClient.from_connection_string(
        conn_str=con_str, container_name=container_name
    )

    def _read_parquet_blob(blob_name):
        # Kept local on purpose: the notebook defines ``download_parquet``
        # twice with different signatures, so relying on the module-level name
        # breaks depending on which cell ran last.
        buffer = BytesIO()
        container_client.get_blob_client(blob=blob_name).download_blob().readinto(buffer)
        return pd.read_parquet(buffer, engine='pyarrow')

    # Order here is the order of the returned tuple.
    dataset_names = ('team_games', 'fixtures', 'game_statistics', 'poisson_probabilities')
    frames = {name: [] for name in dataset_names}

    # List and read parquet files
    for blob in container_client.list_blobs(name_starts_with=folder_name):
        if not blob.name.endswith('.parquet'):
            continue
        for name in dataset_names:
            if blob.name.startswith(f'{folder_name}{name}/'):
                frames[name].append(_read_parquet_blob(blob.name))
                break

    # pd.concat([]) fails with an unhelpful message; say which dataset is empty.
    missing = [name for name in dataset_names if not frames[name]]
    if missing:
        raise ValueError(
            f"No parquet blobs found under '{folder_name}' in container "
            f"'{container_name}' for: {', '.join(missing)}"
        )

    # Concatenate all dataframes
    return tuple(pd.concat(frames[name], ignore_index=True) for name in dataset_names)


#team_games, fixtures, game_statistics, poisson_probabilities = extract_dataset_bis()

In [None]:
import streamlit as st
from sqlalchemy.engine import URL
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
from sqlalchemy.exc import SQLAlchemyError
import sqlalchemy as sa
def extract_dataset():
    """Load the four EPL tables from the SQL Server database into DataFrames.

    Connection settings are read from ``st.secrets`` (keys ``SERVER``,
    ``DATABASE``, ``USERNAME``, ``PASSWORD`` and ``DRIVER``) and assembled into
    an ODBC connection string passed through ``mssql+pyodbc``.

    Returns:
        tuple: ``(game_statistics, team_games, fixtures, poisson_probabilities)``.
        NOTE: this order differs from ``extract_dataset_bis``, which returns
        ``team_games`` first; it is kept as-is so existing callers keep working.
    """
    server = st.secrets["SERVER"]
    database = st.secrets["DATABASE"]
    username = st.secrets["USERNAME"]
    password = st.secrets["PASSWORD"]
    driver = st.secrets["DRIVER"]

    connection_string = (
        f"Driver={driver};Server=tcp:{server};Database={database};"
        f"Uid={username};Pwd={password};Encrypt=yes;"
    )
    connection_url = URL.create(
        "mssql+pyodbc", query={"odbc_connect": connection_string})

    engine = create_engine(connection_url, poolclass=NullPool)

    # Tables are read in this order; the return statement picks its own order.
    # NOTE(review): the query used to target ``dbo.fixtues``, which looks like
    # a misspelling of the fixtures table -- confirm the table name in the DB.
    read_order = ("team_games", "game_statistics", "fixtures", "poisson_probabilities")

    # The context manager closes the connection on exit, so no explicit
    # conn.close() is needed inside the block.
    with engine.begin() as conn:
        frames = {
            table: pd.read_sql_query(sa.text(f"SELECT * FROM dbo.{table}"), conn)
            for table in read_order
        }

    return (
        frames["game_statistics"],
        frames["team_games"],
        frames["fixtures"],
        frames["poisson_probabilities"],
    )

In [113]:
def download_parquet(container_client, blob_name):
    """Download a single parquet blob and return its contents as a DataFrame.

    Args:
        container_client: Object exposing ``get_blob_client(blob=...)``.
        blob_name: Name of the blob to download.

    Returns:
        The DataFrame parsed from the blob with the pyarrow engine.
    """
    buffer = BytesIO()
    # Drain the whole blob into memory, then let pandas/pyarrow parse it.
    container_client.get_blob_client(blob=blob_name).download_blob().readinto(buffer)
    return pd.read_parquet(buffer, engine='pyarrow')


def extract_dataset_bis_bis(con_str=None):
    """Load the four EPL datasets from the ``gold`` blob container as a dict.

    Every ``.parquet`` blob whose name starts with one of the dataset prefixes
    is downloaded with ``download_parquet``; a blob that fails to download is
    logged and skipped. The frames of each dataset are then concatenated with
    a fresh index.

    Args:
        con_str: Azure Storage connection string. When omitted, the value of
            the ``AZURE_STORAGE_CONNECTION_STRING`` environment variable is used.

    Returns:
        dict: Maps ``'epl/team_games'``, ``'epl/fixtures'``,
        ``'epl/game_statistics'`` and ``'epl/poisson_probabilities'`` (in that
        insertion order) to one DataFrame each.

    Raises:
        RuntimeError: No connection string was passed and the environment
            variable is unset or empty.
        ValueError: At least one dataset ended up with no frame to concatenate.
    """
    # NOTE(review): an account key used to be hard-coded in this function;
    # treat that key as compromised and rotate it.
    if con_str is None:
        con_str = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
    if not con_str:
        raise RuntimeError(
            "No Azure Storage connection string: pass con_str or set "
            "AZURE_STORAGE_CONNECTION_STRING."
        )

    # The notebook never defines a module-level ``logger``; create one here so
    # the error path below logs instead of raising NameError.
    logger = logging.getLogger(__name__)

    # Container and folder name. The dataset prefixes below carry no leading
    # slash, so the listing prefix must not either.
    container_name = 'gold'
    folder_name = 'epl/'

    # Create a blob client
    container_client = ContainerClient.from_connection_string(
        conn_str=con_str, container_name=container_name
    )

    # Datasets: prefix -> frames collected so far
    datasets = {
        'epl/team_games': [],
        'epl/fixtures': [],
        'epl/game_statistics': [],
        'epl/poisson_probabilities': []
    }

    # List and read parquet files
    for blob in container_client.list_blobs(name_starts_with=folder_name):
        if not blob.name.endswith('.parquet'):
            continue
        for dataset, frames in datasets.items():
            if blob.name.startswith(dataset):
                try:
                    frames.append(download_parquet(container_client, blob.name))
                except Exception as e:
                    # Best effort: one unreadable blob must not abort the load.
                    logger.error(f'Error downloading {blob.name}: {e}')
                # No prefix is a prefix of another, so at most one can match.
                break

    # pd.concat([]) fails with an unhelpful message; say which dataset is empty.
    missing = [dataset for dataset, frames in datasets.items() if not frames]
    if missing:
        raise ValueError(
            f"No parquet data loaded from container '{container_name}' "
            f"for: {', '.join(missing)}"
        )

    # Concatenate all dataframes
    return {
        dataset: pd.concat(frames, ignore_index=True)
        for dataset, frames in datasets.items()
    }

In [117]:
# Look each dataset up by its key instead of unpacking ``.values()``
# positionally: a change in the dict's key order would otherwise silently
# bind the wrong DataFrame to a name, whereas a missing key fails loudly.
_epl_datasets = extract_dataset_bis_bis()
team_games = _epl_datasets['epl/team_games']
fixtures = _epl_datasets['epl/fixtures']
game_statistics = _epl_datasets['epl/game_statistics']
poisson_probabilities = _epl_datasets['epl/poisson_probabilities']