In [1]:
from pyspark.sql import SparkSession

def calculate_red_violations(data_source, output_uri):
    """
    Finds the 10 establishment names with the most Red violations and writes them out as CSV.

    The CSV at ``data_source`` is read with its first row treated as a header and registered
    as the temporary view ``restaurant_violations``. Rows whose ``violation_type`` equals
    ``'RED'`` are counted per ``name``; the 10 highest counts are written to ``output_uri``
    with a header row, overwriting any output already there.

    The query applies no date filter, so the period covered is whatever the input file contains.

    :param data_source: The URI of your food establishment data CSV, such as 's3://amzn-s3-demo-bucket/food-establishment-data.csv'.
    :param output_uri: The URI where output is written, such as 's3://amzn-s3-demo-bucket/restaurant_violation_results'.
    :raises ValueError: If ``data_source`` or ``output_uri`` is None.
    """
    # Validate before touching Spark: previously a None data_source skipped the read and
    # the function failed later on an unbound ``restaurants_df``.
    if data_source is None:
        raise ValueError("data_source must be the URI of the CSV to read, got None")
    if output_uri is None:
        raise ValueError("output_uri must be the URI to write results to, got None")

    # NOTE: used as a context manager, the session is stopped when the block exits.
    with SparkSession.builder.appName("Calculate Red Health Violations").getOrCreate() as spark:
        # Load the restaurant violation CSV data
        restaurants_df = spark.read.option("header", "true").csv(data_source)

        # Register the data as a temporary view so it can be queried with SQL
        restaurants_df.createOrReplaceTempView("restaurant_violations")

        # Create a DataFrame of the top 10 restaurants with the most Red violations
        top_red_violation_restaurants = spark.sql("""SELECT name, count(*) AS total_red_violations 
          FROM restaurant_violations 
          WHERE violation_type = 'RED' 
          GROUP BY name 
          ORDER BY total_red_violations DESC LIMIT 10""")

        # Write the results to the specified output URI
        top_red_violation_restaurants.write.option("header", "true").mode("overwrite").csv(output_uri)

# Input CSV and output location for this run
data_source = 's3://emr-edb016-machicao/food_establishment_data.csv'
output_uri = 's3://emr-edb016-machicao/logs'

# Run the job with the locations above
calculate_red_violations(data_source=data_source, output_uri=output_uri)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1748727286175_0003,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [1]:
from pyspark.sql import SparkSession

def calculate_red_violations(data_source):
    """
    Process restaurant inspection data and show the 10 names with the most RED violations.

    :param data_source: URI of the CSV with the establishment data, such as 's3://...'
    :return: The DataFrame holding the result of the top-10 query.
    """
    # Counts rows per name where violation_type is 'RED' and keeps the 10 largest counts
    top_10_sql = """
        SELECT name, count(*) AS total_red_violations 
        FROM restaurant_violations 
        WHERE violation_type = 'RED' 
        GROUP BY name 
        ORDER BY total_red_violations DESC 
        LIMIT 10
    """

    # Create a SparkSession, or get the existing one
    session = SparkSession.builder.appName("Calculate Red Health Violations").getOrCreate()

    # Read the CSV (first row is the header) from S3 or another path
    csv_reader = session.read.option("header", "true")
    inspections = csv_reader.csv(data_source)

    # Register a temporary view so the data can be queried with SQL
    inspections.createOrReplaceTempView("restaurant_violations")

    ranking = session.sql(top_10_sql)

    # Print the result in the notebook without truncating column values
    # (display(...) is an alternative in EMR Studio)
    ranking.show(truncate=False)

    # Hand the DataFrame back so the caller can save or reuse it
    return ranking


# Point this at your CSV in S3 or HDFS
data_source = 's3://emr-edb016-machicao/food_establishment_data.csv'

# Run the function and keep the value it returns
result_df = calculate_red_violations(data_source=data_source)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2,application_1748727286175_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------------+--------------------+
|name                  |total_red_violations|
+----------------------+--------------------+
|SUBWAY                |322                 |
|T-MOBILE PARK         |315                 |
|WHOLE FOODS MARKET    |299                 |
|PCC COMMUNITY MARKETS |251                 |
|TACO TIME             |240                 |
|MCDONALD'S            |177                 |
|THAI GINGER           |153                 |
|SAFEWAY INC #1508     |143                 |
|TAQUERIA EL RINCONSITO|134                 |
|HIMITSU TERIYAKI      |128                 |
+----------------------+--------------------+