In [None]:
# Instalar PySpark en Google Colab
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
import pandas as pd

In [None]:
# FRED export URL for the CPIAUCSL series (monthly, 1947-01-01 through 2024-07-01 per its query string)
url = "https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23e1e9f0&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=on&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=CPIAUCSL&scale=left&cosd=1947-01-01&coed=2024-07-01&line_color=%234572a7&link_values=false&line_style=solid&mark_type=none&mw=3&lw=2&ost=-99999&oet=99999&mma=0&fml=a&fq=Monthly&fam=avg&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2024-08-31&revision_date=2024-08-31&nd=1947-01-01"

# Let pandas fetch the CSV over HTTP, then write a local copy
# (without the pandas index column) for Spark to read from disk
cpi_data = pd.read_csv(url)
cpi_data.to_csv("CPIAUCSL.csv", index=False)


Acá empieza la parte de SQL/PySpark

In [None]:
# Start (or reuse) a Spark session named "SQL Basics"
spark = (
    SparkSession.builder
    .appName("SQL Basics")
    .getOrCreate()
)

In [None]:
# Read the locally saved CSV into a Spark DataFrame: the first line is treated
# as the header and column types are inferred from the data
cpi_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("CPIAUCSL.csv")
)

In [None]:
# Print the DataFrame's schema (column names and their types)
cpi_df.printSchema()

In [None]:
# Preview the first five rows of the dataset
cpi_df.show(n=5)

In [None]:
# Register the DataFrame as a temporary SQL view named "cpi_data" so it can be queried through spark.sql
cpi_df.createOrReplaceTempView("cpi_data")

In [None]:
# Basic dataset information: total number of rows.
# The result column is given a descriptive alias so the printed table is self-explanatory.
spark.sql("SELECT COUNT(*) AS total_rows FROM cpi_data").show()

In [None]:
# Descriptive statistics (mean, max, min, median) of the consumer price index column (CPIAUCSL).
# All four aliases follow the same <stat>_cpi naming so the output header is consistent.
spark.sql("""
    SELECT
        AVG(CPIAUCSL)    AS avg_cpi,
        MAX(CPIAUCSL)    AS max_cpi,
        MIN(CPIAUCSL)    AS min_cpi,
        MEDIAN(CPIAUCSL) AS median_cpi
    FROM cpi_data
""").show()

In [None]:
# Show CPI observations dated 2020-01-01 or later.
# NOTE(review): the previous comment said "after the year 2000" while the filter
# is 2020-01-01; the filter is kept unchanged — confirm which one was intended.
# ORDER BY makes the rows printed by show() deterministic (earliest dates first).
spark.sql("""
    SELECT *
    FROM cpi_data
    WHERE DATE >= '2020-01-01'
    ORDER BY DATE
""").show()

In [None]:
# Average CPI per year; the year is taken as the first four characters of DATE
yearly_avg_sql = """
    SELECT
        SUBSTRING(DATE, 1, 4) AS Year,
        AVG(CPIAUCSL) AS average_cpi
    FROM cpi_data
    GROUP BY Year
    ORDER BY Year
"""
spark.sql(yearly_avg_sql).show()


In [None]:
# Average CPI per decade; the decade label is the first three characters of
# DATE followed by '0s'
decade_avg_sql = """
    SELECT
        CONCAT(SUBSTRING(DATE, 1, 3), '0s') AS Decade,
        AVG(CPIAUCSL) AS average_cpi
    FROM cpi_data
    GROUP BY Decade
    ORDER BY Decade
"""
spark.sql(decade_avg_sql).show()

In [None]:
# Relative change in CPI versus the previous row (rows ordered by DATE):
# (current - previous) / previous. The value is a fraction, i.e. it is not
# multiplied by 100.
monthly_change_sql = """
    SELECT
        DATE,
        CPIAUCSL,
        (CPIAUCSL - LAG(CPIAUCSL, 1) OVER (ORDER BY DATE)) / LAG(CPIAUCSL, 1) OVER (ORDER BY DATE) AS monthly_change
    FROM cpi_data
    ORDER BY DATE
"""
spark.sql(monthly_change_sql).show()


In [None]:
# Dates on which the CPI equals its overall maximum or its overall minimum
extremes_sql = """
    SELECT DATE, CPIAUCSL
    FROM cpi_data
    WHERE CPIAUCSL = (SELECT MAX(CPIAUCSL) FROM cpi_data)
       OR CPIAUCSL = (SELECT MIN(CPIAUCSL) FROM cpi_data)
"""
spark.sql(extremes_sql).show()


In [None]:
# 12-row rolling average of the CPI: each value averages the current row and up
# to 11 preceding rows (ordered by DATE), so the earliest rows average fewer
# than 12 observations.
# The outer ORDER BY makes the rows printed by show() deterministic; the window's
# own ORDER BY only defines the frame, not the order of the result set.
spark.sql("""
    SELECT
        DATE,
        CPIAUCSL,
        AVG(CPIAUCSL) OVER (ORDER BY DATE ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS moving_avg
    FROM cpi_data
    ORDER BY DATE
""").show()


In [None]:
# Simple index relative to a base year (here 2000): the base value is the
# average CPI over rows whose DATE starts with '2000', and every observation is
# expressed as CPIAUCSL / base * 100.
# ORDER BY makes the rows printed by show() deterministic (earliest dates first).
spark.sql("""
    WITH base_year AS (
        SELECT AVG(CPIAUCSL) AS base_cpi
        FROM cpi_data
        WHERE DATE LIKE '2000%'
    )
    SELECT
        DATE,
        CPIAUCSL,
        CPIAUCSL / (SELECT base_cpi FROM base_year) * 100 AS cpi_index
    FROM cpi_data
    ORDER BY DATE
""").show()
