## DIAPOSITIVAS
https://docs.google.com/presentation/d/1Uu6CkgPyi6ka7Ojm42cI1qWBEI3b_zrQ/edit?usp=share_link&ouid=108221443273434880064&rtpof=true&sd=true

In [0]:
# Sample employee rows: (firstname, lastname, id, gender, salary).
# Some values are None and some last names carry leading/trailing spaces;
# later cells use this data to demonstrate dropna() and trim().
datos = [
    (None, "Smith   ", "36636", "M", 3500),
    ("Michael", "   Rose", "40288", "M", 4750),
    ("Robert", "Williams", "42114", "M", None),
    ("Maria", "    Jones    ", "39192", "F", 4000),
]
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the employee rows, built field by field.
# Each add() call gives: column name, Spark data type, and whether nulls are allowed.
esquema = (
    StructType()
    .add('firstname', StringType(), nullable=True)
    .add('lastname', StringType(), nullable=False)
    .add('id', StringType(), nullable=False)
    .add('gender', StringType(), nullable=True)
    .add('salary', IntegerType(), nullable=True)
)


In [0]:
# Build the DataFrame from the in-memory rows, applying the explicit schema.
df = spark.createDataFrame(datos, schema=esquema)

# Print the schema of the DataFrame.
df.printSchema()

# Display the contents. show() accepts the number of rows to print and whether
# long values are cut off on screen, e.g. show(10, truncate=False).
df.show(truncate=False)


root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = false)
 |-- id: string (nullable = false)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+-------------+-----+------+------+
|firstname|lastname     |id   |gender|salary|
+---------+-------------+-----+------+------+
|null     |Smith        |36636|M     |3500  |
|Michael  |   Rose      |40288|M     |4750  |
|Robert   |Williams     |42114|M     |null  |
|Maria    |    Jones    |39192|F     |4000  |
+---------+-------------+-----+------+------+



In [0]:
from pyspark.sql import Row

# A Row built from positional values; individual fields can be read by index.
empleado_1 = Row("James", "Smith", "36636", "M", 3500)

print('Este empleado se llama: ', empleado_1[0])

# A list of Row objects.
# NOTE: this rebinds `datos`, replacing the list of plain tuples defined earlier.
datos = [
    Row(*campos)
    for campos in (
        ("James", "Smith", "36636", "M", 3500),
        ("Michael", "Rose", "40288", "M", 4750),
        ("Robert", "Williams", "42114", "M", 4200),
        ("Maria", "Jones", "39192", "F", 4000),
    )
]



Este empleado se llama:  James


In [0]:
# Upload sales.csv first (File menu -> Upload data...), then copy the path
# given in "Spark API format".
# In this case -> dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales.csv
file = 'dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales.csv'

# header: the first line of the file holds the column names.
# inferSchema: let Spark work out the column types from the data.
sales_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema=True)
    .load(file)
)

sales_df.printSchema()
sales_df.show(5)


root
 |-- Order_ID: integer (nullable = true)
 |-- Order_Date: date (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Units_Sold: integer (nullable = true)
 |-- Unit_Price: double (nullable = true)
 |-- Region: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Ship_Date: date (nullable = true)

+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+
| Order_ID|Order_Date|    Item_Type|Units_Sold|Unit_Price|              Region|             Country| Ship_Date|
+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+
|535113847|2014-10-08|       Snacks|       934|    152.58|Middle East and N...|          Azerbaijan|2014-10-23|
|874708545|2015-02-22|    Cosmetics|      4551|     437.2|Central America a...|              Panama|2015-02-27|
|854349935|2015-12-09|       Fruits|      9986|      9.33|  Sub-Saharan Africa|Sao Tome and Prin...|2016-01-18|

In [0]:
# Upload persons.json first (File menu -> Upload data...), then copy the path
# given in "Spark API format".
# In this case -> dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/persons.json

# StructType and StructField are imported here too, so this cell no longer
# relies on an import made by an earlier cell.
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DateType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the JSON records; every field is declared nullable.
persons_schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('first_name', StringType(), True),
    StructField('last_name', StringType(), True),
    StructField('fav_movies', ArrayType(StringType()), True),  # array of strings
    StructField('salary', FloatType(), True),
    StructField('image_url', StringType(), True),
    StructField('date_of_birth', DateType(), True),
    StructField('active', BooleanType(), True)
])

file = 'dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/persons.json'

# multiline: allow a JSON record to span several lines of the file
# (by default Spark expects one JSON object per line).
# schema(): apply the schema above instead of inferring one.
persons_df = (
    spark.read.format('json')
    .option('multiline', True)
    .schema(persons_schema)
    .load(file)
)
persons_df.show()

+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
| id|first_name|  last_name|          fav_movies| salary|           image_url|date_of_birth|active|
+---+----------+-----------+--------------------+-------+--------------------+-------------+------+
|  1|     Drucy|      Poppy|  [I giorni contati]|1463.36|http://dummyimage...|   1991-02-16|  true|
|  2|   Emelyne|      Blaza|[Musketeer, The, ...|3006.04|http://dummyimage...|   1991-11-02| false|
|  3|       Max|     Rettie|[The Forgotten Sp...|1422.88|http://dummyimage...|   1990-03-03| false|
|  4|    Ilario|       Kean|[Up Close and Per...|3561.36|http://dummyimage...|   1987-06-09|  true|
|  5|     Toddy|     Drexel|[Walk in the Clou...|4934.87|http://dummyimage...|   1992-10-28|  true|
|  6|    Oswald|   Petrolli|[Wing and the Thi...|1153.23|http://dummyimage...|   1986-09-02| false|
|  7|    Adrian|     Clarey|[Walking Tall, Pa...|1044.73|http://dummyimage...|   1971-08-24| false|


In [0]:
# select: returns a DataFrame with the given columns and/or expressions.
# Column names can be written directly as plain strings.
# NOTE: select is lazily evaluated; an action such as show() is needed to see rows.
(
    sales_df
    .select('Order_ID', 'Item_Type', 'Units_Sold', 'Unit_Price', 'Country')
    .show(10, truncate=False)
)


+---------+-------------+----------+----------+---------------------+
|Order_ID |Item_Type    |Units_Sold|Unit_Price|Country              |
+---------+-------------+----------+----------+---------------------+
|535113847|Snacks       |934       |152.58    |Azerbaijan           |
|874708545|Cosmetics    |4551      |437.2     |Panama               |
|854349935|Fruits       |9986      |9.33      |Sao Tome and Principe|
|892836844|Personal Care|9118      |81.73     |Sao Tome and Principe|
|129280602|Household    |5858      |668.27    |Belize               |
|473105037|Clothes      |1149      |109.28    |Denmark              |
|754046475|Cosmetics    |7964      |437.2     |Germany              |
|772153747|Fruits       |6307      |9.33      |Turkey               |
|847788178|Snacks       |8217      |152.58    |United Kingdom       |
|471623599|Cosmetics    |2758      |437.2     |Kazakhstan           |
+---------+-------------+----------+----------+---------------------+
only showing top 10 

In [0]:
# For more complex expressions use col(column) and expr(expression);
# the "as" inside the expr() string names the resulting column.
from pyspark.sql.functions import col, expr

(
    sales_df
    .select(
        col('Order_ID'),
        col('Item_Type'),
        expr("Units_Sold * Unit_Price as TOTAL_PRICE"),
    )
    .show(10)
)


+---------+-------------+------------------+
| Order_ID|    Item_Type|       TOTAL_PRICE|
+---------+-------------+------------------+
|535113847|       Snacks|         142509.72|
|874708545|    Cosmetics|         1989697.2|
|854349935|       Fruits|          93169.38|
|892836844|Personal Care|         745214.14|
|129280602|    Household|3914725.6599999997|
|473105037|      Clothes|         125562.72|
|754046475|    Cosmetics|         3481860.8|
|772153747|       Fruits|          58844.31|
|847788178|       Snacks|        1253749.86|
|471623599|    Cosmetics|1205797.5999999999|
+---------+-------------+------------------+
only showing top 10 rows



In [0]:
# Same query as the previous cell, written with Column objects only: the product is
# computed on col(...) values and named with alias() instead of an expr() string.
# NOTE(review): the original remark here said plain strings and col()/expr() cannot
# be combined in one select. PySpark's select() accepts str and Column arguments in
# the same call, so that restriction does not hold — confirm on the version in use.
sales_df.select(col('Order_ID'), col('Item_Type'), (col('Units_Sold') * col('Unit_Price')).alias('TOTAL_PRICE')).show(10)

+---------+-------------+------------------+
| Order_ID|    Item_Type|       TOTAL_PRICE|
+---------+-------------+------------------+
|535113847|       Snacks|         142509.72|
|874708545|    Cosmetics|         1989697.2|
|854349935|       Fruits|          93169.38|
|892836844|Personal Care|         745214.14|
|129280602|    Household|3914725.6599999997|
|473105037|      Clothes|         125562.72|
|754046475|    Cosmetics|         3481860.8|
|772153747|       Fruits|          58844.31|
|847788178|       Snacks|        1253749.86|
|471623599|    Cosmetics|1205797.5999999999|
+---------+-------------+------------------+
only showing top 10 rows



In [0]:
# filter/where return a DataFrame with the rows that satisfy the condition;
# both give the same result (where exists for its resemblance to SQL).
# Boolean operators on columns:
#   "&" is AND, "|" is OR (AltGr+1), "~" is NOT (AltGr+4 then Space).
# Each comparison is wrapped in parentheses before being combined with "&".
(
    sales_df
    .filter((col('Region') == 'Europe') & (col('Country') == 'Spain'))
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        expr("Units_Sold * Unit_Price as TOTAL_PRICE"),
    )
    .show(5)
)


+---------+-------+-------------+------------------+
| Order_ID|Country|    Item_Type|       TOTAL_PRICE|
+---------+-------+-------------+------------------+
|860891091|  Spain|Personal Care|462591.80000000005|
|413236844|  Spain|    Household|         3221061.4|
|621470248|  Spain|      Clothes|         624207.36|
|337587821|  Spain|   Vegetables|        1323837.58|
|420354354|  Spain|       Snacks|         160056.42|
+---------+-------+-------------+------------------+
only showing top 5 rows



In [0]:
# Chaining two where/filter calls with dot notation has the same effect as
# combining both conditions with the "&" operator.
(
    sales_df
    .where(col('Region') == 'Europe')
    .where(col('Country') == 'Spain')
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        expr("Units_Sold * Unit_Price as TOTAL_PRICE"),
    )
    .show(5)
)


+---------+-------+-------------+------------------+
| Order_ID|Country|    Item_Type|       TOTAL_PRICE|
+---------+-------+-------------+------------------+
|860891091|  Spain|Personal Care|462591.80000000005|
|413236844|  Spain|    Household|         3221061.4|
|621470248|  Spain|      Clothes|         624207.36|
|337587821|  Spain|   Vegetables|        1323837.58|
|420354354|  Spain|       Snacks|         160056.42|
+---------+-------+-------------+------------------+
only showing top 5 rows



In [0]:
# orderBy returns a DataFrame sorted by the given columns.
# asc() (the default) and desc() set the sort direction.
(
    sales_df
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        col('Units_Sold'),
    )
    .orderBy(col('Units_Sold').desc())
    .show(10)
)


+---------+--------------------+---------------+----------+
| Order_ID|             Country|      Item_Type|Units_Sold|
+---------+--------------------+---------------+----------+
|143555104|             Iceland|  Personal Care|     10000|
|885743367|              Panama|           Meat|     10000|
|122941577|Federated States ...|     Vegetables|     10000|
|230469834|               Qatar|           Meat|     10000|
|257909476|          Cape Verde|      Household|     10000|
|261322534|             Comoros|      Cosmetics|     10000|
|573532950|                Iran|         Snacks|     10000|
|225874030|               Ghana|         Cereal|     10000|
|936026847|               Nepal|Office Supplies|      9999|
|485435473|        South Africa|      Beverages|      9999|
+---------+--------------------+---------------+----------+
only showing top 10 rows



In [0]:
# Sort on two keys: Units_Sold descending, then Country ascending to order ties.
(
    sales_df
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        col('Units_Sold'),
    )
    .orderBy(
        col('Units_Sold').desc(),
        col('Country').asc(),
    )
    .show(20, truncate=False)
)

+---------+------------------------------+---------------+----------+
|Order_ID |Country                       |Item_Type      |Units_Sold|
+---------+------------------------------+---------------+----------+
|257909476|Cape Verde                    |Household      |10000     |
|261322534|Comoros                       |Cosmetics      |10000     |
|122941577|Federated States of Micronesia|Vegetables     |10000     |
|225874030|Ghana                         |Cereal         |10000     |
|143555104|Iceland                       |Personal Care  |10000     |
|573532950|Iran                          |Snacks         |10000     |
|885743367|Panama                        |Meat           |10000     |
|230469834|Qatar                         |Meat           |10000     |
|240709006|Andorra                       |Beverages      |9999      |
|895982539|Cameroon                      |Beverages      |9999      |
|572350203|Cote d'Ivoire                 |Snacks         |9999      |
|264735591|Cote d'Iv

In [0]:
# To control where null values land when sorting, use asc_nulls_first(),
# desc_nulls_first(), asc_nulls_last() or desc_nulls_last().
(
    sales_df
    .select('Region', 'Country')
    .orderBy(col('Region').asc_nulls_first())
    .show()
)


+------+-----------+
|Region|    Country|
+------+-----------+
|  Asia|     Taiwan|
|  Asia|   Maldives|
|  Asia|   Cambodia|
|  Asia| Kazakhstan|
|  Asia|      China|
|  Asia|  Singapore|
|  Asia|  Singapore|
|  Asia| Kazakhstan|
|  Asia|    Vietnam|
|  Asia| Uzbekistan|
|  Asia|   Mongolia|
|  Asia|    Vietnam|
|  Asia|      Nepal|
|  Asia|   Thailand|
|  Asia|     Bhutan|
|  Asia|   Mongolia|
|  Asia| Uzbekistan|
|  Asia|South Korea|
|  Asia|     Bhutan|
|  Asia| Kyrgyzstan|
+------+-----------+
only showing top 20 rows



In [0]:
# distinct -> returns the unique rows of the DataFrame
# (e.g. to find the unique values of a column, or to count them).

# How many different regions there are.
print(
    sales_df
    .select('Region')
    .distinct()
    .count()
)

# Which regions they are.
(
    sales_df
    .select('Region')
    .distinct()
    .show()
)


7
+--------------------+
|              Region|
+--------------------+
|Middle East and N...|
|Australia and Oce...|
|              Europe|
|  Sub-Saharan Africa|
|Central America a...|
|       North America|
|                Asia|
+--------------------+



In [0]:
# limit -> restricts the DataFrame to the number of rows given in parentheses.
# NOTE: show(10) only prints the first 10 rows of the full DataFrame, whereas
# limit(10) cuts the DataFrame itself down to its first 10 rows.
(
    sales_df
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        col('Units_Sold'),
    )
    .orderBy(
        col('Units_Sold').desc(),
        col('Country').asc(),
    )
    .limit(20)
    .count()
)


Out[29]: 20

In [0]:
'''
Realizar la siguiente CONSULTA:
     Devolver los campos producto , unidades vendidas, fechas de pedido y envío, de las ventas de la Zona Logística de Asia, ordenadas por país. Sólo nos interesan los 10 primeros.
'''
# Mapping from the wording of the exercise to the DataFrame columns:
#   product              -> Item_Type
#   units sold           -> Units_Sold
#   order date           -> Order_Date
#   shipping date        -> Ship_Date
#   Asia logistics zone  -> Region
#   country              -> Country

# Country is selected as well because the result is sorted by it.
(
    sales_df
    .filter(col('Region') == 'Asia')
    .select(
        col('Item_Type'),
        col('Units_Sold'),
        col('Order_Date'),
        col('Ship_Date'),
        col('Country'),
    )
    .orderBy(col('Country'))
    .show(10)
)


+----------+----------+----------+----------+----------+
| Item_Type|Units_Sold|Order_Date| Ship_Date|   Country|
+----------+----------+----------+----------+----------+
| Household|      4580|2015-06-02|2015-06-04|Bangladesh|
|Vegetables|      2224|2016-03-26|2016-04-24|Bangladesh|
| Household|      3481|2015-05-03|2015-05-21|Bangladesh|
|    Snacks|      7359|2013-05-27|2013-05-27|Bangladesh|
| Cosmetics|      8782|2015-11-14|2015-11-26|Bangladesh|
| Household|      8506|2013-07-01|2013-07-30|Bangladesh|
|   Clothes|      3853|2012-07-24|2012-08-31|Bangladesh|
| Household|      8783|2010-11-19|2010-12-02|Bangladesh|
|    Snacks|      8063|2013-11-08|2013-12-24|Bangladesh|
|   Clothes|        58|2011-09-29|2011-10-28|Bangladesh|
+----------+----------+----------+----------+----------+
only showing top 10 rows

Out[43]: "\n(sales_df.filter((col('Region')=='Asia'))\n\t   .select(col('Country')\n"

In [0]:
# withColumn() adds a new column to the DataFrame, for example a computed field.

from pyspark.sql.functions import lit

# Fixed value: lit() wraps the constant so it can be used as a column.
(
    sales_df
    .withColumn("Sent", lit(False))
    .show(5)
)

# Computed field: units sold multiplied by the unit price.
(
    sales_df
    .withColumn("Total_Price", expr("Units_Sold *  Unit_Price"))
    .show(5)
)


+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+-----+
| Order_ID|Order_Date|    Item_Type|Units_Sold|Unit_Price|              Region|             Country| Ship_Date| Sent|
+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+-----+
|535113847|2014-10-08|       Snacks|       934|    152.58|Middle East and N...|          Azerbaijan|2014-10-23|false|
|874708545|2015-02-22|    Cosmetics|      4551|     437.2|Central America a...|              Panama|2015-02-27|false|
|854349935|2015-12-09|       Fruits|      9986|      9.33|  Sub-Saharan Africa|Sao Tome and Prin...|2016-01-18|false|
|892836844|2014-09-17|Personal Care|      9118|     81.73|  Sub-Saharan Africa|Sao Tome and Prin...|2014-10-12|false|
|129280602|2010-02-04|    Household|      5858|    668.27|Central America a...|              Belize|2010-03-05|false|
+---------+----------+-------------+----------+---------

In [0]:
# withColumnRenamed(existing, new) shows the data with 'Region' renamed to
# 'Logistics_Area'; the result is only displayed, sales_df is not reassigned.
(
    sales_df
    .withColumnRenamed('Region', 'Logistics_Area')
    .show(5)
)


+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+
| Order_ID|Order_Date|    Item_Type|Units_Sold|Unit_Price|      Logistics_Area|             Country| Ship_Date|
+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+
|535113847|2014-10-08|       Snacks|       934|    152.58|Middle East and N...|          Azerbaijan|2014-10-23|
|874708545|2015-02-22|    Cosmetics|      4551|     437.2|Central America a...|              Panama|2015-02-27|
|854349935|2015-12-09|       Fruits|      9986|      9.33|  Sub-Saharan Africa|Sao Tome and Prin...|2016-01-18|
|892836844|2014-09-17|Personal Care|      9118|     81.73|  Sub-Saharan Africa|Sao Tome and Prin...|2014-10-12|
|129280602|2010-02-04|    Household|      5858|    668.27|Central America a...|              Belize|2010-03-05|
+---------+----------+-------------+----------+----------+--------------------+--------------------+----

In [0]:
# Build resumen_df: sales_df plus the computed Total_Price column.
# It is the input for the drop() example in the following cell; drop() removes
# one or more columns, which are passed one after another.
resumen_df = sales_df.withColumn(
    "Total_Price",
    expr("Units_Sold *  Unit_Price"),
)

resumen_df.show(5)


+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+------------------+
| Order_ID|Order_Date|    Item_Type|Units_Sold|Unit_Price|              Region|             Country| Ship_Date|       Total_Price|
+---------+----------+-------------+----------+----------+--------------------+--------------------+----------+------------------+
|535113847|2014-10-08|       Snacks|       934|    152.58|Middle East and N...|          Azerbaijan|2014-10-23|         142509.72|
|874708545|2015-02-22|    Cosmetics|      4551|     437.2|Central America a...|              Panama|2015-02-27|         1989697.2|
|854349935|2015-12-09|       Fruits|      9986|      9.33|  Sub-Saharan Africa|Sao Tome and Prin...|2016-01-18|          93169.38|
|892836844|2014-09-17|Personal Care|      9118|     81.73|  Sub-Saharan Africa|Sao Tome and Prin...|2014-10-12|         745214.14|
|129280602|2010-02-04|    Household|      5858|    668.27|Central America a...|    

In [0]:
# Remove the 'Unit_Price' and 'Region' columns from resumen_df;
# they no longer appear in the printSchema() output.
resumen_df = resumen_df.drop(
    'Unit_Price',
    'Region',
)

resumen_df.printSchema()

root
 |-- Order_ID: integer (nullable = true)
 |-- Order_Date: date (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Units_Sold: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Ship_Date: date (nullable = true)
 |-- Total_Price: double (nullable = true)



In [0]:
# dropna(how='any', thresh=None, subset=None):
#   how    -> 'any' / 'all'
#   thresh -> minimum number of valid (non-null) values a row must have;
#             rows with fewer are removed
#   subset -> apply the check only to the given columns

df.show()

# Remove the rows whose salary is null.
not_null_df = df.dropna(subset=['salary'])

not_null_df.show()



+---------+-------------+-----+------+------+
|firstname|     lastname|   id|gender|salary|
+---------+-------------+-----+------+------+
|     null|     Smith   |36636|     M|  3500|
|  Michael|         Rose|40288|     M|  4750|
|   Robert|     Williams|42114|     M|  null|
|    Maria|    Jones    |39192|     F|  4000|
+---------+-------------+-----+------+------+

+---------+-------------+-----+------+------+
|firstname|     lastname|   id|gender|salary|
+---------+-------------+-----+------+------+
|     null|     Smith   |36636|     M|  3500|
|  Michael|         Rose|40288|     M|  4750|
|    Maria|    Jones    |39192|     F|  4000|
+---------+-------------+-----+------+------+



In [0]:
# cast() together with withColumn() changes the type of a column:
# here Order_ID is replaced by its string version, as the printed schema shows.
(
    sales_df
    .withColumn('Order_ID', col('Order_ID').cast('string'))
    .printSchema()
)

root
 |-- Order_ID: string (nullable = true)
 |-- Order_Date: date (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Units_Sold: integer (nullable = true)
 |-- Unit_Price: double (nullable = true)
 |-- Region: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Ship_Date: date (nullable = true)



In [0]:
# upper() converts a string column to upper case.
# (lower is imported alongside it but is not used in this cell.)
from pyspark.sql.functions import upper, lower

# Replace firstname with its upper-case version.
corregido1 = df.withColumn(
    'firstname',
    upper(col('firstname')),
)

corregido1.show()


+---------+-------------+-----+------+------+
|firstname|     lastname|   id|gender|salary|
+---------+-------------+-----+------+------+
|     null|     Smith   |36636|     M|  3500|
|  MICHAEL|         Rose|40288|     M|  4750|
|   ROBERT|     Williams|42114|     M|  null|
|    MARIA|    Jones    |39192|     F|  4000|
+---------+-------------+-----+------+------+



In [0]:
# trim() strips the spaces on both sides of a string column.
# (ltrim and rtrim are imported alongside it but are not used in this cell.)
from pyspark.sql.functions import ltrim, rtrim, trim

# Replace lastname with its trimmed version.
corregido2 = df.withColumn(
    'lastname',
    trim(col('lastname')),
)

corregido2.show()



+---------+--------+-----+------+------+
|firstname|lastname|   id|gender|salary|
+---------+--------+-----+------+------+
|     null|   Smith|36636|     M|  3500|
|  Michael|    Rose|40288|     M|  4750|
|   Robert|Williams|42114|     M|  null|
|    Maria|   Jones|39192|     F|  4000|
+---------+--------+-----+------+------+



In [0]:
# Spark SQL supports a wide variety of storage formats. General pattern:
# dataframe.write.format(args).mode(save_mode).option('key','value').save(path)

path = 'dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales'

# Write sales_df as snappy-compressed Parquet, overwriting any previous output at `path`.
(
    sales_df.write
    .format("parquet")
    .mode("overwrite")
    .option("compression", "snappy")
    .save(path)
)

In [0]:
%fs ls 'dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales'

path,name,size,modificationTime
dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales/_SUCCESS,_SUCCESS,0,1682625444000
dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales/_committed_9007375134093540847,_committed_9007375134093540847,224,1682625443000
dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales/_started_9007375134093540847,_started_9007375134093540847,0,1682625442000
dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales/part-00000-tid-9007375134093540847-301cddc8-a25a-4208-be1a-21f08e3a2bf8-115-1-c000.snappy.parquet,part-00000-tid-9007375134093540847-301cddc8-a25a-4208-be1a-21f08e3a2bf8-115-1-c000.snappy.parquet,662217,1682625443000
dbfs:/FileStore/shared_uploads/ciberniainfo@gmail.com/sales/part-00001-tid-9007375134093540847-301cddc8-a25a-4208-be1a-21f08e3a2bf8-116-1-c000.snappy.parquet,part-00001-tid-9007375134093540847-301cddc8-a25a-4208-be1a-21f08e3a2bf8-116-1-c000.snappy.parquet,588435,1682625443000


In [0]:
'''
Partir del Dataframe “Sales” original (desde cero)
Simulación ETL sencilla.
Crear columna descuento fijo (idea: si expresamos el descuento como proporción, lo podemos utilizar directamente para calcular el precio, multiplicando)
	Precio: 200 🡪 descuento 25% 🡪 1 – 0,25 = 0,75 🡪 precio final = 0,75 * precio
Crear columna precio total (precio * unidades * descuento). Eliminar después columna precio unitario y descuento
Pasar el campo Región a MAYÚSCULAS. Cambiar el nombre del campo a “Logist_Area”
'''

