In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark

# 🧪 Funciones para Transformaciones en PySpark

In [2]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start one named "LibrosDF".
spark = (
    SparkSession.builder
    .appName("LibrosDF")
    .getOrCreate()
)

# Read the CSV, taking column names from the first row and letting Spark infer column types.
df = spark.read.csv(
    path='/content/libros.csv',
    header=True,
    inferSchema=True,
)

# Preview the first two rows.
df.show(n=2)

+--------------------+------------+-----------+-------+-----+----+-----------+
|                Name|      Author|User Rating|Reviews|Price|Year|      Genre|
+--------------------+------------+-----------+-------+-----+----+-----------+
|10-Day Green Smoo...|    JJ Smith|        4.7|  17350|    8|2016|Non Fiction|
|   11/22/63: A Novel|Stephen King|        4.6|   2052|   22|2011|    Fiction|
+--------------------+------------+-----------+-------+-----+----+-----------+
only showing top 2 rows


## EJECUTANDO SQL CON PYSPARK

In [5]:
# Register the DataFrame as a temporary view so it can be queried by name from SQL.
df.createOrReplaceTempView(name="libros")

# Run a plain SQL projection against that view and print the first four rows.
spark.sql(
    "SELECT Name,Author FROM libros"
).show(n=4)

+--------------------+------------------+
|                Name|            Author|
+--------------------+------------------+
|10-Day Green Smoo...|          JJ Smith|
|   11/22/63: A Novel|      Stephen King|
|12 Rules for Life...|Jordan B. Peterson|
|1984 (Signet Clas...|     George Orwell|
+--------------------+------------------+
only showing top 4 rows


# 📘 1. select(): Seleccionar columnas

In [6]:
# Keep only the Name and Author columns and print four rows.
(
    df
    .select(df["Name"], df["Author"])
    .show(n=4)
)

+--------------------+------------------+
|                Name|            Author|
+--------------------+------------------+
|10-Day Green Smoo...|          JJ Smith|
|   11/22/63: A Novel|      Stephen King|
|12 Rules for Life...|Jordan B. Peterson|
|1984 (Signet Clas...|     George Orwell|
+--------------------+------------------+
only showing top 4 rows


# 📘 2. selectExpr(): Usar expresiones SQL

In [7]:
# Each entry is a SQL expression string: keep Name and derive next_year as Year + 1.
(
    df
    .selectExpr(["Name", "(Year + 1) as next_year"])
    .show(n=5)
)

+--------------------+---------+
|                Name|next_year|
+--------------------+---------+
|10-Day Green Smoo...|     2017|
|   11/22/63: A Novel|     2012|
|12 Rules for Life...|     2019|
|1984 (Signet Clas...|     2018|
|5,000 Awesome Fac...|     2020|
+--------------------+---------+
only showing top 5 rows


# 📘 3. filter(): Filtrar libros publicados después del año 2015 (igual que el WHERE de SQL)

In [9]:
# Keep rows whose Year is strictly greater than 2015, using a Column expression.
(
    df
    .filter(df.Year > 2015)
    .show(n=5)
)

+--------------------+--------------------+-----------+-------+-----+----+-----------+
|                Name|              Author|User Rating|Reviews|Price|Year|      Genre|
+--------------------+--------------------+-----------+-------+-----+----+-----------+
|10-Day Green Smoo...|            JJ Smith|        4.7|  17350|    8|2016|Non Fiction|
|12 Rules for Life...|  Jordan B. Peterson|        4.7|  18979|   15|2018|Non Fiction|
|1984 (Signet Clas...|       George Orwell|        4.7|  21424|    6|2017|    Fiction|
|5,000 Awesome Fac...|National Geograph...|        4.8|   7665|   12|2019|Non Fiction|
|A Gentleman in Mo...|         Amor Towles|        4.7|  19699|   15|2017|    Fiction|
+--------------------+--------------------+-----------+-------+-----+----+-----------+
only showing top 5 rows


# 📘 4. where(): Alias de filter(); aquí la condición se escribe como expresión SQL

In [10]:
# Keep rows whose Year is strictly less than 2015; the condition is given as a SQL string.
(
    df
    .where("Year < 2015")
    .show(n=5)
)

+--------------------+-------------------+-----------+-------+-----+----+-----------+
|                Name|             Author|User Rating|Reviews|Price|Year|      Genre|
+--------------------+-------------------+-----------+-------+-----+----+-----------+
|   11/22/63: A Novel|       Stephen King|        4.6|   2052|   22|2011|    Fiction|
|A Dance with Drag...|George R. R. Martin|        4.4|  12643|   11|2011|    Fiction|
|A Game of Thrones...|George R. R. Martin|        4.7|  19735|   30|2014|    Fiction|
|A Patriot's Histo...|   Larry Schweikart|        4.6|    460|    2|2010|Non Fiction|
|A Stolen Life: A ...|      Jaycee Dugard|        4.6|   4149|   32|2011|Non Fiction|
+--------------------+-------------------+-----------+-------+-----+----+-----------+
only showing top 5 rows


# 📘 5. distinct(): Eliminar autores duplicados

In [11]:
# Project the Author column, drop repeated values, and print five of the remaining rows.
(
    df
    .select(df["Author"])
    .distinct()
    .show(n=5)
)

+--------------+
|        Author|
+--------------+
|   Sarah Young|
|    Jill Twiss|
|    Sara Gruen|
|    Doug Lemov|
|Justin Halpern|
+--------------+
only showing top 5 rows


# 📘 6. sort(): Ordenar libros por año

In [12]:
# Sort by Year using the default (ascending) order and print the first five rows.
(
    df
    .sort(df["Year"])
    .show(n=5)
)

+--------------------+----------------+-----------+-------+-----+----+-----------+
|                Name|          Author|User Rating|Reviews|Price|Year|      Genre|
+--------------------+----------------+-----------+-------+-----+----+-----------+
|Dead And Gone: A ...|Charlaine Harris|        4.6|   1541|    4|2009|    Fiction|
|Eat This, Not Tha...| David Zinczenko|        4.3|    956|   14|2009|Non Fiction|
|Diary of a Wimpy ...|     Jeff Kinney|        4.8|   3837|   15|2009|    Fiction|
|Act Like a Lady, ...|    Steve Harvey|        4.6|   5013|   17|2009|Non Fiction|
|Divine Soul Mind ...|    Zhi Gang Sha|        4.6|     37|    6|2009|Non Fiction|
+--------------------+----------------+-----------+-------+-----+----+-----------+
only showing top 5 rows


# 📘 7. orderBy(): Igual que sort() (aquí en orden descendente)

In [13]:
# Order by Year from newest to oldest and print the first five rows.
(
    df
    .orderBy(df["Year"].desc())
    .show(n=5)
)

+--------------------+--------------+-----------+-------+-----+----+-----------+
|                Name|        Author|User Rating|Reviews|Price|Year|      Genre|
+--------------------+--------------+-----------+-------+-----+----+-----------+
|Difficult Riddles...| M Prefontaine|        4.6|   7955|    5|2019|Non Fiction|
|Giraffes Can't Dance| Giles Andreae|        4.8|  14038|    4|2019|    Fiction|
|Dog Man: Brawl of...|    Dav Pilkey|        4.9|   7235|    4|2019|    Fiction|
|            Becoming|Michelle Obama|        4.8|  61133|   11|2019|Non Fiction|
|Dog Man: Fetch-22...|    Dav Pilkey|        4.9|  12619|    8|2019|    Fiction|
+--------------------+--------------+-----------+-------+-----+----+-----------+
only showing top 5 rows


# 📘 8. withColumn(): Crear una nueva columna con el año aumentado en 10

In [14]:
from pyspark.sql.functions import col
# Append a year_plus column equal to Year + 10 and print five rows; df is not reassigned.
(
    df
    .withColumn("year_plus", col("Year") + 10)
    .show(n=5)
)

+--------------------+--------------------+-----------+-------+-----+----+-----------+---------+
|                Name|              Author|User Rating|Reviews|Price|Year|      Genre|year_plus|
+--------------------+--------------------+-----------+-------+-----+----+-----------+---------+
|10-Day Green Smoo...|            JJ Smith|        4.7|  17350|    8|2016|Non Fiction|     2026|
|   11/22/63: A Novel|        Stephen King|        4.6|   2052|   22|2011|    Fiction|     2021|
|12 Rules for Life...|  Jordan B. Peterson|        4.7|  18979|   15|2018|Non Fiction|     2028|
|1984 (Signet Clas...|       George Orwell|        4.7|  21424|    6|2017|    Fiction|     2027|
|5,000 Awesome Fac...|National Geograph...|        4.8|   7665|   12|2019|Non Fiction|     2029|
+--------------------+--------------------+-----------+-------+-----+----+-----------+---------+
only showing top 5 rows


# 📘 9. withColumnRenamed(): Renombrar la columna "Name" a "Title"

In [15]:
# Show the data with the Name column renamed to Title; df is not reassigned.
(
    df
    .withColumnRenamed(existing="Name", new="Title")
    .show(n=5)
)

+--------------------+--------------------+-----------+-------+-----+----+-----------+
|               Title|              Author|User Rating|Reviews|Price|Year|      Genre|
+--------------------+--------------------+-----------+-------+-----+----+-----------+
|10-Day Green Smoo...|            JJ Smith|        4.7|  17350|    8|2016|Non Fiction|
|   11/22/63: A Novel|        Stephen King|        4.6|   2052|   22|2011|    Fiction|
|12 Rules for Life...|  Jordan B. Peterson|        4.7|  18979|   15|2018|Non Fiction|
|1984 (Signet Clas...|       George Orwell|        4.7|  21424|    6|2017|    Fiction|
|5,000 Awesome Fac...|National Geograph...|        4.8|   7665|   12|2019|Non Fiction|
+--------------------+--------------------+-----------+-------+-----+----+-----------+
only showing top 5 rows
