In [2]:
import findspark
findspark.init()
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

In [8]:
# NOTE(review): this wildcard import is what brings lit, desc, count, col and
# concat into scope for the cells that use them. It also pulls in every other
# public name of pyspark.sql.functions, some of which (e.g. sum, min, max)
# shadow Python builtins -- prefer explicit imports if this notebook is reworked.
from pyspark.sql.functions import *

In [6]:
# Obtain the Spark session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [7]:
# Bare expression: the notebook displays the SparkContext as the cell output.
sc

In [13]:
# Sample records, one tuple per person: (Name, Age, City).
data = [
    ("Alice", 25, "New York"),
    ("Bob", 30, "Los Angeles"),
    ("Charlie", 22, "Chicago"),
]

# Build the base DataFrame that the following cells query and transform.
df = spark.createDataFrame(data, schema=["Name", "Age", "City"])
df.show()

+-------+---+-----------+
|   Name|Age|       City|
+-------+---+-----------+
|  Alice| 25|   New York|
|    Bob| 30|Los Angeles|
|Charlie| 22|    Chicago|
+-------+---+-----------+



In [15]:
# Project only the Name column.
p2 = df.select(df.Name)
p2.show()

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlie|
+-------+



In [18]:
# Names of people aged 25 or older.
# Filter while Age is still part of the frame, then project Name. Filtering
# after select("Name") references a column the projection has already removed
# and only works because Spark's analyzer re-adds the missing attribute.
p3 = df.filter(df.Age >= 25).select("Name")
p3.show()

+-----+
| Name|
+-----+
|Alice|
|  Bob|
+-----+



In [19]:
# Append a constant "Country" column holding the literal "US" to every row.
# df has no "Country" column, so selecting all columns plus the aliased
# literal yields the same frame as adding the column.
p4 = df.select("*", lit("US").alias("Country"))
p4.show()

+-------+---+-----------+-------+
|   Name|Age|       City|Country|
+-------+---+-----------+-------+
|  Alice| 25|   New York|     US|
|    Bob| 30|Los Angeles|     US|
|Charlie| 22|    Chicago|     US|
+-------+---+-----------+-------+



In [20]:
# Mean of Age: restrict to the Age column and ask summary() for "mean" only.
p5 = (
    df
    .select("Age")
    .summary("mean")
)
p5.show()

+-------+------------------+
|summary|               Age|
+-------+------------------+
|   mean|25.666666666666668|
+-------+------------------+



In [22]:
# Order the rows from oldest to youngest.
p6 = df.orderBy(df.Age.desc())
p6.show()

+-------+---+-----------+
|   Name|Age|       City|
+-------+---+-----------+
|    Bob| 30|Los Angeles|
|  Alice| 25|   New York|
|Charlie| 22|    Chicago|
+-------+---+-----------+



In [24]:
# Number of people per city, reported in a column named "#personas".
p7 = (
    df
    .groupBy(df.City)
    .agg(count(df.Name).alias("#personas"))
)
p7.show()

+-----------+---------+
|       City|#personas|
+-----------+---------+
|   New York|        1|
|Los Angeles|        1|
|    Chicago|        1|
+-----------+---------+



In [25]:
# Rename the "Name" column to "NombreCompleto"; the other columns are untouched.
p8 = df.withColumnRenamed(existing="Name", new="NombreCompleto")
p8.show()

+--------------+---+-----------+
|NombreCompleto|Age|       City|
+--------------+---+-----------+
|         Alice| 25|   New York|
|           Bob| 30|Los Angeles|
|       Charlie| 22|    Chicago|
+--------------+---+-----------+



In [26]:
# Same frame without the Age column.
p9 = df.drop(df.Age)
p9.show()

+-------+-----------+
|   Name|       City|
+-------+-----------+
|  Alice|   New York|
|    Bob|Los Angeles|
|Charlie|    Chicago|
+-------+-----------+



In [27]:
# Register df as the temporary view "tmp_table" so that the spark.sql(...)
# queries in this notebook can refer to it by name.
df.createOrReplaceTempView("tmp_table")

In [29]:
# SQL over the temp view: every row whose Age is at least 20.
(
    spark
    .sql("select * from tmp_table where Age >= 20")
    .show()
)

+-------+---+-----------+
|   Name|Age|       City|
+-------+---+-----------+
|  Alice| 25|   New York|
|    Bob| 30|Los Angeles|
|Charlie| 22|    Chicago|
+-------+---+-----------+



In [30]:
# SQL over the temp view: total of all ages, exposed as Age_sum.
(
    spark
    .sql("select sum(Age) Age_sum from tmp_table")
    .show()
)

+-------+
|Age_sum|
+-------+
|     77|
+-------+



In [31]:
# SQL over the temp view: youngest and oldest age in a single row.
(
    spark
    .sql("select min(Age) min_age, max(Age) max_age from tmp_table")
    .show()
)

+-------+-------+
|min_age|max_age|
+-------+-------+
|     22|     30|
+-------+-------+



In [33]:
# SQL over the temp view: people living in Chicago who are younger than 30.
(
    spark
    .sql("select * from tmp_table where City = 'Chicago' and Age < 30")
    .show()
)

+-------+---+-------+
|   Name|Age|   City|
+-------+---+-------+
|Charlie| 22|Chicago|
+-------+---+-------+



In [34]:
# Add "EdadDuplicada": each person's age multiplied by two.
p14 = df.withColumn(
    "EdadDuplicada",
    df["Age"] * 2,
)
p14.show()

+-------+---+-----------+-------------+
|   Name|Age|       City|EdadDuplicada|
+-------+---+-----------+-------------+
|  Alice| 25|   New York|           50|
|    Bob| 30|Los Angeles|           60|
|Charlie| 22|    Chicago|           44|
+-------+---+-----------+-------------+



In [35]:
# Add "EdadMeses": the age expressed in months (Age * 12).
# df has no "EdadMeses" column, so selecting everything plus the aliased
# expression is equivalent to adding the column.
p15 = df.select("*", (df.Age * 12).alias("EdadMeses"))
p15.show()

+-------+---+-----------+---------+
|   Name|Age|       City|EdadMeses|
+-------+---+-----------+---------+
|  Alice| 25|   New York|      300|
|    Bob| 30|Los Angeles|      360|
|Charlie| 22|    Chicago|      264|
+-------+---+-----------+---------+



In [39]:
# Count of Name values in the temp view, exposed as total_personas.
p16 = spark.sql(
    "select count(Name) total_personas from tmp_table"
)
p16.show()

+--------------+
|total_personas|
+--------------+
|             3|
+--------------+



In [42]:
# Keep only the people whose age is an even number.
p17 = df.where(col("Age") % 2 == 0)
p17.show()

+-------+---+-----------+
|   Name|Age|       City|
+-------+---+-----------+
|    Bob| 30|Los Angeles|
|Charlie| 22|    Chicago|
+-------+---+-----------+



In [44]:
# Bucket people into age ranges and count how many fall into each bucket.
# The query is written as a single multi-line string so that every keyword is
# separated by whitespace; concatenating fragments without spaces produced
# tokens such as "'0-20'when" and "'61+'end" that parsed only by luck.
# Ages matching no branch (negative or NULL) get a NULL age_range, as before.
p18 = spark.sql("""
    select
        case
            when age between 0 and 20 then '0-20'
            when age between 21 and 40 then '21-40'
            when age between 41 and 60 then '41-60'
            when age > 60 then '61+'
        end as age_range,
        count(name) as total_personas
    from tmp_table
    group by 1
""")
p18.show()

+---------+--------------+
|age_range|total_personas|
+---------+--------------+
|    21-40|             3|
+---------+--------------+



In [46]:
# Number of rows per distinct name, reported in a column named "#personas".
p19 = (
    df
    .groupBy(df.Name)
    .agg(count(df.Name).alias("#personas"))
)
p19.show()

+-------+---------+
|   Name|#personas|
+-------+---------+
|  Alice|        1|
|    Bob|        1|
|Charlie|        1|
+-------+---------+



In [47]:
# Add "InformacionPersonal": the name and the city joined as "<Name>, <City>".
p20 = df.withColumn(
    "InformacionPersonal",
    concat(df.Name, lit(", "), df.City),
)
p20.show()

+-------+---+-----------+-------------------+
|   Name|Age|       City|InformacionPersonal|
+-------+---+-----------+-------------------+
|  Alice| 25|   New York|    Alice, New York|
|    Bob| 30|Los Angeles|   Bob, Los Angeles|
|Charlie| 22|    Chicago|   Charlie, Chicago|
+-------+---+-----------+-------------------+

