In [1]:
from pyspark.sql.functions import udf, col, count
from pyspark.sql.functions import mean as _mean
from pyspark.sql.types import IntegerType, LongType

In [2]:
# Echo the `spark` object as the cell result to confirm it exists in this kernel.
# NOTE(review): presumably the SparkSession pre-created by the PySpark shell/kernel;
# it is never defined in this notebook — confirm when running elsewhere.
spark

In [3]:
# Distribute three 3-tuples as an RDD; each tuple becomes one row later on.
# NOTE(review): `sc` is not defined in this notebook — presumably the SparkContext
# injected by the PySpark kernel; confirm.
rdd = sc.parallelize(
    [
        (1, 2, 3),
        (1, 5, 6),
        (2, 8, 9),
    ]
)

In [4]:
# Turn the RDD of tuples into a DataFrame, naming the three positional fields.
df = rdd.toDF(schema=["a", "b", "c"])

In [5]:
# Pull every row back to the driver as a list of Row objects.
# Fine for this three-row demo frame; avoid on large data.
df.collect()

[Row(a=1, b=2, c=3), Row(a=1, b=5, c=6), Row(a=2, b=8, c=9)]

In [6]:
# Print the inferred schema: the Python ints come out as nullable `long` columns.
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: long (nullable = true)
 |-- c: long (nullable = true)



In [7]:
# Render the DataFrame's rows as a plain-text table.
df.show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  5|  6|
|  2|  8|  9|
+---+---+---+



In [8]:
# Project a single column. The cell result is the new DataFrame's repr
# (`DataFrame[a: bigint]`), not its rows — nothing is shown until an action
# such as show() or collect() is called on it.
df.select("a")

DataFrame[a: bigint]

In [9]:
# Keep only the rows whose `a` value is at least 2, then print them.
(
    df
    .filter(col("a") >= 2)
    .show()
)

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  2|  8|  9|
+---+---+---+



In [10]:
# Group by `a` and sum with no column list: every numeric column is totalled,
# including the grouping key itself (hence the `sum(a)` column in the output).
df.groupBy("a").sum().show()

+---+------+------+------+
|  a|sum(a)|sum(b)|sum(c)|
+---+------+------+------+
|  1|     2|     7|     9|
|  2|     2|     8|     9|
+---+------+------+------+



In [11]:
# Per-`a` aggregates: how many `b` values each group has and the mean of `c`.
# NOTE(review): the alias reads "count_a" but the counted column is `b`.
df.groupBy("a").agg(
    count("b").alias("count_a"),
    _mean("c").alias("avg_c"),
).show()

+---+-------+-----+
|  a|count_a|avg_c|
+---+-------+-----+
|  1|      2|  4.5|
|  2|      1|  9.0|
+---+-------+-----+



In [12]:
# Same aggregation as the previous cell, but print the physical plan
# instead of executing it.
df.groupBy("a").agg(
    count("b").alias("count_a"),
    _mean("c").alias("avg_c"),
).explain()

== Physical Plan ==
*(2) HashAggregate(keys=[a#0L], functions=[count(b#1L), avg(c#2L)])
+- Exchange hashpartitioning(a#0L, 200)
   +- *(1) HashAggregate(keys=[a#0L], functions=[partial_count(b#1L), partial_avg(c#2L)])
      +- Scan ExistingRDD[a#0L,b#1L,c#2L]


In [56]:
def my_add(a, b):
    """Return ``a + b``, propagating missing values.

    This function is wrapped as a Spark UDF and applied to nullable
    columns, where a null cell reaches Python as ``None``. ``None + x``
    would raise ``TypeError`` and fail the job, so a missing operand
    yields ``None`` (SQL NULL) instead.

    Args:
        a: Left operand, or ``None``.
        b: Right operand, or ``None``.

    Returns:
        ``a + b``, or ``None`` if either operand is ``None``.
    """
    if a is None or b is None:
        return None
    return a + b

In [57]:
# Wrap my_add as a Spark UDF. The columns it is applied to (`b`, `c`) are
# `long` (64-bit), so the result type is declared as LongType to match;
# IntegerType is only 32-bit and cannot represent every sum of two longs.
udf_my_add = udf(my_add, LongType())

In [58]:
# Append column `d` = my_add(b, c), evaluated row by row through the UDF,
# and print the result.
df.withColumn(
    "d",
    udf_my_add(col("b"), col("c")),
).show()

+---+---+---+---+
|  a|  b|  c|  d|
+---+---+---+---+
|  1|  2|  3|  5|
|  1|  5|  6| 11|
|  2|  8|  9| 17|
+---+---+---+---+

