In [4]:
# Load the example CSV into a DataFrame: the first row is treated as the
# header and the column types are inferred from the data.
path = "/data/students/bigdata_internet/dataframe_examples/mycsv.csv"
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(path)
)
df.show()

+---------+---+-----+
|     name|age| dept|
+---------+---+-----+
|    Paolo| 50|  DET|
|     Luca| 40|DAUIN|
|  Martino| 30|  DAD|
|    Paolo| 25| DIST|
|Francesca| 40|DAUIN|
|    Paolo| 32|  DET|
|     Luca| 56|  DAD|
+---------+---+-----+



In [3]:
# Expose the DataFrame to Spark SQL as a temporary view called "myTable".
df.createOrReplaceTempView("myTable")

In [13]:
# With SQL:
# Keep only the rows with age > 45, count them per department and list the
# departments in descending order. Adjacent string literals are joined by the
# parser, so the query text handed to Spark is the same as before.
df_out = spark.sql(
    " SELECT dept, count(dept) "
    "          FROM myTable "
    "          WHERE age>45 "
    "          GROUP BY dept "
    "          ORDER BY dept DESC "
    "          "
)
df_out.show()

+----+-----------+
|dept|count(dept)|
+----+-----------+
| DET|          1|
| DAD|          1|
+----+-----------+



In [16]:
# With DF:
# Same query as the SQL cell, expressed with DataFrame methods.
df_filt = (
    df
    .filter('age>45')                   # WHERE age>45
    .groupBy("dept")                    # GROUP BY dept
    .agg({'dept': 'count'})             # count(dept)
    .sort('dept', ascending=False)      # ORDER BY dept DESC
)
df_filt.show()

+----+-----------+
|dept|count(dept)|
+----+-----------+
| DET|          1|
| DAD|          1|
+----+-----------+



                                                                                

In [17]:
# Create a UDF:
# - The return type is declared explicitly ("int"). When it is omitted,
#   register() falls back to a string return type, so the numeric result
#   would come back as text.
# - A NULL input is propagated as NULL instead of raising TypeError in len().
spark.udf.register("myFunct", lambda x: None if x is None else len(x) + 25, "int")

<function __main__.<lambda>(x)>

In [19]:
# Use the UDF to add 25 to the length of each name and store each result in a row of a new DF

# DF
(
    df
    .selectExpr("myFunct(name)")
    .show()
)

+-------------+
|myFunct(name)|
+-------------+
|           30|
|           29|
|           32|
|           30|
|           34|
|           30|
|           29|
+-------------+



In [21]:
# SQL: the same UDF, called by its registered name in a query on the view
(
    spark
    .sql("SELECT myFunct(name) FROM myTable")
    .show()
)

+-------------+
|myFunct(name)|
+-------------+
|           30|
|           29|
|           32|
|           30|
|           34|
|           30|
|           29|
+-------------+



In [23]:
# By creating a standalone python function:
def myFunct2(x):
    """Return the length of ``x`` rendered as text, plus 25.

    Args:
        x: The value to measure. It is converted with ``str()`` first, so
            non-string values (e.g. numbers) are accepted as well.

    Returns:
        ``len(str(x)) + 25`` as an int, or ``None`` when ``x`` is ``None``.
    """
    # A missing value must stay missing: without this guard, str(None) would
    # be measured as the 4-character text "None" and silently yield 29.
    if x is None:
        return None
    return len(str(x)) + 25

In [24]:
# NOTE: the name registered for SQL may differ from the name of the Python
# function, but keeping the two identical avoids confusion.
# The return type is declared explicitly ("int"): when it is omitted,
# register() falls back to a string return type and the result column would
# hold text instead of numbers.
spark.udf.register('myFunct2', myFunct2, 'int')


<function __main__.myFunct2(x)>

In [25]:
# The registered UDF can be called by name from both APIs.

# DF: expression string evaluated against the DataFrame
(
    df
    .selectExpr("myFunct2(name)")
    .show()
)

# SQL: query against the "myTable" view
(
    spark
    .sql("SELECT myFunct2(name) FROM myTable")
    .show()
)

+--------------+
|myFunct2(name)|
+--------------+
|            30|
|            29|
|            32|
|            30|
|            34|
|            30|
|            29|
+--------------+

+--------------+
|myFunct2(name)|
+--------------+
|            30|
|            29|
|            32|
|            30|
|            34|
|            30|
|            29|
+--------------+

