In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Reuse the active Spark session if there is one; otherwise create it with
# default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

## Tip #1: Build a list of column expressions and unpack it into `select`

In [2]:
# Single-column frame of 20 rows; toDF() renames range()'s default "id"
# column to "column_1".
df1 = spark.range(20).toDF("column_1")

In [3]:
# Second single-column frame of 20 rows, with the column named "column_2".
df2 = spark.range(20).toDF("column_2")

In [4]:
# Non-equi join: keep every (column_1, column_2) pair whose two values differ,
# then redistribute the result over 5 partitions.
df = (
    df1
    .join(df2, on=df1["column_1"] != df2["column_2"])
    .repartition(5)
)

In [5]:
# Peek at three rows of the joined frame.
df.show(3)

+--------+--------+
|column_1|column_2|
+--------+--------+
|       1|      10|
|       0|      18|
|       2|      14|
+--------+--------+
only showing top 3 rows



In [6]:
# One Column object per column of df, in the order given by df.columns.
columns_list = [df[name] for name in df.columns]

In [7]:
# Display the Column objects held in columns_list.
columns_list

[Column<b'column_1'>, Column<b'column_2'>]

In [8]:
# Double every column of df.
# The expressions are rebuilt from df itself instead of from the previous
# value of columns_list, so re-running this cell always yields "column * 2"
# rather than compounding to "column * 4", "column * 8", ...
columns_list = [2 * df[column_name] for column_name in df.columns]

In [9]:
# Display the doubled column expressions.
columns_list

[Column<b'(column_1 * 2)'>, Column<b'(column_2 * 2)'>]

In [10]:
# select() also accepts one list of columns, so the star column and the
# derived expressions are concatenated and passed together.
df.select([df["*"]] + columns_list).show(3)

+--------+--------+--------------+--------------+
|column_1|column_2|(column_1 * 2)|(column_2 * 2)|
+--------+--------+--------------+--------------+
|       1|      10|             2|            20|
|       0|      18|             0|            36|
|       2|      14|             4|            28|
+--------+--------+--------------+--------------+
only showing top 3 rows



## Tip #2: Give derived columns readable names

In [11]:
# Attach a readable name to each derived expression and add them all in a
# single select(). This produces the same frame as chaining withColumn() in a
# loop, but with one projection instead of one per added column (the Spark
# docs warn that looping over withColumn() can generate very large plans).
multiplied_columns = [
    column.alias(f"column_{idx+1}_multiplied")
    for idx, column in enumerate(columns_list)
]
with_column_df = df.select("*", *multiplied_columns)
with_column_df.show(3)

+--------+--------+-------------------+-------------------+
|column_1|column_2|column_1_multiplied|column_2_multiplied|
+--------+--------+-------------------+-------------------+
|       1|      10|                  2|                 20|
|       0|      18|                  0|                 36|
|       2|      14|                  4|                 28|
+--------+--------+-------------------+-------------------+
only showing top 3 rows



## Tip #3: Name aggregate columns with `agg` and `alias`

In [12]:
# sum() with no arguments totals every numeric column per group, including
# the grouping key itself, as the sum(column_1) column in the output shows.
(
    df
    .groupBy("column_1")
    .sum()
    .show(3)
)

+--------+-------------+-------------+
|column_1|sum(column_1)|sum(column_2)|
+--------+-------------+-------------+
|      19|          361|          171|
|       0|            0|          190|
|       7|          133|          183|
+--------+-------------+-------------+
only showing top 3 rows



In [13]:
# NOTE: this rebinds the name `sum` to Spark's aggregate function, shadowing
# Python's built-in sum() in every cell that runs afterwards.
from pyspark.sql.functions import sum 

In [14]:
# Aggregate through the F namespace so the cell does not depend on a bare
# `sum` name, which is ambiguous with Python's built-in sum().
# alias() gives each aggregate a stable, readable output column name.
column_1_summed = F.sum("column_1").alias("column_1_sum")
column_2_summed = F.sum("column_2").alias("column_2_sum")
df.groupBy("column_1").agg(column_1_summed, column_2_summed).show(3)

+--------+------------+------------+
|column_1|column_1_sum|column_2_sum|
+--------+------------+------------+
|      19|         361|         171|
|       0|           0|         190|
|       7|         133|         183|
+--------+------------+------------+
only showing top 3 rows



## Tip #4: Three ways to apply a SQL function (`functions`, `spark.sql`, `selectExpr`)

In [15]:
from pyspark.sql.functions import sin

In [16]:
# Apply sin() to both columns through the DataFrame API.
df.select(*[sin(df[name]) for name in ("column_1", "column_2")]).show(3)

+------------------+-------------------+
|     SIN(column_1)|      SIN(column_2)|
+------------------+-------------------+
|0.8414709848078965|-0.5440211108893698|
|               0.0|-0.7509872467716762|
|0.9092974268256817| 0.9906073556948704|
+------------------+-------------------+
only showing top 3 rows



In [17]:
# Register df as a temporary view named "data" so it can be queried with SQL.
df.createOrReplaceTempView("data")

In [18]:
# Same computation expressed as a SQL query against the "data" temp view.
# The output header shows the integer columns being cast to DOUBLE before
# sin() is applied.
spark.sql("""
select sin(column_1), sin(column_2)
from data
""").show(3)

+-----------------------------+-----------------------------+
|SIN(CAST(column_1 AS DOUBLE))|SIN(CAST(column_2 AS DOUBLE))|
+-----------------------------+-----------------------------+
|           0.8414709848078965|          -0.5440211108893698|
|                          0.0|          -0.7509872467716762|
|           0.9092974268256817|           0.9906073556948704|
+-----------------------------+-----------------------------+
only showing top 3 rows



In [19]:
# selectExpr() takes SQL expression strings; here they are passed as one list.
df.selectExpr(["sin(column_1)", "sin(column_2)"]).show(3)

+-----------------------------+-----------------------------+
|SIN(CAST(column_1 AS DOUBLE))|SIN(CAST(column_2 AS DOUBLE))|
+-----------------------------+-----------------------------+
|           0.8414709848078965|          -0.5440211108893698|
|                          0.0|          -0.7509872467716762|
|           0.9092974268256817|           0.9906073556948704|
+-----------------------------+-----------------------------+
only showing top 3 rows



In [20]:
# "as" inside the SQL expression names the output column.
# NOTE: the rows displayed below are not the same rows as in the previous
# cells. df has no explicit ordering, so show(3) is not guaranteed to return
# the same three rows on every action.
df.selectExpr("sin(column_2) as column_2_sin").show(3)

+-------------------+
|       column_2_sin|
+-------------------+
|0.14987720966295234|
|-0.9999902065507035|
| 0.9906073556948704|
+-------------------+
only showing top 3 rows



## Tip #5: Working with `Row` objects

In [21]:
# take(3) brings three rows back as a Python list of Row objects.
df.take(3)

[Row(column_1=1, column_2=10),
 Row(column_1=0, column_2=18),
 Row(column_1=2, column_2=14)]

In [22]:
# Map every element of the underlying RDD to its Python type and fetch one,
# showing that each element is a Row.
df.rdd.map(lambda row: type(row)).take(1)

[pyspark.sql.types.Row]

In [23]:
# Pull a single Row out of the DataFrame (head(1) returns a one-element list).
a_row = df.head(1)[0]

In [24]:
# Display the Row; its repr lists each field name with its value.
a_row

Row(column_1=1, column_2=10)

In [25]:
# A Row field can be read by column name, like a dictionary key.
a_row["column_1"]

1

In [26]:
# Convert the Row into a plain Python dict keyed by column name.
a_row.asDict()

{'column_1': 1, 'column_2': 10}