In [5]:
from pyspark.sql import SparkSession


# Obtain the notebook's SparkSession: app name "00 SparkSession", master "local[*]".
spark = (
    SparkSession.builder
    .appName("00 SparkSession")
    .master("local[*]")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session object.
spark

In [9]:
# Sample user rows; each inner list is [id, name, age], matching the schemas
# applied to it when the DataFrames are created.
data = [
    [1, "Gav", 35],
    [2, "Zac", 34],
    [3, "Hr", 28],
    [4, "Mar", 26],
]

+---+----+---+
| id|name|age|
+---+----+---+
|  1| Gav| 35|
|  2| Zac| 34|
|  3|  Hr| 28|
|  4| Mar| 26|
+---+----+---+



In [None]:
# Schema given as a single string: comma-separated "<column> <type>" pairs.
schema = "id int, name string, age int"

# Build the users DataFrame from the in-memory rows with that string schema,
# then print its contents.
df_users = spark.createDataFrame(data, schema=schema)
df_users.show()

In [15]:
# Schema given programmatically: a StructType wrapping one StructField per column.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Each field is declared with nullable=False (StructField's third argument).
py_schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("name", StringType(), nullable=False),
    StructField("age", IntegerType(), nullable=False),
])

# Same rows as before, this time typed by the StructType schema.
df_other_users = spark.createDataFrame(data, schema=py_schema)

In [29]:
# Convert the string schema into a PySpark schema object.
# NOTE(review): the leading underscore marks `_parse_datatype_string` as a
# private helper by Python naming convention — confirm it is still available
# in the PySpark version this notebook targets.
from pyspark.sql.types import _parse_datatype_string


# `schema` is the string form defined earlier in the notebook.
pyschema = _parse_datatype_string(schema)

# Bare last expression so the notebook echoes the parsed result
# (the recorded output is a StructType whose three fields are nullable).
pyschema

StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True)])

In [31]:
# Inspect the DataFrame's schema in two ways.

# printSchema() prints an indented tree listing each column's name, type and
# nullability.
df_users.printSchema()
# .schema is the schema object itself; as the cell's last expression it is
# echoed as a StructType repr.
df_users.schema


root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True)])

In [20]:
# Four ways to refer to the "id" column.
from pyspark.sql.functions import col, expr

# 1) attribute access and 2) item access, both on the DataFrame itself.
df_users.id
df_users["id"]

# 3) col() and 4) expr(), built from the column name alone.
# Only the final expression of the cell is echoed by the notebook.
col("id")
expr("id")

Column<'id'>

In [24]:
# select(): pass the columns as arguments; each one here is referenced a different way.
df_users.select(
    col("id"),
    expr("name as user_name"),
    df_users.id,
    df_users["age"],
).show()

# selectExpr(): pass the columns as string expressions, so there is no need
# to wrap each one in its own expr() call.
df_users.selectExpr(
    "id",
    "cast(age as int) as user_age",
    "name as user_name",
).show()


+---+---------+---+---+
| id|user_name| id|age|
+---+---------+---+---+
|  1|      Gav|  1| 35|
|  2|      Zac|  2| 34|
|  3|       Hr|  3| 28|
|  4|      Mar|  4| 26|
+---+---------+---+---+

+---+--------+---------+
| id|user_age|user_name|
+---+--------+---------+
|  1|      35|      Gav|
|  2|      34|      Zac|
|  3|      28|       Hr|
|  4|      26|      Mar|
+---+--------+---------+



In [26]:
# Keep only the rows with age > 30, written three ways; the recorded outputs
# show the same two rows for each.

# Condition as a string, via where() and via filter().
df_users.where("age > 30").show()
df_users.filter("age > 30").show()

# Condition as a Column comparison built with col().
df_users.where(col("age") > 30).show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1| Gav| 35|
|  2| Zac| 34|
+---+----+---+

+---+----+---+
| id|name|age|
+---+----+---+
|  1| Gav| 35|
|  2| Zac| 34|
+---+----+---+

+---+----+---+
| id|name|age|
+---+----+---+
|  1| Gav| 35|
|  2| Zac| 34|
+---+----+---+

