In [3]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start one
# registered under the application name "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

# Load the JSON file into a DataFrame. No schema is supplied here, so the
# column types are inferred from the data.
df = spark.read.json("people.json")

# Print the rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# Column names as a plain Python list of strings.
df.columns

['age', 'name']

In [9]:
# describe() returns a new DataFrame of summary statistics; build it once and
# reuse it for both displays below.
summary_df = df.describe()

# Printing the DataFrame object shows only its column names and types.
print(summary_df)

# show() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so emits a stray "None" line after the table.
summary_df.show()

DataFrame[summary: string, age: string, name: string]
+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+

None


In [36]:
# Define the schema by hand instead of relying on inference
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# StructField(name, dataType, nullable): the third argument states whether the
# column is allowed to hold nulls.
data_schema = [
    StructField("age", IntegerType(), True),
    StructField("name", StringType(), True),
]

final_struct = StructType(data_schema)

# Re-read the same file, this time applying the explicit schema, so "age" is
# read as an integer column.
df = spark.read.schema(final_struct).json("people.json")

df.show()
df.printSchema()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [40]:
# Indexing with a column name returns a single Column object, not a DataFrame
df["age"]

Column<b'age'>

In [45]:
# select() returns a DataFrame restricted to the requested column(s)
age_only = df.select("age")

print(age_only)   # the object itself: just column names and types
age_only.show()   # the rows, rendered as a table

DataFrame[age: int]
+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [46]:
df.head(2) # first two rows, returned as a list of Row objects

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [47]:
# Derive a new column from an existing one. withColumn() returns a new
# DataFrame; the result is only displayed here and is not assigned back to df.
newage_expr = df["age"] + 10
df.withColumn("newage", newage_expr).show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    40|
|  19| Justin|    29|
+----+-------+------+



In [49]:
# Chain two transformations: add the derived "newage" column, then rename the
# original "age" column to "age_newname".
(
    df.withColumn("newage", df["age"] + 10)
      .withColumnRenamed("age", "age_newname")
      .show()
)

+-----------+-------+------+
|age_newname|   name|newage|
+-----------+-------+------+
|       null|Michael|  null|
|         30|   Andy|    40|
|         19| Justin|    29|
+-----------+-------+------+



In [51]:
# Querying the DataFrame with SQL
#   step 1: expose the DataFrame under a name that SQL statements can refer to
df.createOrReplaceTempView("people")

#   step 2: run the statement and display what comes back
all_people_query = "SELECT * FROM people"
results = spark.sql(all_people_query)
results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [52]:
# Keep only people older than 20. The row with a null age is dropped as well,
# because the comparison is not true for it.
results2 = spark.sql(
    "SELECT * FROM people where age > 20"
)
results2.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+

