<h1> Basic DataFrame operations in PySpark </h1>

<h4> Install the PySpark library and import it </h4>

In [72]:
pip install pyspark



In [73]:
import pyspark

In [74]:
from pyspark.sql import SparkSession

<h4> Create a Spark application </h4>

In [75]:
# Build a session for an application named 'Basics', or reuse one that is
# already running.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [76]:
# Display the session object as this cell's output.
spark

In [77]:
# Master URL of the session's underlying SparkContext
# ('local[*]' in the recorded output below).
spark.sparkContext.master

'local[*]'

<h4> Loading a JSON file and DataFrame operations </h4>

In [78]:
# Read people.json (path relative to the working directory) into a DataFrame.
# No schema is passed here, so the column types are left for Spark to infer.
df = spark.read.json('people.json')

In [79]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [80]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [81]:
# Column names as a plain Python list.
df.columns

['age', 'name']

In [82]:
# Summary statistics per column (count, mean, stddev, min, max).
# In the recorded output the count for age is 2 while name is 3, because one
# row has a null age.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



<h4> Using a user-defined schema </h4>

In [83]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType

In [84]:
# Explicit schema for people.json: age as a 32-bit integer, name as a string.
# Both fields are declared nullable: the file contains a record with no age,
# and the schema printed after reading reports nullable = true for both
# columns, so declaring them non-nullable would only be misleading.
schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("name", StringType(), True),
])

In [85]:
# Read the same file again, this time applying the explicit schema instead of
# leaving the column types to be inferred.
df1 = spark.read.json('people.json',schema=schema)

In [86]:
# With the explicit schema, age is reported as integer rather than long.
df1.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [87]:
# The rows are the same as in df; only the declared column type differs.
df1.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [88]:
# Indexing a DataFrame by column name yields a Column object, not the values.
type(df['age'])

pyspark.sql.column.Column

In [89]:
# select() returns a new DataFrame containing only the requested column; the
# cell output is that DataFrame's repr, not its rows.
df.select('age')

DataFrame[age: bigint]

In [90]:
# Show df with an extra 'doubleAge' column holding twice the age; a null age
# yields a null result. df itself is not reassigned.
(
    df
    .withColumn('doubleAge', df['age'] * 2)
    .show()
)

+----+-------+---------+
| age|   name|doubleAge|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       60|
|  19| Justin|       38|
+----+-------+---------+



In [91]:
# Show df with the 'age' column renamed to 'my_new_age'; df itself is not
# reassigned, so later cells still see the original column name.
(
    df
    .withColumnRenamed('age', 'my_new_age')
    .show()
)

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



<h4> Using Spark SQL </h4>

In [92]:
# Register df as a temporary view named 'people' so that SQL queries can
# refer to it by that name.
df.createOrReplaceTempView('people')

In [93]:
# Query the 'people' view: every column plus the length of the name, keeping
# only rows with age > 0, sorted by name in descending order. The row with a
# null age does not pass the filter and is absent from the result.
results = spark.sql(
    "select *,length(name) from people where age>0 order by name desc"
)
results.show()

+---+------+------------+
|age|  name|length(name)|
+---+------+------------+
| 19|Justin|           6|
| 30|  Andy|           4|
+---+------+------------+

