In [1]:
import findspark
# Locate the local Spark installation and make the pyspark package importable.
# This runs before any `pyspark` import in the notebook.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
# SparkSession is the entry point used throughout this notebook: it reads files (JSON here; CSV
# and other formats work the same way) straight into DataFrames, runs SQL queries against
# registered temp views, drops those views again, and exposes the SparkContext for RDD work.

from pyspark.sql import Row
# Row is the record type of a DataFrame. It is also what you use to give structure to
# unstructured data, e.g. mapping the elements of an RDD to Row objects before converting
# that RDD to a DataFrame.

from pyspark.sql.types import StructField,IntegerType,StringType,StructType
# Building blocks for an explicit schema: a StructType holds a list of StructFields, and each
# StructField names a column, gives its data type and says whether it is nullable.

In [3]:
# Get the already-running SparkSession if there is one, otherwise start a new one
# whose application name is "basics".
spark = (
    SparkSession.builder
    .appName("basics")
    .getOrCreate()
)

In [4]:
# Show the session object's default text representation (class path and memory address).
print(spark)

<pyspark.sql.session.SparkSession object at 0x10dbea208>


In [5]:
# Confirm the concrete class of the session object.
print(type(spark))

<class 'pyspark.sql.session.SparkSession'>


In [6]:
# Load the JSON file into a DataFrame. A DataFrame can be thought of as a collection of
# Row objects, one Row per record.
# NOTE(review): the path is absolute and machine-specific -- point it at your own copy of
# people.json before running.
df = spark.read.json(
    'file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files_JosePortilla/Spark_DataFrames/people.json'
)

In [7]:
# Fetch the SparkContext that belongs to this SparkSession.
sc = spark.sparkContext

In [8]:
# Show the SparkContext (master and app name) followed by its class, one per line.
# This context object is the handle to use when working with RDDs.
print(sc, type(sc), sep='\n')

<SparkContext master=local[*] appName=basics>
<class 'pyspark.context.SparkContext'>


In [9]:
# Printing a DataFrame shows only its column names and types, not its rows;
# the second line of output is the object's class.
print(df, type(df), sep='\n')

DataFrame[age: bigint, name: string]
<class 'pyspark.sql.dataframe.DataFrame'>


In [10]:
# dtypes is a plain list of (column name, type name) tuples.
print(df.dtypes)

[('age', 'bigint'), ('name', 'string')]


In [11]:
# describe() returns a new DataFrame holding summary statistics (count, mean, stddev, min, max)
# for the columns of `df`; every value in that summary is a string.
# Build the summary once and reuse it, instead of calling describe() a second time just to print it.
stat_df = df.describe()
print(stat_df)            # column names/types of the summary DataFrame
print(stat_df.collect())  # the summary rows as a Python list of Row objects
stat_df.show()            # prints the summary as a text table; show() itself returns None

DataFrame[summary: string, age: string, name: string]
[Row(summary='count', age='2', name='3'), Row(summary='mean', age='24.5', name=None), Row(summary='stddev', age='7.7781745930520225', name=None), Row(summary='min', age='19', name='Andy'), Row(summary='max', age='30', name='Michael')]
+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [12]:
# head(n) and take(n) both fetch the first n rows. Fetch each result once and reuse it,
# rather than asking Spark for the same rows again just to inspect their type.
first_two_via_head = df.head(2)
first_two_via_take = df.take(2)

print(first_two_via_head)
print(first_two_via_take)
print(type(first_two_via_head))
print(type(first_two_via_take))
# Both calls give the same thing: a plain Python list of Row objects.

# Think of each Row object as one element (record) of the DataFrame.

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]
[Row(age=None, name='Michael'), Row(age=30, name='Andy')]
<class 'list'>
<class 'list'>


In [13]:
# printSchema() writes the schema tree (column name, data type, nullable or not) to stdout
# itself and returns None. Wrapping it in print() would only add a stray "None" line to the
# output, so it is called directly.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

None


In [14]:
print(df.schema)
# df.schema is the StructType that describes the columns: one StructField per column, carrying
# the column name, its data type and whether nulls are allowed.

# The types Spark picks when reading a source are not always the ones you want: here `age` was
# inferred as a long, and some readers (e.g. CSV without schema inference) return every column
# as a string. To control the types, build an explicit schema from StructType / StructField and
# the type classes (IntegerType, StringType, ...) and pass it to the reader.

StructType(List(StructField(age,LongType,true),StructField(name,StringType,true)))


In [30]:
# Explicit schema for people.json: `age` as an integer and `name` as a string,
# with nulls permitted in both columns.
new_data_schema = [
    StructField('age', IntegerType(), nullable=True),
    StructField('name', StringType(), nullable=True),
]

final_struc = StructType(new_data_schema)

In [32]:
# Read the same JSON file again, this time forcing the column types through `final_struc`
# instead of letting Spark infer them.
# NOTE(review): the path is absolute and machine-specific -- adjust it for your environment.
customized_schema_df = spark.read.json(
    'file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files_JosePortilla/Spark_DataFrames/people.json',
    schema=final_struc,
)

In [34]:
# Verify that the explicit schema was applied: `age` is reported as IntegerType rather than
# the LongType Spark inferred on its own, while the rows shown are the same three records.
print(customized_schema_df.schema)
customized_schema_df.show()

StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [16]:
print(df.collect()) # collect() returns all rows of the DataFrame as a Python list of Row objects
print(df.count()) # like RDD.count(): the number of rows (Row objects) in the DataFrame

# Key idea: a DataFrame is a collection of Row objects.
# In the Scala/Java API (Spark 2.0+) DataFrame is an alias for Dataset[Row]. PySpark has no
# separate Dataset API, so in Python the type -- and the term -- remains DataFrame.

[Row(age=None, name='Michael'), Row(age=30, name='Andy'), Row(age=19, name='Justin')]
3


In [17]:
df.show() # prints the DataFrame as a text table; the call itself returns None
# Missing values are displayed as null (Michael's age here); handling them is a separate topic.

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [18]:
print(df.columns) # the column names of the DataFrame, as a Python list of strings

['age', 'name']


In [19]:
pandas_dataframe = df.toPandas()
# Convert the Spark DataFrame into a pandas DataFrame.
# toPandas() loads all rows into the driver's memory, so use it only when the data is small.

In [20]:
# In the pandas version the missing age shows up as NaN and the age column is floating point.
print(pandas_dataframe)

    age     name
0   NaN  Michael
1  30.0     Andy
2  19.0   Justin


In [21]:
# Confirm the converted object is a pandas DataFrame, not a Spark one.
print(type(pandas_dataframe))

<class 'pandas.core.frame.DataFrame'>


In [22]:
df.createOrReplaceTempView('people')
# Register the DataFrame as a temporary view named 'people' (replacing any existing view of
# that name) so it can be queried with plain SQL -- convenient for users who already know SQL.
# The view is tied to this SparkSession; it is a name for the DataFrame, not a copy of its data.

In [23]:
df2 = spark.sql('select * from people')
# spark.sql() runs the query against the registered view and returns the result as a new DataFrame.

In [24]:
# The SQL result is an ordinary DataFrame: show its column summary, then its class.
print(df2, type(df2), sep='\n')

DataFrame[age: bigint, name: string]
<class 'pyspark.sql.dataframe.DataFrame'>


In [26]:
# printSchema() prints the schema tree itself and returns None; calling it directly avoids
# the stray "None" line that print(df2.printSchema()) would add to the output.
df2.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

None


In [29]:
# Display the rows returned by the SQL query as a text table.
df2.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [28]:
# Remove the temporary view 'people' from the session catalog once it is no longer needed.
spark.catalog.dropTempView("people")

In [None]:
# END