In [1]:
# findspark locates the local Spark installation and adds pyspark to sys.path,
# so init() has to run before the first `from pyspark ... import` in the notebook.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession,Row

In [3]:
# Entry point for the DataFrame API: getOrCreate() hands back the session that
# is already active, or starts a new one under the given application name.
spark = (
    SparkSession.builder
    .appName("dataframe_basics")
    .getOrCreate()
)

In [4]:
# Sanity check: show the session object and its class, one per line.
print(spark, type(spark), sep="\n")

<pyspark.sql.session.SparkSession object at 0x1084ec128>
<class 'pyspark.sql.session.SparkSession'>


In [10]:
# Location of the Apple stock CSV on the local file system.
APPL_STOCK_CSV = 'file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files_JosePortilla/Spark_DataFrames/appl_stock.csv'

# header=True      -> take the column names from the first line of the file;
#                     without it Spark treats that line as data and names the
#                     columns _c0, _c1, ... instead.
# inferSchema=True -> let Spark derive each column's type from the data
#                     instead of reading every column as a string.
df = spark.read.csv(
    APPL_STOCK_CSV,
    inferSchema=True,
    header=True,
)

In [11]:
# Render the first rows as a text table (show() prints at most 20 rows by default).
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [14]:
# Print every column with its inferred type and nullability.
# The columns describe daily stock price information.
df.printSchema() 

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [16]:
# head(n) returns a plain Python list holding the first n Row objects.
first_three_rows = df.head(3)
print(first_three_rows)

[Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039), Row(Date=datetime.datetime(2010, 1, 5, 0, 0), Open=214.599998, High=215.589994, Low=213.249994, Close=214.379993, Volume=150476200, Adj Close=27.774976000000002), Row(Date=datetime.datetime(2010, 1, 6, 0, 0), Open=214.379993, High=215.23, Low=210.750004, Close=210.969995, Volume=138040000, Adj Close=27.333178000000004)]


In [17]:
# Index into the list returned by head() to get the first Row object.
first_row = df.head(3)[0]
print(first_row)

Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)


In [18]:
# Spark DataFrames are built on top of Spark SQL.
# Next, we look at the filter operation on a DataFrame.

In [29]:
# filter() accepts a SQL expression string (the Spark SQL way of writing it);
# the Column-based form is usually the preferred style in Python code.
filtered_dataframe = df.filter("Close < 500")
print(filtered_dataframe)

# select() returns a new DataFrame that contains only the requested columns.
close_and_open = filtered_dataframe.select(['Close','Open'])
print(close_and_open)

# show() prints the rows and returns None.
close_and_open.show()

DataFrame[Date: timestamp, Open: double, High: double, Low: double, Close: double, Volume: int, Adj Close: double]
DataFrame[Close: double, Open: double]
+------------------+------------------+
|             Close|              Open|
+------------------+------------------+
|        214.009998|        213.429998|
|        214.379993|        214.599998|
|        210.969995|        214.379993|
|            210.58|            211.75|
|211.98000499999998|        210.299994|
|210.11000299999998|212.79999700000002|
|        207.720001|209.18999499999998|
|        210.650002|        207.870005|
|            209.43|210.11000299999998|
|            205.93|210.92999500000002|
|        215.039995|        208.330002|
|            211.73|        214.910006|
|        208.069996|        212.079994|
|            197.75|206.78000600000001|
|        203.070002|202.51000200000001|
|        205.940001|205.95000100000001|
|        207.880005|        206.849995|
|        199.289995|        204.930004|
|     

In [36]:
# Recommended, more Pythonic way of expressing the same filter: build the
# condition from Column objects instead of a SQL string.
close_column = df['Close']
print(close_column)
print(type(close_column))

# Comparing a Column with a value yields a Column condition; filter() turns
# that condition into a new DataFrame.
another_df = df.filter(close_column < 500)
print(type(another_df))

# select() on the filtered DataFrame is again a DataFrame.
result_df = another_df.select('Volume')
print(type(result_df))
result_df.show()

# The steps are split up only for explanation; in practice chain them:
#   df.filter(df['Close'] < 500).select('Volume').show()

Column<b'Close'>
<class 'pyspark.sql.column.Column'>
<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pyspark.sql.dataframe.DataFrame'>
+---------+
|   Volume|
+---------+
|123432400|
|150476200|
|138040000|
|119282800|
|111902700|
|115557400|
|148614900|
|151473000|
|108223500|
|148516900|
|182501900|
|153038200|
|152038600|
|220441900|
|266424900|
|466777500|
|430642100|
|293375600|
|311488100|
|187469100|
+---------+
only showing top 20 rows



In [40]:
# Column conditions are combined with & (AND), | (OR) and ~ (NOT).
# Each comparison must be wrapped in parentheses when written inline, because
# Python's bitwise operators bind more tightly than < and >.
closed_below_200 = df['Close'] < 200
opened_above_200 = df['Open'] > 200
df.filter(closed_below_200 & opened_above_200).show()

+-------------------+------------------+----------+----------+----------+---------+------------------+
|               Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+-------------------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22 00:00:00|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28 00:00:00|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
|2010-01-29 00:00:00|        201.079996|202.199995|190.250002|192.060003|311488100|         24.883208|
+-------------------+------------------+----------+----------+----------+---------+------------------+



In [43]:
# Keep only the rows whose Low price equals 197.16; show() merely prints them.
low_matches = df.filter(df['Low'] == 197.16)
low_matches.show()

+-------------------+------------------+----------+------+------+---------+---------+
|               Date|              Open|      High|   Low| Close|   Volume|Adj Close|
+-------------------+------------------+----------+------+------+---------+---------+
|2010-01-22 00:00:00|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|
+-------------------+------------------+----------+------+------+---------+---------+



In [44]:
# collect() returns the matching rows as a Python list of Row objects, so they
# can be processed further with ordinary Python code afterwards.
low_is_target = df['Low'] == 197.16
list_row_objects = df.filter(low_is_target).collect()

In [46]:
# Inspect what collect() returned: a plain Python list of Row objects.
print(list_row_objects)

[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401)]


In [51]:
# we can apply lot of functions on a Row object
# A DataFrame is conceptually a collection of Row objects, and a Row offers
# several helpers of its own once it has been collected.
row_object = list_row_objects[0]

# asDict() converts the Row into a regular dict keyed by column name.
dictionary = row_object.asDict()

print(type(row_object))
print(dictionary, dictionary['Volume'], sep='\n')

<class 'pyspark.sql.types.Row'>
{'Date': datetime.datetime(2010, 1, 22, 0, 0), 'Open': 206.78000600000001, 'High': 207.499996, 'Low': 197.16, 'Close': 197.75, 'Volume': 220441900, 'Adj Close': 25.620401}
220441900


In [None]:
# END