In [1]:
# Bootstrap step that has to run before the pyspark imports in the next cell:
# per the findspark docs, init() locates the local Spark installation and makes
# the pyspark package importable from this kernel.
import findspark
findspark.init()

In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import Row

In [3]:
# Entry point for the DataFrame API: reuse the active session if one exists,
# otherwise start a new one with the application name "basics2".
spark = (
    SparkSession.builder
    .appName("basics2")
    .getOrCreate()
)

In [4]:
# Sanity check: print the session object itself, then its type.
for session_info in (spark, type(spark)):
    print(session_info)

<pyspark.sql.session.SparkSession object at 0x1072a22e8>
<class 'pyspark.sql.session.SparkSession'>


In [5]:
# Location of the sample data. The default is the original author's local copy;
# set the PEOPLE_JSON_PATH environment variable to point at people.json somewhere
# else without having to edit the notebook. An unset or empty variable falls back
# to the default.
PEOPLE_JSON_PATH = os.environ.get('PEOPLE_JSON_PATH') or (
    'file:///Users/hdagar3/Documents/Spark_Things/Spark_Course_Files_JosePortilla/Spark_DataFrames/people.json'
)

# Load the JSON records into a DataFrame; no schema is passed, so the reader infers it.
df = spark.read.json(PEOPLE_JSON_PATH)

In [6]:
# Printing a DataFrame only shows its schema and type;
# show() is what actually renders the rows as a table.
print(df, type(df), sep='\n')
df.show()

DataFrame[age: bigint, name: string]
<class 'pyspark.sql.dataframe.DataFrame'>
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [7]:
# Indexing a DataFrame with a column name yields a Column object, not the column's data.
age_column = df['age']
print(age_column)
print(type(age_column))

Column<b'age'>
<class 'pyspark.sql.column.Column'>


In [8]:
# select() hands back a DataFrame, even when only a single column is requested.
age_only_df = df.select('age')
print(age_only_df, type(age_only_df), sep='\n')

DataFrame[age: bigint]
<class 'pyspark.sql.dataframe.DataFrame'>


In [9]:
# select() accepts either a single column name or a list of names to grab
# several columns at once; in both cases the result is a new DataFrame.
selected_dataframe1 = df.select('age')
selected_dataframe2 = df.select(['age', 'name'])

# Display the one-column result first, then the two-column result.
selected_dataframe1.show()
selected_dataframe2.show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [10]:
# head(n) is an action: it executes a Spark job and returns a plain Python list
# of Row objects (take(n) could be used the same way). Fetch the rows once and
# reuse the result instead of re-running the job for every print.
first_rows = df.head(2)
print(first_rows)

first_row = first_rows[0]  # first Row object in the list
print(first_row)
print(type(first_row))

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]
Row(age=None, name='Michael')
<class 'pyspark.sql.types.Row'>


In [11]:
# withColumn() adds a column and returns a NEW DataFrame; it does not work in
# place, so the result has to be captured in its own variable.
age_copy = df['age']  # a Column object, used as the values of the new column
new_dataframe = df.withColumn('new_age', age_copy)

# Show the source frame first, then the frame with the extra column.
df.show()
new_dataframe.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+-------+
| age|   name|new_age|
+----+-------+-------+
|null|Michael|   null|
|  30|   Andy|     30|
|  19| Justin|     19|
+----+-------+-------+



In [12]:
# Column objects support arithmetic (df['age'] ** 2, df['age'] / 2, and so on),
# so a derived column can be built from any expression over a numeric column.
doubled_age = df['age'] * 2
another_dataframe = df.withColumn('double_age', doubled_age)
another_dataframe.show()

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



In [13]:
# Re-display the source frame to confirm the earlier withColumn() calls left it untouched.
df.show() # unchanged: none of the operations above worked in place; each returned a new DataFrame

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [14]:
# Renaming works the same way as adding a column: the result is a new DataFrame
# in which 'age' appears under the name 'my_new_age'.
modified_df = df.withColumnRenamed(
    'age',         # existing column name
    'my_new_age',  # replacement name
)
modified_df.show()

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



In [15]:
# Register the DataFrame as a temporary view named 'people' so that it can be
# queried with SQL through spark.sql() in the cells below.
df.createOrReplaceTempView('people')

In [16]:
# Run a SQL query against the 'people' temp view registered earlier.
df_sql = spark.sql("select * from people") # the query result is itself a DataFrame

In [17]:
# Display the rows returned by the SQL query.
df_sql.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [18]:
# Filtering in SQL: the WHERE clause keeps only the rows whose age equals 30.
age_30_df = spark.sql("select * from people where age=30")
age_30_df.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [19]:
# END