In [1]:
import os
import findspark
# Hand findspark the Spark install location from the SPARK_HOME environment
# variable (os.getenv yields None when the variable is unset).
# NOTE(review): presumably findspark.init() is what makes `pyspark` importable
# in this kernel, which is why the pyspark import sits below it — confirm
# before regrouping the imports.
findspark.init(os.getenv('SPARK_HOME'))
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook-wide session: reuse an existing one or create a new one
# registered under the application name "Basics".
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/08/10 19:00:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/08/10 19:00:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/08/10 19:00:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/08/10 19:00:43 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/08/10 19:00:43 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [3]:
# Read the JSON records without supplying a schema (Spark infers the column
# types), then print the rows as a table.
df = spark.read.json(path='people.json')
df.show()

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [4]:
# Print each column's name, type and nullability for the frame loaded above.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [5]:
# Column names as a plain Python list.
df.columns

['age', 'name']

In [6]:
# Per-column summary statistics (count, mean, stddev, min, max), printed as a table.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [7]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [8]:
# Explicit column definitions: (column name, Spark type, nullable?).
data_schema = [
    StructField('age', IntegerType(), True),
    StructField('name', StringType(), True),
]

In [9]:
# Wrap the field list in a StructType so it can be passed to the reader as a schema.
final_struc = StructType(data_schema)

In [10]:
# Re-read the same file, this time imposing the hand-written schema instead of
# letting Spark infer the column types. This rebinds `df`.
df = spark.read.schema(final_struc).json('people.json')

In [11]:
# Print the schema of the re-loaded frame to check the explicit types took effect.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [12]:
# Indexing by name gives a Column object, while select() gives a DataFrame;
# print both reprs, one per line, to show the difference.
print(df['age'], df.select('age'), sep='\n')

Column<'age'>
DataFrame[age: int]


In [13]:
# Project only the 'age' column and print it as a table.
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [14]:
# Fetch the first two rows as a list of Row objects (no table is printed).
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [15]:
# Project several columns at once and print them as a table.
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [16]:
# Derive a 'newage' column as age + 2 and rebind `df` to the widened frame.
df = df.withColumn('newage', df['age'] + 2)
df.show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    32|
|  19| Justin|    21|
+----+-------+------+



In [17]:
# Preview the frame with 'newage' renamed to 'age_plus_2'. The result is only
# displayed, not assigned, so `df` itself keeps the 'newage' column.
df.withColumnRenamed('newage', 'age_plus_2').show()

+----+-------+----------+
| age|   name|age_plus_2|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        32|
|  19| Justin|        21|
+----+-------+----------+



In [18]:
# Register the frame as a temporary view named 'people' so that spark.sql()
# queries can refer to it by that name.
df.createOrReplaceTempView('people')

In [19]:
# Query the 'people' view with SQL; the result is a DataFrame, shown here in full.
results = spark.sql('SELECT * FROM people')
results.show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    32|
|  19| Justin|    21|
+----+-------+------+



In [20]:
# Same view, filtered in SQL to the rows whose age equals 30.
new_results = spark.sql('SELECT * FROM people WHERE age=30')
new_results.show()

+---+----+------+
|age|name|newage|
+---+----+------+
| 30|Andy|    32|
+---+----+------+

