https://www.datacamp.com/cheat-sheet/pyspark-cheat-sheet-spark-dataframes-in-python

In [1]:
# Spark SGlL is Apache Spark's module for working with structured data.

# Initializing SparkSession 
# A SparkSession can be used create DataFrame, register DataFrame as tables, 
# execute SGL over tables, cache tables, and read parquet files.

In [2]:
from pyspark.sql import SparkSession

# spark = SparkSession.builder.appName("Python Spark SQL basic example") \
#             .config("spark.some.config.option", "some-value").getOrCreate()

In [2]:
# Creating DataFrames
# Fromm RDDs

from pyspark.sql.types import *
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('people').getOrCreate()

# Read text file and split lines
lines = spark.sparkContext.textFile('people.txt')
parts = lines.map(lambda l: l.split(','))

# Create Rows from each line
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Create DataFrame from Rows
peopledf = spark.createDataFrame(people)

In [3]:
spark.sql("SELECT * FROM customer_partitioned_table").show(2)

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `customer_partitioned_table` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [customer_partitioned_table], [], false


In [7]:
# Specify Schema

# Define the schema string and split it into separate field names
schemaString = "name age"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]

# Define the schema using the StructType and the list of StructFields
schema = StructType(fields)

# Map the parts to Row objects with the specified field names and types
people = parts.map(lambda p: Row(name=p[0],age=int(p[1].strip())))

# Create a DataFrame using the schema and the mapped data
spark.createDataFrame(people, schema).show(5,False)

+----+---+
|name|age|
+----+---+
|mlsb|38 |
|zlz |42 |
|zuni|44 |
|fky |33 |
|arwh|49 |
+----+---+
only showing top 5 rows



In [127]:
# From Spark Data Sources
# JSON

# df = spark.read.json("customer.json")
df = spark.read.csv("customer.csv",header=True)
# df.show(4,False)
# df2 = spark.read.load("people.json", format="json")
# df3 = spark.read.load("users.parquet")
df4 = spark.read.text("people.txt")


In [128]:
# filter
#Filter entries of age, only keep those records of which the values are >24>>> 
df.filter(df["age"]>24).show(5,False)
df.count()

+---------+--------+---+---------------+----------------------------+
|firstName|lastName|age|address        |phoneNumber                 |
+---------+--------+---+---------------+----------------------------+
|kglqe    |mmkuld  |38 |[QRG ,42984]   |[887 446-4575, 547 846-9558]|
|wqh      |tlv     |39 |[IMP ,64889]   |[949 994-4587, 787 788-5595]|
|ioy      |tdzz    |52 |[JMEOWS ,58728]|[444 465-9787, 794 775-7689]|
|vousa    |jepea   |26 |[OVCDA ,34448] |[947 899-4447, 796 847-9778]|
|ncx      |uzzki   |29 |[EEXPQ ,74927] |[766 864-9548, 484 766-4897]|
+---------+--------+---+---------------+----------------------------+
only showing top 5 rows



10000

In [129]:
# Duplicate Values 

df = df.dropDuplicates()
df.count()

10000

In [130]:
# Queries 

from pyspark.sql import functions as F

In [131]:
# select
# Show all entries in firstName column
df.select("firstName").show(1)

# Show all entries in firstName and lastName columns
df.select("firstName", "lastName").show(1)

# Show all entries in firstName, age and type
# df.select("firstName",
#           "age",
#           F.explode("phoneNumber").alias("contactInfo")).select("contactInfo.type",
#                                                               "firstName",
#                                                               "age").show()
df.select("firstName",
          "age",
          F.explode(F.split("phoneNumber", ",")).alias("contactInfo")
         ).select("contactInfo",
                   "firstName",
                   "age").show(1)
# Show all entries in firstName and age, add 1 to entries of age
df.select(df["firstName"], (df["age"]+1).alias("age")).show(1)

# Show all entries where age > 24
df.select(df['age'] > 24).show(1,False)

+---------+
|firstName|
+---------+
|     vijh|
+---------+
only showing top 1 row

+---------+--------+
|firstName|lastName|
+---------+--------+
|     vijh|     mfp|
+---------+--------+
only showing top 1 row

+-------------+---------+---+
|  contactInfo|firstName|age|
+-------------+---------+---+
|[748 668-4596|     vijh| 24|
+-------------+---------+---+
only showing top 1 row

+---------+----+
|firstName| age|
+---------+----+
|     vijh|25.0|
+---------+----+
only showing top 1 row

+----------+
|(age > 24)|
+----------+
|false     |
+----------+
only showing top 1 row



In [132]:
# when
# Select firstName and 0 or 1 depending on age > 30
df.select("firstName", F.when(df.age > 30, 1).otherwise(0)).show(5)

# Show firstName if in the given options
df[df.firstName.isin("rgy", "Boris")].collect()

+---------+--------------------------------------+
|firstName|CASE WHEN (age > 30) THEN 1 ELSE 0 END|
+---------+--------------------------------------+
|     vijh|                                     0|
|      rgy|                                     1|
|      kde|                                     0|
|     uzui|                                     0|
|      qow|                                     1|
+---------+--------------------------------------+
only showing top 5 rows



[Row(firstName='rgy', lastName='jtyxsl', age='50', address='[YGIB ,54852]', phoneNumber='[789 988-4454, 666 558-5677]')]

In [133]:
# like

df.withColumn("lastName", df.lastName.like("mfp")).select("firstName", "lastName").show(3)
df.select("firstName", #Show firstName, and lastName is TRUE if lastName is like Smith              
              df.lastName.like("Smith")) \
                  .show(3)


+---------+--------+
|firstName|lastName|
+---------+--------+
|     vijh|    true|
|      rgy|   false|
|      kde|   false|
+---------+--------+
only showing top 3 rows

+---------+-------------------+
|firstName|lastName LIKE Smith|
+---------+-------------------+
|     vijh|              false|
|      rgy|              false|
|      kde|              false|
+---------+-------------------+
only showing top 3 rows



In [134]:
# Startswith - Endswith 
# Select firstName and a boolean value indicating whether lastName starts with 'Sm'
df.select("firstName", F.col("lastName").startswith("vi").alias("lastNameStartsWithSm")).show(4)

# Select last names ending in 'th'
df.select(F.col("lastName").endswith("fp").alias("lastNameEndsWithTh")).show(4)

+---------+--------------------+
|firstName|lastNameStartsWithSm|
+---------+--------------------+
|     vijh|               false|
|      rgy|               false|
|      kde|               false|
|     uzui|               false|
+---------+--------------------+
only showing top 4 rows

+------------------+
|lastNameEndsWithTh|
+------------------+
|              true|
|             false|
|             false|
|             false|
+------------------+
only showing top 4 rows



In [135]:
# substring
# Select the substring of firstName column and alias it as "name"
df.select(F.substring(df.firstName, 1, 3).alias("name")).collect()[:2]

[Row(name='vij'), Row(name='rgy')]

In [136]:
# between
df.select(df.age.between(22, 24)) \
    .show(2) # Show age: values are TRUE if between 22 and 24          
df.filter(df.age.between(22,24)).select("*").show(2) #Show the whole row for age between 22 and 24
df.filter(df.age.between(22,24)).show(2) #Show the whole row for age between 22 and 24

+-----------------------------+
|((age >= 22) AND (age <= 24))|
+-----------------------------+
|                         true|
|                        false|
+-----------------------------+
only showing top 2 rows

+---------+--------+---+--------------+--------------------+
|firstName|lastName|age|       address|         phoneNumber|
+---------+--------+---+--------------+--------------------+
|     vijh|     mfp| 24|[IPWIA ,78644]|[748 668-4596, 46...|
|    vnjak|     exi| 24| [XBRJ ,62764]|[746 877-9994, 85...|
+---------+--------+---+--------------+--------------------+
only showing top 2 rows

+---------+--------+---+--------------+--------------------+
|firstName|lastName|age|       address|         phoneNumber|
+---------+--------+---+--------------+--------------------+
|     vijh|     mfp| 24|[IPWIA ,78644]|[748 668-4596, 46...|
|    vnjak|     exi| 24| [XBRJ ,62764]|[746 877-9994, 85...|
+---------+--------+---+--------------+--------------------+
only showing top 2 rows



In [137]:
# Add, Update & Remove Columns 
# Adding Columns

# df = df.withColumn("city", df.address.city)\
#        .withColumn("postalCode", df.address.postalCode)\
#        .withColumn("state", df.address.state)\
#        .withColumn("streetAddress", df.address.streetAddress)\
#        .withColumn("telePhoneNumber", explode(df.phoneNumber.number))\
#        .withColumn("telePhoneType", explode(df.phoneNumber.type))
# Exploding Arrays
# df = df.withColumn("telePhoneNumber_ex", F.explode(F.split(df.phoneNumber, ',')))
df = df.withColumn("phoneNumber1", F.regexp_replace(df.phoneNumber, r'[\[\]]', ''))
df = df.withColumn("address", F.regexp_replace(df.address, r'[\[\]]', ''))
df = df.withColumn("telePhoneNumber", (F.split(df.phoneNumber1,',')[0]))\
       .withColumn("telePhoneType", (F.split(df.phoneNumber1,',')[1]))\
       .withColumn("city", (F.split(df.address,',')[0]))\
       .withColumn("postalCode", (F.split(df.address,',')[1]))\
       .withColumn("state",  F.substring(F.split(df.address,',')[0], 0, 2))\
       .withColumn("streetAddress", (F.split(df.address,',')[1]))
df.show(5)

+---------+--------+---+------------+--------------------+--------------------+---------------+-------------+------+----------+-----+-------------+
|firstName|lastName|age|     address|         phoneNumber|        phoneNumber1|telePhoneNumber|telePhoneType|  city|postalCode|state|streetAddress|
+---------+--------+---+------------+--------------------+--------------------+---------------+-------------+------+----------+-----+-------------+
|     vijh|     mfp| 24|IPWIA ,78644|[748 668-4596, 46...|748 668-4596, 469...|   748 668-4596| 469 549-4496|IPWIA |     78644|   IP|        78644|
|      rgy|  jtyxsl| 50| YGIB ,54852|[789 988-4454, 66...|789 988-4454, 666...|   789 988-4454| 666 558-5677| YGIB |     54852|   YG|        54852|
|      kde|     hjc| 17| XMJH ,75137|[459 996-5597, 85...|459 996-5597, 857...|   459 996-5597| 857 767-8954| XMJH |     75137|   XM|        75137|
|     uzui|    qyjq| 25|KHHHM ,97616|[894 457-8954, 64...|894 457-8954, 644...|   894 457-8954| 644 596-5869|KHH

In [138]:
# removing
df = df.drop("phoneNumber1", "phoneNumber")
# df = df.drop(df.address).drop(df.phoneNumber) #not preferred

In [139]:
# Updating Columns
df = df.withColumnRenamed('telePhoneNumber', 'phoneNumber')

In [140]:
# Missing & Replacing Values
# Replace null values with 50
df.na.fill(50).show(2) 

# Return new dataframe omitting rows with null values
df.na.drop().show(2)

# Return new dataframe replacing one value with another
df.na.replace(10, 20).show(2)

+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|firstName|lastName|age|     address| phoneNumber|telePhoneType|  city|postalCode|state|streetAddress|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|     vijh|     mfp| 24|IPWIA ,78644|748 668-4596| 469 549-4496|IPWIA |     78644|   IP|        78644|
|      rgy|  jtyxsl| 50| YGIB ,54852|789 988-4454| 666 558-5677| YGIB |     54852|   YG|        54852|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
only showing top 2 rows

+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|firstName|lastName|age|     address| phoneNumber|telePhoneType|  city|postalCode|state|streetAddress|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|     vijh|     mfp| 24|IPWIA ,78644|748 668-459

In [141]:
# groupby
df.groupBy("age").count().alias("num_members").show(2)

+---+-----+
|age|count|
+---+-----+
| 51|  214|
| 54|  228|
+---+-----+
only showing top 2 rows



In [142]:
# Sort by age in descending order
peopledf.sort(peopledf.age.desc()).collect()[:4]

# Sort by age in descending order using "sort" method and specifying ascending value as False
df.sort("age", ascending=False).collect()[:4]

# Sort by age in descending order and then by city in ascending order using "orderBy" method and specifying a list of column names and corresponding ordering values
df.orderBy(["age","lastName"],ascending=[0,1]).collect()[:4]

[Row(firstName='hsc', lastName='aanvlh', age='54', address='WMVCN ,22623', phoneNumber='544 754-8944', telePhoneType=' 947 786-4875', city='WMVCN ', postalCode='22623', state='WM', streetAddress='22623'),
 Row(firstName='ssjp', lastName='ahidp', age='54', address='PLGLOK ,92166', phoneNumber='895 464-4988', telePhoneType=' 598 667-8499', city='PLGLOK ', postalCode='92166', state='PL', streetAddress='92166'),
 Row(firstName='njb', lastName='akitfb', age='54', address='FIUEO ,97358', phoneNumber='685 777-6689', telePhoneType=' 666 655-8888', city='FIUEO ', postalCode='97358', state='FI', streetAddress='97358'),
 Row(firstName='ljz', lastName='aldb', age='54', address='EDOJ ,44649', phoneNumber='677 896-4854', telePhoneType=' 767 585-4785', city='EDOJ ', postalCode='44649', state='ED', streetAddress='44649')]

In [143]:
# GroupBy:

df.groupBy("age").count().show(2, truncate=False)

# Sort:

peopledf.sort("age", ascending=False).show(2, truncate=False)

df.sort(["age", "city"], ascending=[0, 1]).show(2, truncate=False)

# Repartitioning:

df.repartition(10).rdd.getNumPartitions()

df.coalesce(1).rdd.getNumPartitions()

+---+-----+
|age|count|
+---+-----+
|51 |214  |
|54 |228  |
+---+-----+
only showing top 2 rows

+----+---+
|name|age|
+----+---+
|nkz |54 |
|nfv |54 |
+----+---+
only showing top 2 rows

+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|firstName|lastName|age|address     |phoneNumber |telePhoneType|city  |postalCode|state|streetAddress|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|txtqe    |mcu     |54 |ABWA ,11496 |658 684-7898| 997 655-7855|ABWA  |11496     |AB   |11496        |
|dyp      |fbtmnr  |54 |ACXQR ,37456|667 744-7794| 969 476-4774|ACXQR |37456     |AC   |37456        |
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
only showing top 2 rows



1

In [None]:
# Register DataFrames as Views
peopledf.createOrReplaceGlobalTempView("people")
# df.createTempView("customer")
df.createOrReplaceTempView("customer")

# Query Views
customer_df = spark.sql("SELECT * FROM customer")
people_df = spark.sql("SELECT * FROM global_temp.people")

In [156]:
# Save to table
customer_df.write.format("parquet").mode("overwrite").saveAsTable("customer_table")
people_df.write.format("parquet").mode("overwrite").saveAsTable("people_table")
new_df = spark.sql("SELECT * FROM customer_table").withColumn("date",F.current_date())
new_df.write.format("parquet").mode("overwrite").partitionBy("date").saveAsTable("customer_partitioned_table")
spark.sql("SELECT * FROM customer_partitioned_table").show(2)

+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+----------+
|firstName|lastName|age|     address| phoneNumber|telePhoneType|  city|postalCode|state|streetAddress|      date|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+----------+
|     vijh|     mfp| 24|IPWIA ,78644|748 668-4596| 469 549-4496|IPWIA |     78644|   IP|        78644|2023-10-30|
|      rgy|  jtyxsl| 50| YGIB ,54852|789 988-4454| 666 558-5677| YGIB |     54852|   YG|        54852|2023-10-30|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+----------+
only showing top 2 rows



In [148]:
# Inspect Data
print(customer_df.dtypes)                 # to get the data types of columns in the dataframe
customer_df.show(1)                        # to display the dataframe in a tabular format
print(customer_df.head())                 # to display the first n rows of dataframe (default value of n is 5)
print(customer_df.take(1))                # to take the first n rows of dataframe (default value of n is 1)
print(customer_df.schema)                 # to get the schema of the dataframe


# printing all the outputs together


[('firstName', 'string'), ('lastName', 'string'), ('age', 'string'), ('address', 'string'), ('phoneNumber', 'string'), ('telePhoneType', 'string'), ('city', 'string'), ('postalCode', 'string'), ('state', 'string'), ('streetAddress', 'string')]
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|firstName|lastName|age|     address| phoneNumber|telePhoneType|  city|postalCode|state|streetAddress|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
|     vijh|     mfp| 24|IPWIA ,78644|748 668-4596| 469 549-4496|IPWIA |     78644|   IP|        78644|
+---------+--------+---+------------+------------+-------------+------+----------+-----+-------------+
only showing top 1 row

Row(firstName='vijh', lastName='mfp', age='24', address='IPWIA ,78644', phoneNumber='748 668-4596', telePhoneType=' 469 549-4496', city='IPWIA ', postalCode='78644', state='IP', streetAddress='78644')
[Row(firstName='

In [152]:
customer_df.describe().show()             # to get the summary statistics of the dataframe
print(customer_df.columns)                # to get the column names of the dataframe
print(customer_df.count())                # to get the count of rows in the dataframe
print(customer_df.distinct().count())     # to get the count of distinct rows in the dataframe
customer_df.printSchema()                 # to print the schema of the dataframe

+-------+---------+--------+------------------+------------+------------+-------------+------+-----------------+-----+-----------------+
|summary|firstName|lastName|               age|     address| phoneNumber|telePhoneType|  city|       postalCode|state|    streetAddress|
+-------+---------+--------+------------------+------------+------------+-------------+------+-----------------+-----+-----------------+
|  count|    10000|   10000|             10000|       10000|       10000|        10000| 10000|            10000|10000|            10000|
|   mean|      NaN|    NULL|           31.9126|        NULL|        NULL|         NULL|  NULL|        55304.204| NULL|        55304.204|
| stddev|     NULL|    NULL|12.954649494847528|        NULL|        NULL|         NULL|  NULL|25730.68664139555| NULL|25730.68664139555|
|    min|      aac|   aabux|                10|AACIH ,45377|444 444-6695| 444 444-6657|AACIH |            11111|   AA|            11111|
|    max|    zztsa|    zzzt|             

In [153]:
customer_df.explain()                     # to print the execution plan of the dataframe


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [firstName#6461, lastName#6462, age#6463, address#6780, split(phoneNumber1#6773, ,, -1)[0] AS phoneNumber#6914, split(phoneNumber1#6773, ,, -1)[1] AS telePhoneType#6795, split(address#6780, ,, -1)[0] AS city#6804, split(address#6780, ,, -1)[1] AS postalCode#6814, substring(split(address#6780, ,, -1)[0], 0, 2) AS state#6825, split(address#6780, ,, -1)[1] AS streetAddress#6837]
   +- HashAggregate(keys=[phoneNumber#6465, age#6463, lastName#6462, firstName#6461, address#6464], functions=[])
      +- Exchange hashpartitioning(phoneNumber#6465, age#6463, lastName#6462, firstName#6461, address#6464, 200), ENSURE_REQUIREMENTS, [plan_id=12326]
         +- HashAggregate(keys=[phoneNumber#6465, age#6463, lastName#6462, firstName#6461, address#6464], functions=[])
            +- FileScan csv [firstName#6461,lastName#6462,age#6463,address#6464,phoneNumber#6465] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileInd

In [150]:
# Output
# ======

# Convert DataFrame into RDD
rdd1 = df.rdd 

# Convert DataFrame into RDD of string
df.toJSON().first() 

# Return the contents of DataFrame as a Pandas DataFrame
df.toPandas()

Unnamed: 0,firstName,lastName,age,address,phoneNumber,telePhoneType,city,postalCode,state,streetAddress
0,vijh,mfp,24,"IPWIA ,78644",748 668-4596,469 549-4496,IPWIA,78644,IP,78644
1,rgy,jtyxsl,50,"YGIB ,54852",789 988-4454,666 558-5677,YGIB,54852,YG,54852
2,kde,hjc,17,"XMJH ,75137",459 996-5597,857 767-8954,XMJH,75137,XM,75137
3,uzui,qyjq,25,"KHHHM ,97616",894 457-8954,644 596-5869,KHHHM,97616,KH,97616
4,qow,plpna,40,"AMI ,38797",644 488-5446,885 745-7768,AMI,38797,AM,38797
...,...,...,...,...,...,...,...,...,...,...
9995,qkdq,yxma,39,"DWINOY ,47814",955 546-9878,644 896-8444,DWINOY,47814,DW,47814
9996,qwcxw,ofoyns,46,"DJVWD ,33657",776 886-6497,847 695-8996,DJVWD,33657,DJ,33657
9997,onkw,fdihd,54,"CVWUG ,37543",799 554-5675,589 868-5599,CVWUG,37543,CV,37543
9998,xya,audnk,52,"FIZCAW ,28486",875 768-9799,847 844-9694,FIZCAW,28486,FI,28486


In [151]:
# Data Structures &Writing to Files
# ===============

# Save selected columns as a Parquet file
df.select("firstName", "city") \
  .write \
  .save("tmp_nameAndCity.parquet",mode="overwrite", format="parquet",compression="snappy")

# Save selected columns as a JSON file
df.select("firstName", "age") \
  .write \
  .save("tmp_namesAndAges.json", format="json", mode="overwrite")