# 0. **Install PySpark**:

In [19]:
!pip install pyspark



# **Full Script with Comments**


In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, when
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType

In [21]:
# Obtain the notebook's SparkSession (an already-running session is reused)
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [43]:
# Four sample people. `id` is stored as a string, and lname/gender
# deliberately contain None and '' so the null-handling examples have data.
columns = ["fname", "lname", "id", "gender"]

data = [
    ("James", "Bond", "100", None),
    ("Ann", "Varsa", "200", 'F'),
    ("Tom Cruise", "XXX", "400", ''),
    ("Tom Brand", None, "400", 'M'),
]

df = spark.createDataFrame(data, schema=columns)

In [23]:
# Alias: rename fname/lname in the output and add a comma-joined full name
# built from a SQL expression (a NULL lname makes the whole fullName NULL).
df.select(
    col("fname").alias("first_name"),
    col("lname").alias("last_name"),
    expr("fname || ',' || lname").alias("fullName"),
).show()

+----------+---------+--------------+
|first_name|last_name|      fullName|
+----------+---------+--------------+
|     James|     Bond|    James,Bond|
|       Ann|    Varsa|     Ann,Varsa|
|Tom Cruise|      XXX|Tom Cruise,XXX|
| Tom Brand|     NULL|          NULL|
+----------+---------+--------------+



In [24]:
# Sort: show the rows ordered by fname, first ascending and then descending
df.sort("fname", ascending=True).show()
df.sort("fname", ascending=False).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  NULL|
| Tom Brand| NULL|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
+----------+-----+---+------+



In [25]:
# Cast: convert the string id column to an integer and inspect the schema
df.select(
    col("fname"),
    col("id").cast("int"),
).printSchema()

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



In [26]:
# Between: keep rows whose id falls within the bounds 100 and 300
# (id is stored as a string here; the bounds are given as integers)
df.filter(
    col("id").between(lowerBound=100, upperBound=300)
).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  NULL|
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



In [27]:
# Contains: keep rows whose fname includes the substring "Cruise"
df.filter(
    col("fname").contains("Cruise")
).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [28]:
# Startswith / endswith: match on the beginning or the end of fname
df.filter(
    col("fname").startswith("T")
).show()
df.filter(
    col("fname").endswith("Cruise")
).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| NULL|400|     M|
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [29]:
# isNull / isNotNull: split the rows by whether lname is missing
df.filter(
    col("lname").isNull()
).show()
df.filter(
    col("lname").isNotNull()
).show()

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| NULL|400|     M|
+---------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  NULL|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [46]:
# Like (incorrect usage, corrected below)
df.select(df.fname, df.lname, df.id).filter(df.fname.like("%om")).show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
+-----+-----+---+



In [47]:
# Substr: take the first two characters of fname (start position 1, length 2)
df.select(
    col("fname").substr(1, 2).alias("substr")
).show()

+------+
|substr|
+------+
|    Ja|
|    An|
|    To|
|    To|
+------+



In [48]:
# When & Otherwise
df.select(df.fname, df.lname,
          when(df.gender == "M", "Male")
          .when(df.gender == "F", "Female")
          .when(df.gender == None, "")
          .otherwise(df.gender).alias("new_gender")).show()

+----------+-----+----------+
|     fname|lname|new_gender|
+----------+-----+----------+
|     James| Bond|      NULL|
|       Ann|Varsa|    Female|
|Tom Cruise|  XXX|          |
| Tom Brand| NULL|      Male|
+----------+-----+----------+



In [49]:
# isin: keep rows whose id is one of the listed values
li = ["100", "200"]
df.select(
    col("fname"),
    col("lname"),
    col("id"),
).filter(
    col("id").isin(li)
).show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
|James| Bond|100|
|  Ann|Varsa|200|
+-----+-----+---+



In [50]:
# Working with complex data types.
# Each row holds a (fname, lname) struct, an array of languages and a
# string-to-string map of properties. Note that `data` and `df` are rebound
# here, replacing the flat sample DataFrame used by the cells above.
data = [
    (("James", "Bond"), ["Java", "C#"], {'hair': 'black', 'eye': 'brown'}),
    (("Ann", "Varsa"), [".NET", "Python"], {'hair': 'brown', 'eye': 'black'}),
    (("Tom Cruise", ""), ["Python", "Scala"], {'hair': 'red', 'eye': 'grey'}),
    (("Tom Brand", None), ["Perl", "Ruby"], {'hair': 'black', 'eye': 'blue'}),
]

# Build the schema field by field; every field is nullable.
schema = (
    StructType()
    .add(
        'name',
        StructType()
        .add('fname', StringType(), True)
        .add('lname', StringType(), True),
    )
    .add('languages', ArrayType(StringType()), True)
    .add('properties', MapType(StringType(), StringType()), True)
)

df = spark.createDataFrame(data, schema)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [51]:
# getItem: index into the array (position 1) and look up a map value by key
df.select(
    df["languages"].getItem(1)
).show()
df.select(
    df["properties"].getItem("hair")
).show()

+------------+
|languages[1]|
+------------+
|          C#|
|      Python|
|       Scala|
|        Ruby|
+------------+

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+



In [36]:
# getField: read the "hair" entry of the map and the fname field of the struct
df.select(
    df["properties"].getField("hair")
).show()
df.select(
    df["name"].getField("fname")
).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+



# **Explanation and Key Points**


1. **Alias**:
   - Creates aliases for columns and constructs a full name using `expr`.

2. **Sorting**:
   - Sorts the DataFrame by the `fname` column in both ascending and descending order.

3. **Casting**:
   - Casts the `id` column from string to integer type and displays the schema.

4. **Filtering**:
   - Filters rows with the column predicates `between`, `contains`, `startswith`, `endswith`, `isNull`, and `isNotNull`.

5. **Pattern Matching**:
   - Filters rows where `fname` matches a pattern using `like`.

6. **Substring**:
   - Extracts a substring from the `fname` column.

7. **Conditional Logic**:
   - Uses `when` and `otherwise` to create a new column with conditional values based on `gender`.

8. **isin**:
   - Filters rows where `id` is in a given list.

9. **Complex Data Types**:
   - Demonstrates handling of nested data: structs, arrays, and maps.
   - Uses `getItem` to read an array element by index or a map value by key, and `getField` to read a struct field by name (it is also shown reading a map value).