In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=9c20110022f3255f01e9b393d30ca000735a1429d6c2df8c7a9bdbcaa34ca122
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col

In [3]:
# Build the SparkSession used by every cell below (app name: 'SparkByExamples.com')
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [4]:
# Two sample rows; the first column is deliberately given a dotted name
# ("name.fname") to demonstrate backtick quoting.
# NOTE(review): the "gender" column holds numeric values (23, 40), so the
# label looks like a misnomer -- confirm before reusing this data.
data = [
    ("James", 23),
    ("Ann", 40),
]
column_names = ("name.fname", "gender")
df = spark.createDataFrame(data).toDF(*column_names)

In [5]:
# Print the schema (column names and inferred types), then display the rows.
# Note that the dotted name "name.fname" is a single flat string column here.
df.printSchema()
df.show()

root
 |-- name.fname: string (nullable = true)
 |-- gender: long (nullable = true)

+----------+------+
|name.fname|gender|
+----------+------+
|     James|    23|
|       Ann|    40|
+----------+------+



In [6]:
# A column whose name contains a dot must be wrapped in backticks when it is
# referenced. Two equivalent references are built, then each is selected.
dotted_via_col = col("`name.fname`")   # reference through the col() function
dotted_via_df = df["`name.fname`"]     # reference through DataFrame indexing

df.select(dotted_via_col).show()
df.select(dotted_via_df).show()

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+



In [7]:
# Derive 'new_col' from 'name.fname': substr(1, 2) starts at position 1 and
# takes 2 characters (e.g. "James" -> "Ja").
name_prefix = col("`name.fname`").substr(1, 2)
df.withColumn("new_col", name_prefix).show()

+----------+------+-------+
|name.fname|gender|new_col|
+----------+------+-------+
|     James|    23|     Ja|
|       Ann|    40|     An|
+----------+------+-------+



In [8]:
# Keep only the rows whose 'name.fname' value begins with "J"
starts_with_j = col("`name.fname`").startswith("J")
df.filter(starts_with_j).show()

+----------+------+
|name.fname|gender|
+----------+------+
|     James|    23|
+----------+------+



In [9]:
# Rename every column, swapping '.' for '_' so the names no longer need
# backtick quoting.
# new_cols is built as a list rather than a generator: a generator would be
# exhausted by the unpacking below, leaving an empty iterator in kernel state.
new_cols = [column.replace('.', '_') for column in df.columns]
df2 = df.toDF(*new_cols)  # toDF assigns the new names positionally
df2.show()

+----------+------+
|name_fname|gender|
+----------+------+
|     James|    23|
|       Ann|    40|
+----------+------+



In [10]:
# A plain column name can be reached directly from the DataFrame object,
# either as an attribute or by indexing with the name.
gender_by_attribute = df.gender
gender_by_key = df["gender"]

df.select(gender_by_attribute).show()
df.select(gender_by_key).show()

+------+
|gender|
+------+
|    23|
|    40|
+------+

+------+
|gender|
+------+
|    23|
|    40|
+------+



In [11]:
# DataFrame indexing also needs the backticks when the column name has a dot
dotted_name_column = df["`name.fname`"]
df.select(dotted_name_column).show()

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+



In [12]:
# col() resolves a column by name; the dotted name is again backtick-quoted.
# Each reference is selected and displayed in turn.
for column_ref in ("gender", "`name.fname`"):
    df.select(col(column_ref)).show()

+------+
|gender|
+------+
|    23|
|    40|
+------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+



In [19]:
# Sample rows where 'prop' is itself a Row, giving a nested struct column
# with fields 'hair' and 'eye'.
james = Row(name="James", prop=Row(hair="black", eye="blue"))
ann = Row(name="Ann", prop=Row(hair="grey", eye="black"))

data = [james, ann]
# Rebinds `df`: from here on it refers to the struct-column DataFrame.
df = spark.createDataFrame(data)

In [20]:
# Print the schema, which lists 'prop' as a struct with its nested fields,
# then display the rows (struct values render as {hair, eye}).
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)

+-----+-------------+
| name|         prop|
+-----+-------------+
|James|{black, blue}|
|  Ann|{grey, black}|
+-----+-------------+



In [15]:
# Four ways to reach into the 'prop' struct. For a real struct column the dot
# is a field accessor, so no backticks are needed.
nested_refs = [
    df.prop.hair,       # attribute chain; output column is labelled "prop.hair"
    df["prop.hair"],    # DataFrame indexing; output column is labelled "hair"
    col("prop.hair"),   # col() with a dotted path; output column is labelled "hair"
    col("prop.*"),      # star expands to every field of the struct
]
for nested_ref in nested_refs:
    df.select(nested_ref).show()

+---------+
|prop.hair|
+---------+
|    black|
|     grey|
+---------+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|black| blue|
| grey|black|
+-----+-----+



In [16]:
# Three integer columns used by the arithmetic and comparison cells.
# Rebinds `df` to this numeric DataFrame.
data = [
    (100, 2, 1),
    (200, 3, 4),
    (300, 4, 4),
]
column_names = ("col1", "col2", "col3")
df = spark.createDataFrame(data).toDF(*column_names)

In [17]:
# Python arithmetic operators on Column objects build Spark expressions.
# Each expression is selected and shown in its own table.
arithmetic_exprs = [
    df.col1 + df.col2,  # addition
    df.col1 - df.col2,  # subtraction
    df.col1 * df.col2,  # multiplication
    df.col1 / df.col2,  # division (result is fractional, e.g. 200 / 3)
    df.col1 % df.col2,  # modulus
]
for arithmetic_expr in arithmetic_exprs:
    df.select(arithmetic_expr).show()

+-------------+
|(col1 + col2)|
+-------------+
|          102|
|          203|
|          304|
+-------------+

+-------------+
|(col1 - col2)|
+-------------+
|           98|
|          197|
|          296|
+-------------+

+-------------+
|(col1 * col2)|
+-------------+
|          200|
|          600|
|         1200|
+-------------+

+-----------------+
|    (col1 / col2)|
+-----------------+
|             50.0|
|66.66666666666667|
|             75.0|
+-----------------+

+-------------+
|(col1 % col2)|
+-------------+
|            0|
|            2|
|            0|
+-------------+



In [18]:
# Comparison operators on Column objects yield boolean column expressions,
# not Python booleans. Each one is selected and shown in its own table.
comparison_exprs = [
    df.col2 > df.col3,   # greater than
    df.col2 < df.col3,   # less than
    df.col2 == df.col3,  # equality (rendered as "col2 = col3" in the header)
]
for comparison_expr in comparison_exprs:
    df.select(comparison_expr).show()

+-------------+
|(col2 > col3)|
+-------------+
|         true|
|        false|
|        false|
+-------------+

+-------------+
|(col2 < col3)|
+-------------+
|        false|
|         true|
|        false|
+-------------+

+-------------+
|(col2 = col3)|
+-------------+
|        false|
|        false|
|         true|
+-------------+



### Explanation of the Notebook

1. **Initialization and Data Preparation**:
   - **SparkSession Initialization**: The script starts by initializing a Spark session.
   - **Sample Data and Schema**: Defines sample data with two columns, one of which (`name.fname`) has a dot in its name.

2. **DataFrame Operations**:
   - **Printing Schema and DataFrame**: Prints the schema and displays the DataFrame.
   - **Column Access with Dots**: Demonstrates how to access columns with dots in their names using backticks.
   - **Adding a New Column**: Adds a new column by extracting a substring from another column.
   - **Filtering Rows**: Filters rows based on conditions applied to columns with dots in their names.
   - **Renaming Columns**: Renames columns to remove dots and make them easier to work with.
   - **Using DataFrame Object and col() Function**: Shows various ways to access columns using the DataFrame object and the `col` function.

3. **Nested Struct Fields**:
   - **Sample Data with Struct Fields**: Defines sample data with nested struct fields.
   - **Accessing Nested Fields**: Demonstrates how to access nested fields within struct columns.

4. **Column Operations**:
   - **Sample Data for Column Operations**: Defines sample data for demonstrating arithmetic and comparison operations.
   - **Arithmetic Operations**: Performs addition, subtraction, multiplication, division, and modulus operations on columns.
   - **Comparison Operations**: Performs greater than, less than, and equality comparisons on columns.