In [1]:
# set env
import os
os.environ['SPARK_HOME'] = "/home/cloud_user/apps/spark/current"
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [1]:
# Import PySpark
from pyspark.sql import SparkSession

In [35]:
# Create SparkSession
spark = SparkSession.builder \
    .appName("data-from-csv") \
    .getOrCreate()

25/01/26 01:57:19 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Read from CSV

In [9]:
# Use bash to display the first ten rows of the products.csv file:
%%bash
head --help
#head -10 ./data/products.csv

UsageError: Line magic function `%%bash` not found.


In [11]:
csv_file_path = "./data/products.csv"
df = spark.read.csv(csv_file_path, header=True)

                                                                                

In [13]:
# Display the CSV schema
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)



In [15]:
# Show the first 5 rows
df.show(5)

                                                                                

+---+--------------------+---------------+-----+
| id|            category|       quantity|price|
+---+--------------------+---------------+-----+
|  1|           iPhone 12|    Electronics|   10|
|  2|     Nike Air max 90|       Clothing|   25|
|  3|KitchenAid Stand ...|Home Appliances|    5|
|  4|    The Great Gatsby|          Books|   50|
|  5|L'Oreal Paris Mas...|         Beauty|  100|
+---+--------------------+---------------+-----+
only showing top 5 rows



## Specifying Schema

In [17]:
# Because we know what the datatypes should be, we will modify the schema to explicitly set types.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [29]:
# Define the Schema
updated_schema = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="category", dataType=StringType(), nullable=True),
    StructField(name="quantity", dataType=IntegerType(), nullable=True),
    StructField(name="price", dataType=DoubleType(), nullable=True)
])

In [30]:
# Read csv into a DataFrame now with the new Schema
df = spark.read.csv(csv_file_path, header=True, schema=updated_schema)

In [31]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)



In [32]:
df.show(5)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



                                                                                

## InferSchema for CSV

In [33]:
# Read CSV into DataFrame with inferSchema
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

                                                                                

In [34]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)



In [2]:
spark.stop()

NameError: name 'spark' is not defined