# Getting Started with PySpark

## Create a Simple DataFrame

In [0]:

# Build a small in-memory DataFrame of (product, price) rows and print it.
data = [
    ("iPhone", 999),
    ("Samsung", 799),
    ("MacBook", 1299),
]
columns = ["product", "price"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+-----+
|product|price|
+-------+-----+
| iPhone|  999|
|Samsung|  799|
|MacBook| 1299|
+-------+-----+



## Filter expensive products

In [0]:
# Keep only the rows whose price is strictly greater than 1000.
expensive_products = df.filter(df["price"] > 1000)
expensive_products.show()

+-------+-----+
|product|price|
+-------+-----+
|MacBook| 1299|
+-------+-----+



## Count total products

In [0]:
# Row count of the DataFrame; the bare name on the last line displays it.
product_count = df.count()
product_count

3

## Select only product names

In [0]:
# Project the DataFrame down to the "product" column only.
product_names = df.select("product")
product_names.show()

+-------+
|product|
+-------+
| iPhone|
|Samsung|
|MacBook|
+-------+



## Add a category column based on price

In [0]:
from pyspark.sql.functions import when

# "Premium" when price is strictly above 1000, "Standard" for everything else.
price_category = when(df.price > 1000, "Premium").otherwise("Standard")

# withColumn returns a new DataFrame; `df` itself is left unchanged.
df.withColumn("category", price_category).show()

+-------+-----+--------+
|product|price|category|
+-------+-----+--------+
| iPhone|  999|Standard|
|Samsung|  799|Standard|
|MacBook| 1299| Premium|
+-------+-----+--------+



## Group by price range and count

In [0]:
# Grouping key: the same Premium/Standard split on price, exposed as "range".
price_range = when(df.price > 1000, "Premium").otherwise("Standard").alias("range")

# Count the rows falling into each range.
df.groupBy(price_range).count().show()

+--------+-----+
|   range|count|
+--------+-----+
|Standard|    2|
| Premium|    1|
+--------+-----+



## Calculate total value of all products

In [0]:
# Import the functions module under a namespace instead of
# `from pyspark.sql.functions import sum`, which would rebind Python's
# builtin `sum` for the rest of the kernel session.
from pyspark.sql import functions as F

# Aggregate over the whole DataFrame: add up the "price" column and expose
# the result as a single-row column named "total_value".
df.agg(F.sum("price").alias("total_value")).show()

+-----------+
|total_value|
+-----------+
|       3097|
+-----------+

