# Basic PySpark examples. Part 2

## Initialize SparkSession

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session for this notebook; getOrCreate() hands back an already
# running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("BasicExample_part_2")
    .getOrCreate()
)

## 1. Join Two DataFrames

In [3]:
# Two small in-memory tables: employees (Name, DeptID) and departments
# (DeptID, DeptName). DeptID is the shared key.
data1 = [
    ("Alice", 1),
    ("Bob", 2),
]
data2 = [
    (1, "HR"),
    (2, "Engineering"),
]

df1 = spark.createDataFrame(data1, schema=["Name", "DeptID"])
df2 = spark.createDataFrame(data2, schema=["DeptID", "DeptName"])

# Joining on the column *name* keeps a single DeptID column in the result.
joined_df = df1.join(
    df2,
    on="DeptID",
    how="inner",
)
joined_df.show()

+------+-----+-----------+
|DeptID| Name|   DeptName|
+------+-----+-----------+
|     1|Alice|         HR|
|     2|  Bob|Engineering|
+------+-----+-----------+



## 2. Use SQL with Temp Views

In [5]:
# Load the CSV, treating the first row as the header and letting Spark infer
# the column types from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file.csv")
)

# Register the DataFrame under a name that SQL queries can refer to.
df.createOrReplaceTempView("people")
sql_df = spark.sql(
    "SELECT Name, Age "
    "FROM people "
    "WHERE Age > 27"
)
sql_df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 30|
|Charlie| 35|
|    Eve| 28|
+-------+---+



## 3. User-Defined Function (UDF)

In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def greet(name):
    """Build a greeting for ``name``.

    Args:
        name: Value of the ``Name`` column for one row. Spark passes SQL NULL
            to Python UDFs as ``None``.

    Returns:
        ``"Hello, <name>"``, or ``None`` when ``name`` is ``None`` so that a
        NULL input stays NULL instead of becoming the text "Hello, None".
    """
    if name is None:
        return None
    return f"Hello, {name}"

# Wrap the plain Python function as a Spark UDF that returns a string column.
greet_udf = udf(greet, StringType())

# Add the Greeting column; `df` is rebound because the next section reuses it.
df = df.withColumn("Greeting", greet_udf(df.Name))
df.show()


+-------+---+--------------+
|   Name|Age|      Greeting|
+-------+---+--------------+
|  Alice| 30|  Hello, Alice|
|    Bob| 25|    Hello, Bob|
|Charlie| 35|Hello, Charlie|
|  Diana| 22|  Hello, Diana|
|    Eve| 28|    Hello, Eve|
+-------+---+--------------+



## 4. Window Functions

In [7]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# One window over the whole DataFrame, ordered by ascending Age. No
# partitionBy is given, so every row is ranked against every other row.
window_spec = Window.orderBy("Age")

(
    df
    .withColumn("Rank", rank().over(window_spec))
    .show()
)

+-------+---+--------------+----+
|   Name|Age|      Greeting|Rank|
+-------+---+--------------+----+
|  Diana| 22|  Hello, Diana|   1|
|    Bob| 25|    Hello, Bob|   2|
|    Eve| 28|    Hello, Eve|   3|
|  Alice| 30|  Hello, Alice|   4|
|Charlie| 35|Hello, Charlie|   5|
+-------+---+--------------+----+



## 5. Handle Missing Values

In [9]:
df = spark.read.csv(
    "file_with_nulls.csv",
    header=True,
    inferSchema=True,
)

# Both calls return new DataFrames; `df` itself still holds the nulls.
# Fill missing Age with 0
df.na.fill({"Age": 0}).show()
# Drop rows with any nulls
df.na.drop().show()

+-------+----+
|   Name| Age|
+-------+----+
|  Alice|30.0|
|    Bob| 0.0|
|Charlie|35.0|
|  Diana| 0.0|
|    Eve|28.0|
+-------+----+

+-------+----+
|   Name| Age|
+-------+----+
|  Alice|30.0|
|Charlie|35.0|
|    Eve|28.0|
+-------+----+



## 6. Rename Columns

In [10]:
# Rebind `df` to a copy whose Age column is called Years; the data is unchanged.
df = df.withColumnRenamed(existing="Age", new="Years")
df.show()

+-------+-----+
|   Name|Years|
+-------+-----+
|  Alice| 30.0|
|    Bob| NULL|
|Charlie| 35.0|
|  Diana| NULL|
|    Eve| 28.0|
+-------+-----+



## 7. DataFrame Schema and Types

In [11]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()
# Last expression of the cell, so the notebook displays its value:
# a list of (column name, type name) tuples.
df.dtypes

root
 |-- Name: string (nullable = true)
 |-- Years: double (nullable = true)



[('Name', 'string'), ('Years', 'double')]

## 8. Filter with Multiple Conditions

In [13]:
# Reload the raw file so the Age column (renamed earlier) is available again.
df = spark.read.csv(
    "file_with_nulls.csv",
    header=True,
    inferSchema=True,
)

# Each comparison needs its own parentheses because `&` binds tighter than
# `>` and `!=` in Python.
df.where(
    (df["Age"] > 25) & (df["Name"] != "Bob")
).show()

+-------+----+
|   Name| Age|
+-------+----+
|  Alice|30.0|
|Charlie|35.0|
|    Eve|28.0|
+-------+----+



## 9. Stop the SparkSession

In [2]:
# Shut down the SparkSession created at the top of the notebook now that all
# examples have run.
spark.stop()