## Handling missing data

In [None]:
# Import SparkSession and the col() column helper used by the null filter below
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create my_spark
spark = SparkSession.builder.appName("my_spark").getOrCreate()

# Load the CSV file into a DataFrame
df = spark.read.csv("salaries.csv", header=True, inferSchema=True)

# Use .na.drop() to remove rows with null values

# Drop rows with any nulls
df_cleaned = df.na.drop()

# Filter out rows where one specific column is null.
# Bound to its own name so the .na.drop() result above is not overwritten.
# NOTE(review): "age" stands in for the former "columnName" placeholder, which is
# not a real column; it is the same column the fill example below uses -- confirm
# against the actual salaries.csv schema.
df_age_not_null = df.where(col("age").isNotNull())

# Use .na.fill({"column": value}) to replace nulls with a specific value
# Fill nulls in the age column with the value 0
df_filled = df.na.fill({"age": 0})

### Column operations

In [None]:

# Each step below is bound to its own name instead of rebinding `df`.
# Rebinding made this cell fail on a second run (the 'age' column was already
# renamed) and removed the 'department' column that the later groupBy on `df`
# still needs.

# Use .withColumn() to add a new column based on calculations or existing columns
# Create a new column 'age_plus_5'
df_age_plus_5 = df.withColumn("age_plus_5", df["age"] + 5)

# Use withColumnRenamed() to rename columns
# Rename the 'age' column to 'years'
df_renamed = df_age_plus_5.withColumnRenamed("age", "years")

# Use drop() to remove unnecessary columns
# Drop the 'department' column
df_without_department = df_renamed.drop("department")

### Row operations

In [None]:
# .filter() keeps only the rows for which a boolean column expression is true.
# Here: rows whose salary is greater than 50000.
salary_above_50k = df["salary"] > 50000
filtered_df = df.filter(salary_above_50k)

# .groupBy() followed by an aggregate (e.g. .sum(), .avg()) summarizes the data.
# Here: the average salary within each department.
grouped_df = (
    df
    .groupBy("department")
    .avg("salary")
)

In [None]:
# Remove every row that contains at least one null
# (DataFrame.dropna() is the alias of DataFrame.na.drop()).
census_cleaned = census_df.dropna()

# Display the cleaned rows
census_cleaned.show()

# Derive 'weekly_salary' from the 'income' column (income / 52),
# then rename the 'age' column to 'years'.
census_df_weekly = (
    census_df
    .withColumn("weekly_salary", census_df["income"] / 52)
    .withColumnRenamed("age", "years")
)

# Display the transformed rows
census_df_weekly.show()

### Joins in PySpark

Syntax

```python
DataFrame1.join(DataFrame2, on="column", how="join_type")
```

In [None]:
# Inner join on a key column that has the same name ("id") in both DataFrames
df_joined = df1.join(df2, "id", "inner")

# Inner join where the key columns are named differently in each DataFrame:
# build the join condition explicitly, then pass it as `on`.
id_equals_name = df1["Id"] == df2["Name"]
df_joined = df1.join(df2, on=id_equals_name, how="inner")

### Union

In [None]:
# Union of two DataFrames with identical schemas
# NOTE(review): df1 and df2 are not defined in this section; assuming both are
# PySpark DataFrames, the PySpark docs state that union() matches columns by
# position (not by name) and keeps duplicate rows -- confirm the column order
# agrees, or consider unionByName() if it may differ.
df_union = df1.union(df2)