In [1]:
import pathlib
from datetime import datetime
from typing import List, Tuple, Union, Dict

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Column

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Run Spark in local mode with 4 worker threads
spark = SparkSession.builder.master("local[4]").getOrCreate()

# Keep the number of shuffle partitions small for local runs.
# The setting name is plural ("partitions"); the singular spelling is not a
# setting Spark reads, so the default partition count would stay in effect.
spark.conf.set("spark.sql.shuffle.partitions", 4)

# Use UTC as the session time zone
spark.conf.set("spark.sql.session.timeZone", "UTC")

23/07/21 14:38:55 WARN Utils: Your hostname, Emilianos-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.130 instead (on interface en0)
23/07/21 14:38:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/21 14:38:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/21 14:38:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
def drop_columns(df: DataFrame, columns_to_drop: Union[str, List[str]]) -> DataFrame:
    """
    Drop specified columns from a PySpark DataFrame.

    Parameters:
        df (DataFrame): Input PySpark DataFrame.
        columns_to_drop (str | list): A single column name, or a list of
            column names, to drop.

    Returns:
        DataFrame: The result of ``df.drop`` called with the given names.
    """
    # A bare string would otherwise be unpacked character by character
    # ("Age" -> "A", "g", "e"), so wrap it to drop the intended column.
    if isinstance(columns_to_drop, str):
        columns_to_drop = [columns_to_drop]
    return df.drop(*columns_to_drop)

In [4]:
# Column names, in the same order as the fields of each record below
columns = ["Name", "Age", "Gender", "Profession"]

# Three example records used throughout the notebook
data = [
    ("Alice", 28, "F", "Engineer"),
    ("Bob", 32, "M", "Doctor"),
    ("Charlie", 45, "M", "Lawyer"),
]

# Build the sample DataFrame from the records and column names
df = spark.createDataFrame(data, schema=columns)

In [5]:
# Display the sample DataFrame as created, before any columns are removed
print("DataFrame before dropping columns:")
df.show()

DataFrame before dropping columns:


                                                                                

+-------+---+------+----------+
|   Name|Age|Gender|Profession|
+-------+---+------+----------+
|  Alice| 28|     F|  Engineer|
|    Bob| 32|     M|    Doctor|
|Charlie| 45|     M|    Lawyer|
+-------+---+------+----------+



In [6]:
# Names of the columns to remove from the sample DataFrame
columns_to_drop = ["Age", "Profession"]

# Apply the helper and bind the result to a new name
df_dropped = drop_columns(df=df, columns_to_drop=columns_to_drop)

# Display the result to confirm which columns remain
print("DataFrame after dropping columns:")
df_dropped.show()

DataFrame after dropping columns:
+-------+------+
|   Name|Gender|
+-------+------+
|  Alice|     F|
|    Bob|     M|
|Charlie|     M|
+-------+------+



In [None]:
def add_column(df: DataFrame, config: dict) -> DataFrame:
    """
    Add a column holding a literal value to a PySpark DataFrame.

    Parameters:
        df (DataFrame): Input PySpark DataFrame.
        config (dict): Settings for the new column. Two keys are read:
            "column": name passed to ``withColumn`` for the new column.
            "value_column": value wrapped with ``F.lit``, i.e. used as a
                literal for every row.

    Returns:
        DataFrame: The result of ``df.withColumn`` with the literal column.
    """
    # NOTE(review): the key name "value_column" hints at a column reference,
    # but the value is passed to F.lit and so treated as a literal - confirm intent.
    target_name, literal_value = config["column"], config["value_column"]
    return df.withColumn(target_name, F.lit(literal_value))