In [1]:
import os
import tempfile

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit

In [2]:
# Scratch directory handed to Spark. Honor TEMP when it is set (and non-empty);
# otherwise use the platform's temp directory rather than a Windows-only path.
TEMP_PATH = os.environ.get("TEMP") or tempfile.gettempdir()

In [3]:
# Initialize (or reuse) a Spark session running in local mode on all cores.
# NOTE(review): with master "local[*]" the executor memory/cores settings are
# likely ignored because work runs inside the driver JVM — confirm before tuning.
spark = (
    SparkSession.builder
    .appName("FlaggingData")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.default.parallelism", "8")
    .config("spark.python.worker.memory", "512m")
    # Arrow-backed pandas <-> Spark conversion. This is the Spark 3.x key; the
    # older "spark.sql.execution.arrow.enabled" is deprecated.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Spill/shuffle scratch space.
    .config("spark.local.dir", TEMP_PATH)
    .getOrCreate()
)

In [4]:
# Read the Excel workbook: both sheets are loaded in one call, then bound to
# their own names.
file_name = "flags_and_parameters.xlsx"
workbook_sheets = pd.read_excel(file_name, sheet_name=["parameters", "categories"])
parameters_df = workbook_sheets["parameters"]
categories_df = workbook_sheets["categories"]

In [5]:
# Convert the pandas rules sheet to a PySpark DataFrame and load the input
# data from CSV (first line is the header; column types are inferred).
flag_rules = spark.createDataFrame(data=categories_df)
input_df = spark.read.csv(
    path="data.csv",
    inferSchema=True,
    header=True,
)

In [6]:
# Preview up to five rows of the rules table.
flag_rules.show(5)

+------+-----+---------+----+----+-----------+
| field| mode|condition| min| max| flag_value|
+------+-----+---------+----+----+-----------+
|   age|range|     NULL| 5.0|11.0|      Child|
|   age|range|     NULL|12.0|25.0|   Teenager|
|   age|range|     NULL|26.0|45.0|      Adult|
|   age|range|     NULL|46.0|65.0|    Elderly|
|salary|exact|  > 50000|NULL|NULL|High Salary|
+------+-----+---------+----+----+-----------+
only showing top 5 rows



In [7]:
# Preview up to five rows of the input data.
input_df.show(5)

+-----+---+------+
| name|age|salary|
+-----+---+------+
| John| 35| 60000|
|Alice| 28| 45000|
|  Bob| 47|300000|
+-----+---+------+



In [8]:
def generate_flag(df, rules, mode="exact", **kwargs):
    """Add a flag column to ``df`` by applying rows of a rules table.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Data to flag. Must contain the column named by ``rules_column``.
    rules : pyspark.sql.DataFrame
        Rules table. Only rows whose ``field`` equals ``rules_column`` are
        used. ``flag_value`` is read in both modes; ``condition`` is read in
        exact mode and ``min``/``max`` in range mode.
    mode : str, default "exact"
        * ``"exact"`` -- the rule's ``condition`` text (for example
          ``"> 50000"``) is appended to ``col('<rules_column>')`` and
          evaluated as a Python expression.
        * ``"range"`` -- the rule matches when the value lies between ``min``
          and ``max`` (``Column.between``, inclusive on both ends).
        * anything else -- a message is printed and every row keeps the
          default ``"Unknown"`` flag.
    **kwargs
        rules_column : str, default "Unknown"
            Column of ``df`` to test; also the ``field`` value to select rules.
        output_column : str, default "Flag"
            Name of the column to add (or overwrite).

    Returns
    -------
    pyspark.sql.DataFrame
        ``df`` with ``output_column`` set to the matching rule's
        ``flag_value``, or ``"Unknown"`` when no rule matches. When several
        rules match a row, the one collected last wins.

    Notes
    -----
    Rule rows lacking the values their mode needs (a null/NaN/blank
    ``condition`` in exact mode, a null/NaN ``min`` or ``max`` in range mode)
    are skipped. This matters because rules are selected by field only, so a
    field that has rules of both modes would otherwise feed a null
    ``condition`` into ``eval`` and raise ``SyntaxError``.
    """
    rules_column = kwargs.get("rules_column", "Unknown")
    output_column = kwargs.get("output_column", "Flag")

    def _is_missing(value):
        # None is a Spark null; a float that differs from itself is NaN,
        # which is how pandas marks a missing cell.
        return value is None or (isinstance(value, float) and value != value)

    # Build one nested CASE WHEN expression and apply it with a single
    # withColumn: calling withColumn once per rule adds a projection per rule
    # and makes the query plan grow with the size of the rules table.
    flag_expr = lit("Unknown")  # Default to Unknown

    if mode in ("exact", "range"):
        # Keep only the rules written for the requested column.
        matching_rules = rules.filter(col("field") == rules_column)

        for row in matching_rules.collect():
            if mode == "exact":
                condition = row["condition"]
                if _is_missing(condition) or not str(condition).strip():
                    continue
                # SECURITY(review): `condition` comes straight from the rules
                # spreadsheet and is executed by eval(). Only use rules files
                # from a trusted source, or migrate the rules to SQL syntax
                # and pyspark.sql.functions.expr.
                matches = eval(f"col('{rules_column}') {condition}")
            else:
                min_val = row["min"]
                max_val = row["max"]
                if _is_missing(min_val) or _is_missing(max_val):
                    continue
                matches = col(rules_column).between(min_val, max_val)

            # Wrapping the previous expression in `otherwise` keeps the
            # "later rule overrides earlier rule" precedence.
            flag_expr = when(matches, row["flag_value"]).otherwise(flag_expr)
    else:
        print("Wrong input, should be 'exact' or 'range' as input mode.")

    return df.withColumn(output_column, flag_expr)


In [9]:
# Create the flag columns from the rules: one generate_flag pass per
# (source field, output column, matching mode) specification.
for rules_field, category_column, match_mode in (
    ("age", "age_category", "range"),
    ("salary", "salary_category", "exact"),
):
    input_df = generate_flag(
        input_df,
        flag_rules,
        mode=match_mode,
        rules_column=rules_field,
        output_column=category_column,
    )

In [10]:
# Display the result, including the newly added category columns.
input_df.show()

+-----+---+------+------------+---------------+
| name|age|salary|age_category|salary_category|
+-----+---+------+------------+---------------+
| John| 35| 60000|       Adult|    High Salary|
|Alice| 28| 45000|       Adult|     Low Salary|
|  Bob| 47|300000|     Elderly|    High Salary|
+-----+---+------+------------+---------------+



In [11]:
# Shut down the Spark session now that the results have been displayed.
spark.stop()