In [7]:
from  pyspark.sql import SparkSession 

spark = SparkSession.builder.appName('featurs').getOrCreate()

df = spark.read.csv('/workspaces/machine_learning-projects-/pending_work/second_task/ecommerce_customer_data.csv', header = True,
                    inferSchema = True)

df.show()

25/03/26 15:04:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+-----------+---+------+-------+--------------------+--------------------+-----------+--------------+---------------+---------------------+-------------------+----------------+-------------+-----------+----------------------+
|customer_id|age|gender|country|         signup_date|          last_login|total_spent|purchase_count|avg_order_value|cart_abandonment_rate|discount_usage_rate|product_category|loyalty_score|return_rate|customer_service_calls|
+-----------+---+------+-------+--------------------+--------------------+-----------+--------------+---------------+---------------------+-------------------+----------------+-------------+-----------+----------------------+
|          1| 56|Female|    USA|2020-06-22 09:41:...|2021-02-07 09:41:...|    1375.76|            27|          50.95|                  0.2|               0.05|        Clothing|        27.26|       0.03|                     0|
|          2| 69|Female|    USA|2021-07-11 09:41:...|2021-11-27 09:41:...|     292.14|          

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff, avg, when, count, sum as spark_sum


# Check if the DataFrame is loaded correctly
df.printSchema()

# 1. signup_date - last_login
df = df.withColumn("days_since_last_login", datediff(col("last_login"), col("signup_date")))

# 2. Age range (older and child)
df = df.withColumn("age_range", when(col("age") < 18, "child")
                              .when(col("age") < 60, "adult")
                              .otherwise("older"))

# 3. Which age range spends more
age_range_spend = df.groupBy("age_range").agg(spark_sum("total_spent").alias("total_spend_by_age_range"))

# 4. Which age range has more purchase_count
age_range_purchase_count = df.groupBy("age_range").agg(spark_sum("purchase_count").alias("total_purchase_count_by_age_range"))

# 5. Which age range purchases which thing more frequently
age_range_frequent_purchase = df.groupBy("age_range", "product_category").agg(count("product_category").alias("item_count")).orderBy("age_range", "item_count", ascending=False)

# 6. Mark users who exceed their own average (signup_date - last_login)
user_avg_days = df.groupBy("customer_id").agg(avg("days_since_last_login").alias("avg_days_since_last_login"))
df = df.join(user_avg_days, on="customer_id")
df = df.withColumn("exceeds_own_avg", when(col("days_since_last_login") > col("avg_days_since_last_login"), 1).otherwise(0))

# 7. Mark users who spend more than their country average
country_avg_spend = df.groupBy("country").agg(avg("total_spent").alias("avg_country_spend"))
df = df.join(country_avg_spend, on="country")
df = df.withColumn("exceeds_country_avg_spend", when(col("total_spent") > col("avg_country_spend"), 1).otherwise(0))

# 8. Mark users who exceed country average (signup_date - last_login) and average total spend
country_avg_days = df.groupBy("country").agg(avg("days_since_last_login").alias("avg_country_days"))
df = df.join(country_avg_days, on="country")
df = df.withColumn("exceeds_country_avg_days_and_spend", when((col("days_since_last_login") > col("avg_country_days")) & (col("total_spent") > col("avg_country_spend")), 1).otherwise(0))

# 9. Mark rich users (low days_since_last_login, low purchase_count, high total_spent, low discount)
df = df.withColumn("is_rich", when((col("days_since_last_login") < 30) & (col("purchase_count") < 10) & (col("total_spent") > 1000) & (col("discount_usage_rate") < 10), 1).otherwise(0))

# 10. Mark users who purchase more compared to their country + age range
country_age_avg_purchase = df.groupBy("country", "age_range").agg(avg("purchase_count").alias("avg_country_age_purchase"))
df = df.join(country_age_avg_purchase, on=["country", "age_range"])
df = df.withColumn("exceeds_country_age_avg_purchase", when(col("purchase_count") > col("avg_country_age_purchase"), 1).otherwise(0))

# Show the resulting DataFrame
df.show()

root
 |-- customer_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- country: string (nullable = true)
 |-- signup_date: timestamp (nullable = true)
 |-- last_login: timestamp (nullable = true)
 |-- total_spent: double (nullable = true)
 |-- purchase_count: integer (nullable = true)
 |-- avg_order_value: double (nullable = true)
 |-- cart_abandonment_rate: double (nullable = true)
 |-- discount_usage_rate: double (nullable = true)
 |-- product_category: string (nullable = true)
 |-- loyalty_score: double (nullable = true)
 |-- return_rate: double (nullable = true)
 |-- customer_service_calls: integer (nullable = true)



                                                                                

+-------+---------+-----------+---+------+--------------------+--------------------+-----------+--------------+---------------+---------------------+-------------------+----------------+-------------+-----------+----------------------+---------------------+-------------------------+---------------+------------------+-------------------------+------------------+----------------------------------+-------+------------------------+--------------------------------+
|country|age_range|customer_id|age|gender|         signup_date|          last_login|total_spent|purchase_count|avg_order_value|cart_abandonment_rate|discount_usage_rate|product_category|loyalty_score|return_rate|customer_service_calls|days_since_last_login|avg_days_since_last_login|exceeds_own_avg| avg_country_spend|exceeds_country_avg_spend|  avg_country_days|exceeds_country_avg_days_and_spend|is_rich|avg_country_age_purchase|exceeds_country_age_avg_purchase|
+-------+---------+-----------+---+------+--------------------+-------

In [1]:
''' 
1. 3482. Analyze Organization Hierarchy
2.1093. Statistics from a Large Sample, 
3. 2241. Design an ATM Machine, 
4. 2276. Count Integers in Intervals

''' 
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("EmployeeData").getOrCreate()
schema = ["employee_id", "employee_name", "manager_id", "salary", "department"]

# Define the data
data = [
    (1, "Alice", None, 12000, "Executive"),
    (2, "Bob", 1, 10000, "Sales"),
    (3, "Charlie", 1, 10000, "Engineering"),
    (4, "David", 2, 7500, "Sales"),
    (5, "Eva", 2, 7500, "Sales"),
    (6, "Frank", 3, 9000, "Engineering"),
    (7, "Grace", 3, 8500, "Engineering"),
    (8, "Hank", 4, 6000, "Sales"),
    (9, "Ivy", 6, 7000, "Engineering"),
    (10, "Judy", 6, 7000, "Engineering")
]


df = spark.createDataFrame(data, schema)


25/03/26 16:09:17 WARN Utils: Your hostname, codespaces-749f79 resolves to a loopback address: 127.0.0.1; using 10.0.3.123 instead (on interface eth0)
25/03/26 16:09:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/26 16:09:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
print([0] * 3)

[0, 0, 0]


In [11]:
for i in range(4,-1,-1): 
    print(i)

4
3
2
1
0
