In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import to_date, col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

In [2]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .config("spark.driver.memory", "4g") \
    .master("local[*]") \
    .getOrCreate()
# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/31 06:24:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Load all tables

In [36]:
df_userlogs = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .parquet("datamart/silver/user_logs"))

In [37]:
df_transactions = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .parquet("datamart/silver/transactions"))

In [38]:
df_members = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .parquet("datamart/silver/members"))



In [39]:
print("=== Members ===")
display(df_members.limit(5).toPandas())

print("\n=== User Logs ===")
display(df_userlogs.limit(5).toPandas())

print("\n=== Transactions ===")
display(df_transactions.limit(5).toPandas())

=== Members ===


                                                                                

Unnamed: 0,msno,city_clean,registered_via,registration_date,tenure_days_at_snapshot,registered_via_freq,city_freq,city_idx,via_idx,city_oh,via_oh
0,Z1SBrlbnzZzQZtlS3CUmas9SQkJyb0B1+gEeuP/V9O0=,1,1,2016-06-12,261,6e-06,0.709705,0.0,14.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2GkgHuwB+NCVnpSRSxw0nyzPKeCTeVOoYiu2TdRg4qg=,1,1,2016-01-19,406,6e-06,0.709705,0.0,14.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,X1AmJaNJ1bpGEgxLveRwBhxGHytaIHHuNfAAPFKCFg4=,1,13,2016-12-27,63,0.000806,0.709705,0.0,5.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,bJrFIXFymwWPhavSKeaA+yKE/Du1vgUwjEI3bJNQRuE=,1,13,2017-01-14,45,0.000806,0.709705,0.0,5.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
4,nq+4KRKNWTQkH9VNArdNfhBNl70Vh01WEi/i9rPlxqU=,1,13,2017-02-01,27,0.000806,0.709705,0.0,5.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."



=== User Logs ===


Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,year,month
0,kvfTVgxOfjbVBTXyYGiAbEHzI98lveMomdtk+s88Ido=,2016-11-26,3,0,1,0,27,30,7254.569,2016,11
1,TKjom9SvWQfr9/FaSicWUyCgzYqow8ogCXOgsn5XKnY=,2016-11-09,0,0,0,0,5,4,1246.0,2016,11
2,V7mwW25pIzSyhRvKtfJ0fEAhDZe4rmbiUGZ62OUzw9M=,2016-11-02,3,0,0,0,7,10,1529.59,2016,11
3,HpsUB9oBFNEkMz8LS4cytXeuqKe7uBJ/qF1zAM5OGZc=,2016-11-18,13,2,2,0,21,32,6463.842,2016,11
4,kWl8bmqhgiRnkvlGe7xNn5IueQxK485CAJSepMxCz8M=,2016-11-18,2,1,1,1,8,13,2557.191,2016,11



=== Transactions ===


Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,source_file,day,transaction_id,year,month
0,VIHHLepUEMXxF2ielWsw694ctqMrYUrAzubpTU6zmKo=,41,30,149,149,1,2016-11-08,2016-12-08,0,transactions.csv,8,84c76dce-ee16-403a-8428-be7b5db9a433,2016,11
1,TkAhpvvz+vU7LBuVHbphI4QzYY7QvK0mpypkzT/6DHQ=,26,1,0,0,0,2016-11-23,2016-11-30,0,transactions.csv,23,218cfc0b-0139-443c-a756-6805cbef01d9,2016,11
2,VIl7ZhgbpR4kLC/Zx0TS8j1Tg9Uui4OK9SKHXQtIWDQ=,41,30,149,149,1,2016-11-06,2016-12-06,0,transactions.csv,6,08bba8c6-5efe-4048-9a79-a66c2a704ffe,2016,11
3,UBfqR1od2RUoO5UfDkQD8Bdttmy3xP6KogdTf0B5Di8=,26,1,0,0,0,2016-11-18,2017-05-24,0,transactions_v2.csv,18,de5c9457-650f-45b6-8ca2-4a226d2e49d9,2016,11
4,VIm9MHW1FcFvwKm5AauSYxBMnr0yUuPuWn+SRbxzNDk=,37,30,149,149,1,2016-11-10,2016-12-10,0,transactions.csv,10,6b208684-2544-43a3-b9ba-3793222743d0,2016,11


In [40]:
df_members.select("registration_date").distinct().show()




+-----------------+
|registration_date|
+-----------------+
|       2014-11-12|
|       2013-01-22|
|       2015-03-09|
|       2013-05-21|
|       2012-10-06|
|       2014-09-26|
|       2013-09-09|
|       2013-03-26|
|       2015-05-19|
|       2012-04-17|
|       2016-03-01|
|       2007-11-23|
|       2007-04-20|
|       2009-07-25|
|       2010-08-11|
|       2007-11-15|
|       2005-06-06|
|       2006-05-17|
|       2006-05-21|
|       2009-11-22|
+-----------------+
only showing top 20 rows



# Set Today's date (aka inference date)

This date is training cutoff. we train on all users before this date

In [41]:
inference_date = "2017-03-01"
print(f"Today's date is set to: {inference_date}")

Today's date is set to: 2017-03-01


In [42]:
from pyspark.sql.functions import to_date, col, lit

registered_users = (
    df_members
    .withColumn("registration_date", F.to_date("registration_date"))
    .filter(F.col("registration_date") <= F.to_date(F.lit(inference_date)))
    .withColumn(
        "tenure_days_at_snapshot",
        F.datediff(F.to_date(F.lit(inference_date)), F.col("registration_date"))
    )
    .select(
        "msno",
        "registration_date",
        "tenure_days_at_snapshot",
        "registered_via",
        "city_clean",
        "via_oh",
        "city_oh"
    )
)


print("Registered users up to", inference_date, ":", registered_users.count())
registered_users.show(5, truncate=False)

Registered users up to 2017-03-01 : 6618540




+--------------------------------------------+-----------------+-----------------------+--------------+----------+---------------+--------------+
|msno                                        |registration_date|tenure_days_at_snapshot|registered_via|city_clean|via_oh         |city_oh       |
+--------------------------------------------+-----------------+-----------------------+--------------+----------+---------------+--------------+
|Z1SBrlbnzZzQZtlS3CUmas9SQkJyb0B1+gEeuP/V9O0=|2016-06-12       |262                    |1             |1         |(18,[14],[1.0])|(21,[0],[1.0])|
|2GkgHuwB+NCVnpSRSxw0nyzPKeCTeVOoYiu2TdRg4qg=|2016-01-19       |407                    |1             |1         |(18,[14],[1.0])|(21,[0],[1.0])|
|X1AmJaNJ1bpGEgxLveRwBhxGHytaIHHuNfAAPFKCFg4=|2016-12-27       |64                     |13            |1         |(18,[5],[1.0]) |(21,[0],[1.0])|
|bJrFIXFymwWPhavSKeaA+yKE/Du1vgUwjEI3bJNQRuE=|2017-01-14       |46                     |13            |1         |(18,[5],[1

                                                                                

# Aggregate userlogs data
Aggregate all userlog data from the last 30 days for all members

In [43]:
from pyspark.sql.functions import sum as _sum, to_date, col, lit, date_sub

ref_today = to_date(lit(inference_date))
lower30 = F.date_sub(ref_today, 29)   # inclusive window: [today-29, today]
lower7  = F.date_sub(ref_today, 6)    # inclusive window: [today-6,  today]

# Filter logs to the last 30 days
userlogs_30d = (
    df_userlogs
    .withColumn("date", to_date(col("date")))
    .filter((col("date") >= lower30) & (col("date") <= ref_today))
)

userlogs_7d = (
    df_userlogs
    .withColumn("date", to_date(col("date")))
    .filter((col("date") >= lower7) & (col("date") <= ref_today))
)




all user activity for the past 1 month (inference date) and past 7 days.

In [44]:
from pyspark.sql import functions as F

today = "2017-03-01"
ref_today = F.to_date(F.lit(today))
lower30 = F.date_sub(ref_today, 29)
lower7  = F.date_sub(ref_today, 6)

# Convert date column properly
df_userlogs = df_userlogs.withColumn("date", F.to_date("date"))

# Last 30 days
userlogs_30d = df_userlogs.filter((F.col("date") >= lower30) & (F.col("date") <= ref_today))

# Last 7 days
userlogs_7d  = df_userlogs.filter((F.col("date") >= lower7) & (F.col("date") <= ref_today))

# --- PRINT WINDOW RANGES CLEANLY ---
minmax_30 = userlogs_30d.select(F.min("date").alias("min_date"), F.max("date").alias("max_date")).collect()[0]
minmax_7  = userlogs_7d.select(F.min("date").alias("min_date"), F.max("date").alias("max_date")).collect()[0]

print(f"=== 30-day Window ===")
print(f"Expected Range : {lower30} → {ref_today}")
print(f"Actual Data    : {minmax_30['min_date']} → {minmax_30['max_date']}\n")

print(f"=== 7-day Window ===")
print(f"Expected Range : {lower7} → {ref_today}")
print(f"Actual Data    : {minmax_7['min_date']} → {minmax_7['max_date']}\n")

# Show sample rows for visual check
print("Sample rows (30-day):")
userlogs_30d.orderBy("date").show(5, truncate=False)

print("Sample rows (7-day):")
userlogs_7d.orderBy("date").show(5, truncate=False)


                                                                                

=== 30-day Window ===
Expected Range : Column<'date_sub(to_date(2017-03-01), 29)'> → Column<'to_date(2017-03-01)'>
Actual Data    : 2017-01-31 → 2017-02-28

=== 7-day Window ===
Expected Range : Column<'date_sub(to_date(2017-03-01), 6)'> → Column<'to_date(2017-03-01)'>
Actual Data    : 2017-02-23 → 2017-02-28

Sample rows (30-day):


                                                                                

+--------------------------------------------+----------+------+------+------+-------+-------+-------+----------+----+-----+
|msno                                        |date      |num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|year|month|
+--------------------------------------------+----------+------+------+------+-------+-------+-------+----------+----+-----+
|MFgsxN5hxmtImcDzvIMS8re+fOYqIcJ/NxZQ9pOtdPk=|2017-01-31|5     |3     |1     |1      |4      |14     |1765.493  |2017|1    |
|6f2d8xP4+7I5dmNCs3L1bxDkMu4nTIiG2WaMFzXwWtI=|2017-01-31|1     |0     |0     |1      |1      |3      |370.908   |2017|1    |
|s/YsCwEawjHllDo4esbHe9+mlHsf+A5Sj9Wr8QXXD2c=|2017-01-31|6     |2     |2     |1      |42     |50     |11294.512 |2017|1    |
|w9IAl6AP2KDLbLc9QOH5zmqo/i+3WFE5eoz2dj7fo6o=|2017-01-31|0     |0     |0     |1      |19     |19     |5042.345  |2017|1    |
|jMnISH5O9cUm4HK/0i9A2Yw14zxD9KoLoxrpnVkmhY0=|2017-01-31|0     |0     |0     |0      |8      |8      |1984.512  |2017|1    |




+--------------------------------------------+----------+------+------+------+-------+-------+-------+----------+----+-----+
|msno                                        |date      |num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|year|month|
+--------------------------------------------+----------+------+------+------+-------+-------+-------+----------+----+-----+
|AykiiZt/O0tj/C2vQJguJsy9XUY5ntQx3s7joKmQqXU=|2017-02-23|2     |2     |0     |0      |0      |3      |181.891   |2017|2    |
|Q/HIJwlgnmJ/uNIWKzttysNMAG0prIvbWXt/8TpTwOM=|2017-02-23|8     |0     |0     |0      |1      |9      |342.633   |2017|2    |
|TXYD1o7RFxaEtVoIhAXRBuAion+OLv+SaKSV6UrkCBY=|2017-02-23|0     |0     |2     |2      |11     |9      |3484.156  |2017|2    |
|23/PDSfc0cUySvkV5rIPtl9vit6kr3lZSn/Bei4+BzE=|2017-02-23|0     |1     |0     |0      |16     |16     |3640.11   |2017|2    |
|N3LSaE5py+qQZqzGXNa6cdMOmZNce3WGHsf/DSebU/4=|2017-02-23|2     |0     |1     |2      |39     |32     |10297.15  |2017|2    |


                                                                                

In [45]:
registered_users.select("msno").distinct().count()

                                                                                

6618540

In [46]:
userlogs_30d.select("msno").distinct().count()

                                                                                

1098181

In [47]:
userlogs_7d.select("msno").distinct().count()

                                                                                

847338

In [48]:
user_sum_30d = (
    userlogs_30d
    .groupBy("msno")
    .agg(F.sum("total_secs").alias("sum_secs_w30"))
)

registered_users = (
    registered_users
    .join(user_sum_30d, on="msno", how="left")
    .na.fill({"sum_secs_w30": 0.0})   # fill 0 for users with no activity
)




In [49]:
user_active_days_30d = (
    userlogs_30d
    .groupBy("msno")
    .agg(F.countDistinct("date").alias("active_days_w30"))
)

registered_users = (
    registered_users
    .join(user_active_days_30d, on="msno", how="left")
    .na.fill({"active_days_w30": 0})
)





In [50]:
user_complete_rate_30d = (
    userlogs_30d
    .groupBy("msno")
    .agg(
        (F.sum("num_100") / F.sum("num_unq")).alias("complete_rate_w30")
    )
)

# Join with registered_users
registered_users = (
    registered_users
    .join(user_complete_rate_30d, on="msno", how="left")
    .na.fill({"complete_rate_w30": 0.0})
)


In [51]:
user_sum_7d = (
    userlogs_7d
    .groupBy("msno")
    .agg(F.sum("total_secs").alias("sum_secs_w7"))
)

# === 2️⃣ Join with registered_users ===
registered_users = (
    registered_users
    .join(user_sum_7d, on="msno", how="left")
    .na.fill({"sum_secs_w7": 0.0})
)

# === 3️⃣ Compute engagement_ratio_7_30 ===
registered_users = (
    registered_users
    .withColumn(
        "engagement_ratio_7_30",
        F.col("sum_secs_w7") / F.when(F.col("sum_secs_w30") > 0, F.col("sum_secs_w30")).otherwise(F.lit(1))
    )
)


In [52]:
last_play = (
    df_userlogs
    .filter(F.col("date") <= ref_today)
    .groupBy("msno")
    .agg(F.max("date").alias("last_play_date"))
)

# 3) Join to registered_users and compute gap
registered_users = (
    registered_users
    .join(last_play, on="msno", how="left")
    .withColumn("days_since_last_play", F.datediff(ref_today, F.col("last_play_date")))
)

In [53]:
from pyspark.sql import Window

# 1️⃣ Aggregate total seconds per user per day (in the 30d window)
daily_secs = (
    userlogs_30d
    .groupBy("msno", "date")
    .agg(F.sum("total_secs").alias("daily_secs"))
)

# 2️⃣ Assign day index relative to the start of the window
window_spec = Window.partitionBy("msno").orderBy("date")
daily_secs = (
    daily_secs
    .withColumn("day_idx", F.row_number().over(window_spec))
)

# 3️⃣ Compute slope = cov(x, y) / var(x)
trend = (
    daily_secs
    .groupBy("msno")
    .agg(
        (F.covar_pop("day_idx", "daily_secs") / F.var_pop("day_idx")).alias("trend_secs_w30")
    )
)

# 4️⃣ Join back to registered_users
registered_users = (
    registered_users
    .join(trend, on="msno", how="left")
    .na.fill({"trend_secs_w30": 0.0})
)

In [54]:
registered_users = registered_users.na.fill(0)
registered_users.show(5)



+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+
|                msno|registration_date|tenure_days_at_snapshot|registered_via|city_clean|        via_oh|       city_oh|sum_secs_w30|active_days_w30| complete_rate_w30|sum_secs_w7|engagement_ratio_7_30|last_play_date|days_since_last_play|trend_secs_w30|
+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+
|++4RuqBw0Ss6bQU4o...|       2014-07-14|                    961|             7|         1|(18,[3],[1.0])|(21,[0],[1.0])|    1368.191|              1|0.5714285714285714|        0.0|                  0.0|    2017-02-20|                   9|

                                                                                

# Transaction data

In [55]:
df_transactions.show(5)

+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+-------------------+---+--------------------+----+-----+
|                msno|payment_method_id|payment_plan_days|plan_list_price|actual_amount_paid|is_auto_renew|transaction_date|membership_expire_date|is_cancel|        source_file|day|      transaction_id|year|month|
+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+-------------------+---+--------------------+----+-----+
|VIHHLepUEMXxF2iel...|               41|               30|            149|               149|            1|      2016-11-08|            2016-12-08|        0|   transactions.csv|  8|84c76dce-ee16-403...|2016|   11|
|TkAhpvvz+vU7LBuVH...|               26|                1|              0|                 0|            0|      2016-11-23|            2016-11-

In [56]:

df_transactions_filtered = (
    df_transactions
    .filter(F.to_date(F.col("transaction_date")) <= F.to_date(F.lit(inference_date)))
)


In [57]:
# Latest transaction per user
latest_tx = (df_transactions_filtered.groupBy("msno")
               .agg(F.max("transaction_date").alias("latest_transaction_date")))

# Join and compute tenure_days
registered_users = (registered_users
    .join(latest_tx, on="msno", how="left")
    .withColumn("tenure_days",
                F.datediff(F.col("latest_transaction_date"), F.col("registration_date")))
    .na.fill({"tenure_days": 0})
)


In [58]:
registered_users.show(5)



+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+-----------------------+-----------+
|                msno|registration_date|tenure_days_at_snapshot|registered_via|city_clean|        via_oh|       city_oh|sum_secs_w30|active_days_w30| complete_rate_w30|sum_secs_w7|engagement_ratio_7_30|last_play_date|days_since_last_play|trend_secs_w30|latest_transaction_date|tenure_days|
+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+-----------------------+-----------+
|++4RuqBw0Ss6bQU4o...|       2014-07-14|                    961|             7|         1|(18,[3],[1.0])|(21,[0],[1.0])|    1368.1

                                                                                

In [59]:
# 1️⃣ Compute counts per user
auto_renew_stats = (
    df_transactions_filtered
    .groupBy("msno")
    .agg(
        F.sum(F.when(F.col("is_auto_renew") == 1, 1).otherwise(0)).alias("auto_renew_count"),
        F.count("*").alias("total_tx_before_expire")
    )
    .withColumn(
        "auto_renew_share",
        F.col("auto_renew_count") / F.when(F.col("total_tx_before_expire") > 0, F.col("total_tx_before_expire")).otherwise(F.lit(1))
    )
)

# 2️⃣ Join to registered_users
registered_users = (
    registered_users
    .join(auto_renew_stats.select("msno", "auto_renew_share"), on="msno", how="left")
    .na.fill({"auto_renew_share": 0.0})
)


In [60]:
registered_users.show(5)



+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+-----------------------+-----------+----------------+
|                msno|registration_date|tenure_days_at_snapshot|registered_via|city_clean|        via_oh|       city_oh|sum_secs_w30|active_days_w30| complete_rate_w30|sum_secs_w7|engagement_ratio_7_30|last_play_date|days_since_last_play|trend_secs_w30|latest_transaction_date|tenure_days|auto_renew_share|
+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+-----------------------+-----------+----------------+
|++4RuqBw0Ss6bQU4o...|       2014-07-14|                    961|             7|

                                                                                

In [61]:
w_latest = Window.partitionBy("msno").orderBy(F.col("transaction_date").desc())

# 2️⃣ Rank transactions and keep the latest
latest_tx_flag = (
    df_transactions_filtered
    .withColumn("rn", F.row_number().over(w_latest))
    .filter(F.col("rn") == 1)
    .select("msno", F.col("is_auto_renew").alias("last_is_auto_renew"))
)

# 3️⃣ Join into registered_users
registered_users = (
    registered_users
    .join(latest_tx_flag, on="msno", how="left")
    .na.fill({"last_is_auto_renew": 0})
)

In [62]:
registered_users.show(5)

[Stage 294:>                                                        (0 + 1) / 1]

+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+-----------------------+-----------+----------------+------------------+
|                msno|registration_date|tenure_days_at_snapshot|registered_via|city_clean|        via_oh|       city_oh|sum_secs_w30|active_days_w30| complete_rate_w30|sum_secs_w7|engagement_ratio_7_30|last_play_date|days_since_last_play|trend_secs_w30|latest_transaction_date|tenure_days|auto_renew_share|last_is_auto_renew|
+--------------------+-----------------+-----------------------+--------------+----------+--------------+--------------+------------+---------------+------------------+-----------+---------------------+--------------+--------------------+--------------+-----------------------+-----------+----------------+------------------+
|++4RuqBw0Ss6bQU4o...|

                                                                                

In [2]:
output_path = "datamart/gold/feature_store/date"

(registered_users.write.mode("overwrite").parquet(output_path))

NameError: name 'registered_users' is not defined

In [2]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .config("spark.driver.memory", "4g") \
    .master("local[*]") \
    .getOrCreate()
# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/31 07:30:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
from utils.gold_feature_store import create_gold_features
from datetime import datetime
from dateutil.relativedelta import relativedelta

start_date = datetime(2016, 12, 1)
end_date   = datetime(2017, 3, 1)

current = start_date
while current <= end_date:
    inference_date = current.strftime("%Y-%m-%d")
    print(f"🏗️ Creating features for {inference_date} ...")
    create_gold_features(inference_date, spark)
    current += relativedelta(months=1)

print("✅ All monthly feature stores created successfully!")


[Stage 42:=>(31 + 3) / 34][Stage 44:=> (6 + 7) / 13][Stage 46:>  (0 + 2) / 13]2 + 2) / 34][Stage 44:=> (6 + 7) / 13][Stage 46:>  (0 + 3) / 13]

🏗️ Creating features for 2016-12-01 ...


                                                                                

✅ Feature store snapshot saved to: datamart/gold/feature_store/2016-12-01
🏗️ Creating features for 2017-01-01 ...


                                                                                

✅ Feature store snapshot saved to: datamart/gold/feature_store/2017-01-01
🏗️ Creating features for 2017-02-01 ...


[Stage 125:(53 + 12) / 158][Stage 126:>(0 + 0) / 158][Stage 127:>(0 + 0) / 158]]

[Stage 42:=>(31 + 3) / 34][Stage 44:=> (6 + 7) / 13][Stage 46:>  (0 + 2) / 13]