In [138]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from pyspark.sql import SparkSession

# Stop any existing Spark sessions
try:
    spark.stop()
except:
    pass

In [None]:
# Create SparkSession with additional configs to avoid connection issues
spark = (SparkSession.builder
    .master("local[*]")
    .appName("feature_discovery")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.ui.enabled", "false")  # Disable Spark UI to avoid port conflicts
    .getOrCreate())

df = spark.read.options(header=True, inferSchema='True',delimiter=',').csv("data/bank-additional-full.csv")

df.head()

In [141]:
df.show(5)

+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|      job|marital|  education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp.var.rate|cons.price.idx|cons.conf.idx|euribor3m|nr.employed|  y|
+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 57| services|married|high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 37| serv

In [142]:
df = df.toDF(*[c.replace(".", "_") for c in df.columns])
df.show(5)

+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|      job|marital|  education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|  y|
+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 57| services|married|high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 37| serv

In [143]:
print(df.columns)

['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']


In [144]:
for col, _ctype in df.dtypes:
    if _ctype in ["string"]:
        print("*"*10)
        print(col)
        category_rows = df.select(col).distinct().collect()
        categories = [row[0] for row in category_rows]
        print(categories)

**********
job
['management', 'retired', 'unknown', 'self-employed', 'student', 'blue-collar', 'entrepreneur', 'admin.', 'technician', 'services', 'housemaid', 'unemployed']
**********
marital
['unknown', 'divorced', 'married', 'single']
**********
education
['high.school', 'unknown', 'basic.6y', 'professional.course', 'university.degree', 'illiterate', 'basic.4y', 'basic.9y']
**********
default
['unknown', 'no', 'yes']
**********
housing
['unknown', 'no', 'yes']
**********
loan
['unknown', 'no', 'yes']
**********
contact
['cellular', 'telephone']
**********
month
['jun', 'aug', 'may', 'mar', 'oct', 'jul', 'nov', 'apr', 'dec', 'sep']
**********
day_of_week
['fri', 'thu', 'tue', 'wed', 'mon']
**********
poutcome
['success', 'failure', 'nonexistent']
**********
y
['no', 'yes']


In [145]:
category_rows = df.select("job").distinct().collect()
categories = [row[0] for row in category_rows]
print(categories)

['management', 'retired', 'unknown', 'self-employed', 'student', 'blue-collar', 'entrepreneur', 'admin.', 'technician', 'services', 'housemaid', 'unemployed']


In [None]:
from core.discovery import Problem

In [148]:
import pyspark.sql.functions as sf

null_counts = df.select([sf.count(sf.when(sf.col(c).isNull(), c)).alias(c) for c in df.columns])

null_counts.show()
# print([c for c in df.columns])



+---+---+-------+---------+-------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---+
|age|job|marital|education|default|housing|loan|contact|month|day_of_week|duration|campaign|pdays|previous|poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|  y|
+---+---+-------+---------+-------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---+
|  0|  0|      0|        0|      0|      0|   0|      0|    0|          0|       0|       0|    0|       0|       0|           0|             0|            0|        0|          0|  0|
+---+---+-------+---------+-------+-------+----+-------+-----+-----------+--------+--------+-----+--------+--------+------------+--------------+-------------+---------+-----------+---+



In [150]:
null_counts['age']

Column<'age'>

In [None]:
for c in null_counts.columns:
    # print(type(null_counts))
    # print(type(c))
    # print(null_counts.select(str(c)).first()[0])
    if null_counts.select(str(c)).first()[0] > 0:
        print(c)

In [None]:
cols_with_nulls = [c for c in df.columns if null_counts[c] > 0]
print(f"Columns with nulls: {cols_with_nulls}")

Auto Feature Generator

In [151]:
from core.auto_feature_generator import AutoFeatureGenerator

In [None]:
# Reuse existing SparkSession instead of creating a new one
# from pyspark.sql import SparkSession 
# spark = SparkSession.builder.master("local[*]").appName("testing testing 123").getOrCreate()

# The spark session is already created above, no need to recreate it

In [153]:
df = spark.read.options(header=True, inferSchema='True',delimiter=',').csv("data/bank-additional-full.csv")

df = df.toDF(*[c.replace(".", "_").replace(" ", "_") for c in df.columns])
df.show(5)

df.head()

+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|      job|marital|  education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|  y|
+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 57| services|married|high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 37| serv

Row(age=56, job='housemaid', marital='married', education='basic.4y', default='no', housing='no', loan='no', contact='telephone', month='may', day_of_week='mon', duration=261, campaign=1, pdays=999, previous=0, poutcome='nonexistent', emp_var_rate=1.1, cons_price_idx=93.994, cons_conf_idx=-36.4, euribor3m=4.857, nr_employed=5191.0, y='no')

In [154]:
feature_gen = AutoFeatureGenerator(spark)
    
# Generate all features
df_with_features = feature_gen.generate_all_features(
    df,
    include_numerical=True,
    include_interactions=False,  # Set to True for interactions
    include_binning=True,
    include_datetime=True,
    include_string=True
)

Generating numerical features...
Generating binning features...
Generating datetime features...
Generating string features...
Total features generated: 111


In [155]:
# Show results
print("\nOriginal columns:", df.columns)
print("\nNew columns:", df_with_features.columns)
print("\nGenerated features:", feature_gen.get_generated_features())

df_with_features.show(5, truncate=False)


Original columns: ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']

New columns: ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y', 'age_log', 'age_sqrt', 'age_square', 'age_cube', 'duration_log', 'duration_sqrt', 'duration_square', 'duration_cube', 'campaign_log', 'campaign_sqrt', 'campaign_square', 'campaign_cube', 'pdays_log', 'pdays_sqrt', 'pdays_square', 'pdays_cube', 'previous_log', 'previous_sqrt', 'previous_square', 'previous_cube', 'emp_var_rate_log', 'emp_var_rate_sqrt', 'emp_var_rate_square', 'emp_var_rate_cube', 'cons_price_idx_log', 'cons_price_idx_sqrt', 'cons_price_idx_square', 'cons_pri

In [156]:
from core.metrics.base import Metrics
from core.discovery import Problem
from core.features.process import target_processing

In [157]:
problem = Problem(target="y", type="classification", desired_result="yes")

In [160]:
df_new = df.copy()

PySparkAttributeError: [ATTRIBUTE_NOT_SUPPORTED] Attribute `copy` is not supported.

25/12/23 23:40:21 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 126875 ms exceeds timeout 120000 ms
25/12/23 23:40:21 WARN SparkContext: Killing executors is not supported by current scheduler.
25/12/23 23:40:29 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$

In [158]:
df_new = target_processing(df, problem)
df_new.show(5)

+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|      job|marital|  education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp_var_rate|cons_price_idx|cons_conf_idx|euribor3m|nr_employed|  y|
+---+---------+-------+-----------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|housemaid|married|   basic.4y|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 57| services|married|high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|     5191.0| no|
| 37| serv

In [159]:
dimensions = df_new.columns
dimensions.remove(problem.target)
print(dimensions)

['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed']


In [None]:
calculator = Metrics(dataframe=df_new, problem=problem)

_ = calculator.calculate(dimensions=['job'])#categorical_features[:-1])
_.show()

In [None]:
from pyspark.sql import functions as psf
_ = df.groupby(["y"]).agg((psf.count(psf.col('y'))/df.count()))
_.show()

In [None]:
categorical_type = ["string"]

categorical_features = [_col for _col, _type in df.dtypes if _type in categorical_type]

categorical_features[:-1]

In [None]:
df.select("y").distinct().collect()