---
# Algoritmos para Big Data

**Handout 3 - Feature extraction and transformation**

**2024/25**

This lab class will focus on feature extractors and transformers, which are are critical components in the field of machine learning and 
also for data preprocessing.

This notebook should contain the implementation of the tasks presented in the handout.

Hence both handout and notebook must be considered together as one.

---
# Task A - Data ingestion

**Datasest**

Recall that the file can be downloaded from 

https://bigdata.iscte-iul.eu/datasets/iot-devices.csv



**Spark setup**

In [1]:
# Basic imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


In [2]:
# Build SparkSession
spark = SparkSession.builder.appName('Features').getOrCreate()

**Reading and checking data**

In [3]:
# Reading data
data_dir = '../../Datasets/'
file_iot = data_dir + 'credit-cards-transactions.csv'

! head $file_iot


User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No
0,0,2002,9,3,13:53,$86.19,Swipe Transaction,-7146670748125200898,Monterey Park,CA,91755.0,5970,,No
0,0,2002,9,4,05:51,$93.84,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
0,0,2002,9,4,06:09,$123.50,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
0,0,2002,9,5,06:14,$61.72,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No


In [4]:

df_cards = spark.read.csv(file_iot, header=True, sep=',', inferSchema=True)


In [5]:
# Checking iot data
df_cards.show(10,truncate=False)
print(f'df_cards - number of rows: {df_cards.count()      }')
df_cards.printSchema()
      

+----+----+----+-----+---+-------------------+-------+-----------------+--------------------+-------------+--------------+-------+----+-------+---------+
|User|Card|Year|Month|Day|Time               |Amount |Use Chip         |Merchant Name       |Merchant City|Merchant State|Zip    |MCC |Errors?|Is Fraud?|
+----+----+----+-----+---+-------------------+-------+-----------------+--------------------+-------------+--------------+-------+----+-------+---------+
|0   |0   |2002|9    |1  |2025-03-27 06:21:00|$134.09|Swipe Transaction|3527213246127876953 |La Verne     |CA            |91750.0|5300|NULL   |No       |
|0   |0   |2002|9    |1  |2025-03-27 06:42:00|$38.48 |Swipe Transaction|-727612092139916043 |Monterey Park|CA            |91754.0|5411|NULL   |No       |
|0   |0   |2002|9    |2  |2025-03-27 06:22:00|$120.34|Swipe Transaction|-727612092139916043 |Monterey Park|CA            |91754.0|5411|NULL   |No       |
|0   |0   |2002|9    |2  |2025-03-27 17:45:00|$128.95|Swipe Transaction|3414

In [6]:
df_cards = df_cards.dropDuplicates()

In [7]:
#print(f'df_cards - number of rows is {df_cards.count()}; after dropDuplicates() applied would be {df_cards.dropDuplicates().count()}.')

In [8]:
print(f'''df_iot - number of rows after dropna(how='any') applied would be {df_iot.dropna(how='any').count()     }.''')

NameError: name 'df_iot' is not defined

In [None]:
print('Checking nulls at each column of df_iot')
dict_nulls_iot = {col: df_iot.filter(df_iot[col].isNull()).count() for col in df_iot.columns}
dict_nulls_iot

In [None]:
# Prepare a dataframe with columns of interest, as well as arrays with names of columns to look at later on

# column 'device': use F.regexp_replace() on column 'device_name' (use F.col()), replacing '-' by ' '.
# column 'device_words': use F.split() on column 'device_name' (use F.col()) by '-'

df_devices = ( df_iot
            .withColumn('device', F.regexp_replace(F.col('device_name'), '-', ' '))
            .withColumn('device_words', F.split(F.col('device_name'), '-'))
            .select('device_id','device', 'device_words', 'battery_level', 'c02_level', 'humidity', 'temp', 'cn')
)
df_devices.show(10,truncate=False)


In [None]:

# call describe on df_devices and show
df_devices.describe().show()

# numeric columns
input_cols_num = ['battery_level', 'c02_level', 'humidity', 'temp']
# string columns
input_cols_str = ['cn']
# all interest columns together
input_cols_all = input_cols_num + input_cols_str

In [None]:
# Plots (histograms) to grasp data

# df_devices
# input_cols_num = ['battery_level', 'c02_level', 'humidity', 'temp']
# input_cols_str = ['cn']


import plotly.express as px

df_pandas = df_devices.toPandas()

id = 2
# col = 'cn'
col = input_cols_num[id]
fig = px.histogram(df_pandas, x=col)
fig.show()

---
# Task B - Basic statistics

Applying the following statistical algorithms upon the dataframe of interest:
- Correlation, with help of feature transformer VectorAssembler
- Summarizer

In [None]:
# Correlations among numeric columns
#
# Correlation needs vectors so we convert to vector column first
# See VectorAssembler

from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# The columns to compute correlations - numeric types but no nulls
cols_corr = input_cols_num

# Assemble columns
col_features = "features"
assembler = VectorAssembler(inputCols=cols_corr, outputCol=col_features, handleInvalid = "skip")
# Apply transform on df_devices an select col_futures 
df_features = assembler.transform(df_devices)
    

df_features.show(10, truncate=False)

# Get correlation matrix - it can be Pearson’s (default) or Spearman’s correlation
corr_matrix = Correlation.corr(df_features, col_features).collect()[0][0].toArray().tolist()

corr_matrix


In [None]:
# Plot computed correlation
fig = px.imshow(corr_matrix, text_auto=True)
fig.show()

In [None]:
# Summarizer
from pyspark.ml.stat import Summarizer

summarizer = Summarizer.metrics('min', 'max', 'mean', 'sum', 'count', 'variance', 'std', 'normL1', 'normL2')

df_features.show(10, truncate=False)

print('Aggregated metric below:\n')
df_features.select(summarizer.summary(df_features.features)).show(truncate=False)

print('Single metrics below:\n')
df_features.select(Summarizer.min(df_features.features)).show(truncate=False)
df_features.select(Summarizer.max(df_features.features)).show(truncate=False)
df_features.select(Summarizer.mean(df_features.features)).show(truncate=False)
df_features.select(Summarizer.sum(df_features.features)).show(truncate=False)
df_features.select(Summarizer.count(df_features.features)).show(truncate=False)
df_features.select(Summarizer.variance(df_features.features)).show(truncate=False)
df_features.select(Summarizer.std(df_features.features)).show(truncate=False)
df_features.select(Summarizer.normL2(df_features.features)).show(truncate=False)

---
# Task C - Features extraction

Extracting features frow raw data, according to the following algorithms:
- TF-IDF
- Word2Vec
- CountVectorizer
- FeatureHasher

In [None]:

df_devices.show(10,truncate=False)

In [None]:
# TF-IDF
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="device_words", outputCol="rawFeatures", numFeatures=10) 
hashing_df = hashingTF.transform(df_devices) 

hashing_df.show(10,truncate=False)


idf = IDF(inputCol="rawFeatures", outputCol="featuresTFIDF") 
idf_model = idf.fit(hashing_df) 

tfidf_df = idf_model.transform(hashing_df) 
tfidf_df.select('device_id','device_words','featuresTFIDF').show(10,truncate=False) 

In [None]:
# Word2Vec
from pyspark.ml.feature import Word2Vec

word2vec = Word2Vec(vectorSize=3, minCount=0, inputCol="device_words", outputCol="features")
word2vec_model = word2vec.fit(df_devices)
word2vec_df = word2vec_model.transform(df_devices)
word2vec_df.select("device_id", "device_words", "features").show(10,truncate=False) 

In [None]:
# CountVectorizer
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="device_words", outputCol="features", vocabSize=3) 
cv_model = cv.fit(df_devices)
cv_df = cv_model.transform(df_devices)
cv_df.select("device_id",'device_words', "features").show(10, truncate=False)

In [None]:
# FeatureHasher
from pyspark.ml.feature import FeatureHasher 

hasher = FeatureHasher(inputCols=input_cols_all, outputCol='features', numFeatures=4) 
hasher_df = hasher.transform(df_devices)
cols_to_show = ['device_id','device'] + input_cols_all + ['features']
hasher_df.select(cols_to_show).show(10,truncate=False)

---
# Task D - Feature transformation

Modifying features into more suitable formats, according to the following algorithms: 
- Tokenizer
- StringIndexer
- OneHotEncoder
- Binarizer
- Bucketizer
- StandardScaler
- MinMaxScaler
- PCA

In [None]:
# Tokenizer
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol='device', outputCol='device_tokens')
df_tokenizer = tokenizer.
df_tokenizer.select('device_id', 'device', 'device_words', 'device_tokens').show(10, truncate=False)

In [None]:
# StringIndexer
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="cn", outputCol="cn_indexed") 
indexer_model = indexer.
indexer_df = indexer_model.
indexer_df.select('device_id', 'device', 'cn', 'cn_indexed').show(10, truncate=False) 

In [None]:
# OneHotEncoder
from pyspark.ml.feature import OneHotEncoder

incols = ['battery_level', 'c02_level']
outcols = ['battery_level_vec', 'c02_level_vec']
encoder = OneHotEncoder(inputCols=incols,outputCols=outcols) 
encoder_model = encoder.
encoder_df = encoder_model.
cols_to_show = ['device_id', 'device'] + incols + outcols
encoder_df.select(cols_to_show).show(10,truncate=False)


In [None]:
# Binarizer
from pyspark.ml.feature import Binarizer

# As operator requires numeric data but not integer, we have to create a new but adequate column
df = df_devices.select('device_id', 'device','battery_level').withColumn('battery_level_d', F.col('battery_level').cast('double'))

threshold_cut = 5.0
binarizer = Binarizer(threshold=threshold_cut, inputCol='battery_level_d', outputCol='battery_level_binary') 
binarizer_df = binarizer.
print(f'Binarizer output with threshold {threshold_cut}:') 
binarizer_df.show(10, truncate=False) 

In [None]:
# Bucketizer
from pyspark.ml.feature import Bucketizer 
 
splits = [0.0, 2.0, 4.0, float('inf')] 
bucketizer = Bucketizer(splits=splits, inputCol='battery_level', outputCol='battery_level_bucket') 
bucketizer_df = bucketizer.
print(f'Bucketizer output with {(len(bucketizer.getSplits()) - 1)} buckets:') 
bucketizer_df.select('device_id', 'device', 'battery_level', 'battery_level_bucket').show(10, truncate=False)

In [None]:
# StandardScaler
from pyspark.ml.feature import StandardScaler 

# Assemble columns
col_features = 'features'
col_features_scaled = 'features_scaled' 
assembler = VectorAssembler(inputCols=input_cols_num, outputCol=col_features, handleInvalid = "skip") # "keep"
df_features = assembler.

scaler = StandardScaler(inputCol=col_features, outputCol=col_features_scaled, withStd=True, withMean=True) 
scaler_model = scaler.
scaler_df = scaler_model.

cols_to_show = ['device_id','device'] + input_cols_num + [col_features, col_features_scaled]
scaler_df.select(cols_to_show).show(10,truncate=False) 


In [None]:
# MinMaxScaler
from pyspark.ml.feature import MinMaxScaler

#col_features = 'features'
#col_features_scaled = 'features_scaled' 
#assembler = VectorAssembler(inputCols=input_cols_num, outputCol=col_features, handleInvalid = "skip") # "keep"
#df_features = assembler.transform(df_devices)

# Use of col_features, col_features_scaled, and df_features 
# from previous exercise StandardScaler

scaler = MinMaxScaler(inputCol=col_features, outputCol=col_features_scaled) 
scaler_model = scaler.
scaler_df = scaler_model.

cols_to_show = ['device_id','device'] + input_cols_num + [col_features, col_features_scaled]
print(f'Features scaled to range [{scaler.getMin()}, {scaler.getMax()}]:') 
scaler_df.select(cols_to_show).show(10,truncate=False)  

In [None]:
# PCA (Principal Component Analysis)
from pyspark.ml.feature import PCA 

# Use of col_features and df_features 
# from previous exercise StandardScaler

col_features_pca = 'features_PCA'
pca = PCA(k=2, inputCol=col_features, outputCol=col_features_pca) 
pca_model = pca.
pca_df = pca_model.

cols_to_show = ['device_id','device'] + input_cols_num + [col_features, col_features_pca] 
pca_df.select(cols_to_show).show(10,truncate=False)


---
# Task E - Feature selection

Selecting a relevant subset of features from a larger set of features
- ChiSqSelector
- VectorSlicer


In [None]:
# ChiSqSelector
from pyspark.ml.feature import ChiSqSelector 

# Use of col_features and df_features 
# from previous exercise StandardScaler

# Add a label column, such as danger = 1 if battery_level < 2 || c02_level > 1000 || temp > 26; 0 otherwise
df = df_features.withColumn('danger', 
            F.when(        , 1.0)
            .otherwise(0.0)
        )
# df.show()
col_selected_features = 'selected_features'
col_label = 'danger'
selector = ChiSqSelector(numTopFeatures=2, featuresCol=col_features, outputCol=col_selected_features, labelCol=col_label) 
selector_model = selector.
selector_df = selector_model.

cols_to_show = ['device_id','device'] + input_cols_num + [col_features, col_label, col_selected_features] 
selector_df.select(cols_to_show).show(10,truncate=False)

In [None]:
# VectorSlicer
from pyspark.ml.feature import VectorSlicer

# Use of col_features and df_features 
# from previous exercise StandardScaler

col_selected_features = 'selected_features'
# indices: humidity -> 2, temp -> 3
slicer = VectorSlicer(inputCol=col_features, outputCol=col_selected_features, indices=[2,3]) 
slicer_df = slicer.

cols_to_show = ['device_id','device'] + input_cols_num + [col_features, col_selected_features] 
slicer_df.select(cols_to_show).show(10,truncate=False)