In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder \
    .appName("mkt_analysis") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/22 23:35:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Read a CSV file with header and infer schema
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("delimiter", ",") \
    .csv("../data/Advertising_Data.csv")

In [4]:
df.show(5)

+------+----------+----------+------------+--------------------+-------------------+------------+
|    TV|Billboards|Google_Ads|Social_Media|Influencer_Marketing|Affiliate_Marketing|Product_Sold|
+------+----------+----------+------------+--------------------+-------------------+------------+
|281.42|     538.8|    123.94|       349.3|              242.77|              910.1|      7164.0|
|702.97|    296.53|    558.13|      180.55|              781.06|             132.43|      5055.0|
|313.14|    295.94|    642.96|      505.71|              438.91|             464.23|      6154.0|
|898.52|     61.27|    548.73|      240.93|              278.96|             432.27|      5480.0|
|766.52|    550.72|    651.91|      666.33|              396.33|             841.93|      9669.0|
+------+----------+----------+------------+--------------------+-------------------+------------+
only showing top 5 rows



In [None]:
# Feature columns (excluding Product_Sold which is the target)
feature_cols = ["TV", "Billboards", "Google_Ads", "Social_Media", 
                "Influencer_Marketing", "Affiliate_Marketing"]

In [None]:
# Step 1: Assemble features into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Demonstration of generated vectors...
output = assembler.transform(df)
output.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|[281.42,538.8,123...|
|[702.97,296.53,55...|
|[313.14,295.94,64...|
|[898.52,61.27,548...|
|[766.52,550.72,65...|
+--------------------+
only showing top 5 rows



In [27]:
# Step 2: Standardize the features
scaler = StandardScaler(inputCol="features", outputCol="scaled_features",
                        withStd=True, withMean=True)

# Demonstration of scaled features...
scaled_output = scaler.fit(output).transform(output)
scaled_output.select("scaled_features").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------+
|scaled_features                                                                                                            |
+---------------------------------------------------------------------------------------------------------------------------+
|[-0.8191561355470773,0.13107147661820287,-1.3611551377860516,-0.5129914254811337,-0.7733322182187209,1.5314301427984658]   |
|[0.643975959736068,-0.7472199930962898,0.1600640683124651,-1.1291283752053067,1.0936941888072516,-1.2663922339352824]      |
|[-0.7090611288755525,-0.7493588958375765,0.4572727217898304,0.058089939297283226,-0.09303245512805543,-0.07267579302813974]|
|[1.3226984866785874,-1.6000983980371886,0.12713042051466686,-0.9086700106165922,-0.6478093854143405,-0.18765824465982248]  |
|[0.8645477653468944,0.17428456250996338,0.4886297587887977,0.6445427799769673,-0.24071861762627383,1.2861750199032278

In [28]:
# Step 3: Apply PCA
# Step 3: Apply PCA with component exploration

# Method 1: Try different k values and analyze explained variance
k_values = [2, 3, 4, 5, 6]  # All possible values since we have 6 features
explained_variances = []

for k in k_values:
    pca_temp = PCA(k=k, inputCol="scaled_features", outputCol="pca_features")
    pca_model = pca_temp.fit(scaled_output)
    explained_var = pca_model.explainedVariance.toArray()
    total_explained = sum(explained_var)
    explained_variances.append((k, total_explained, explained_var))
    print(f"k={k}: Total explained variance = {total_explained:.4f}")
    print(f"  Individual variances: {explained_var}")
    print()

# Method 2: Find optimal k based on cumulative explained variance threshold
cumulative_variance = 0
optimal_k = 2
variance_threshold = 0.95  # Capture 95% of variance

for k, total_var, individual_vars in explained_variances:
    if total_var >= variance_threshold:
        optimal_k = k
        break

print(f"Optimal k for {variance_threshold*100}% variance: {optimal_k}")

# Method 3: Elbow method - look for diminishing returns
print("\nVariance contribution by each component:")
for k, total_var, individual_vars in explained_variances:
    if len(individual_vars) > 1:
        marginal_contribution = individual_vars[-1] if k > 2 else individual_vars[0]
        print(f"Adding component {k}: marginal variance = {marginal_contribution:.4f}")

# Final PCA with chosen k (using optimal_k or manual selection)
k_selected = 3  # You can change this to optimal_k or any preferred value
pca = PCA(k=k_selected, inputCol="scaled_features", outputCol="pca_features")
print(f"\nUsing k={k_selected} components for final PCA")


k=2: Total explained variance = 0.3772
  Individual variances: [0.20029233 0.17685879]

k=3: Total explained variance = 0.5481
  Individual variances: [0.20029233 0.17685879 0.17090627]

k=4: Total explained variance = 0.7104
  Individual variances: [0.20029233 0.17685879 0.17090627 0.16233868]

k=5: Total explained variance = 0.8664
  Individual variances: [0.20029233 0.17685879 0.17090627 0.16233868 0.15597116]

k=6: Total explained variance = 1.0000
  Individual variances: [0.20029233 0.17685879 0.17090627 0.16233868 0.15597116 0.13363277]

Optimal k for 95.0% variance: 6

Variance contribution by each component:
Adding component 2: marginal variance = 0.2003
Adding component 3: marginal variance = 0.1709
Adding component 4: marginal variance = 0.1623
Adding component 5: marginal variance = 0.1560
Adding component 6: marginal variance = 0.1336

Using k=3 components for final PCA


In [None]:
# Step 4: Create and fit the pipeline
pipeline = Pipeline(stages=[assembler, scaler, pca])
model = pipeline.fit(df)
result = model.transform(df)

In [None]:
# Display results
result.select("pca_features").show(5, truncate=False)