In [1]:
import os

from pyspark.sql import SparkSession

# Connection settings for the S3-compatible object store.
# They are read from the environment so that real credentials never have to be
# committed with the notebook; the fallbacks are the values this notebook used
# previously, so behaviour is unchanged when no variable is set.
S3_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
S3_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Build (or reuse) the Spark session with the S3A connector pointed at that store.
spark = (
    SparkSession.builder
    .appName("Test")
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    # Address buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Use the static access/secret key pair configured above.
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .getOrCreate()
)

In [6]:
# Location of the raw event log in the object store.
file_path = "s3a://datawarehouse/core_data.csv"

# Read the CSV, treating the first line as column names.
# No schema is supplied or inferred, so every column is loaded as a string.
data = spark.read.option("header", True).csv(file_path)

# Show the first record as a quick sanity check of the columns.
first_row = data.head()
print(first_row)

Row(event_time='2019-11-04T19:08:52.000Z', event_type='view', product_id='19100002', category_id='2053013556227473861', main_category='construction', sub_category='tools', subsub_category='saw', brand='bosch', price='105.76', user_id='512589760', user_session='80f6f04e-5ae7-4592-847d-67f4f2e4bc74')


In [11]:

# Count the number of interactions (rows of any event type) per user and product.
interaction_counts = (
    data
    .groupby(['user_id', 'product_id'])
    .count()
    # Give the generic "count" column a descriptive name.
    .withColumnRenamed("count", "interaction_count")
)

In [10]:

# Count the number of purchases per user and product.
# Keep only the rows whose event_type is 'purchase'.
is_purchase = data['event_type'] == 'purchase'
purchase_data = data.filter(is_purchase)

# One row per (user_id, product_id) pair; the tally is stored as purchase_count.
purchase_counts = (
    purchase_data
    .groupby('user_id', 'product_id')
    .count()
    .withColumnRenamed("count", "purchase_count")
)


In [14]:
# Merge the interaction and purchase counts.
# A left join keeps every (user, product) pair that has interactions; pairs
# without a matching purchase row get a null purchase_count.
join_keys = ['user_id', 'product_id']
combined_data = interaction_counts.join(purchase_counts, on=join_keys, how='left')



In [16]:
# Replace missing (null) purchase counts with 0: no purchase row means zero purchases.
combined_data = combined_data.na.fill(0, subset=['purchase_count'])



In [21]:
# Compute the correlation.
from pyspark.sql.functions import corr

# Aggregate expression: correlation between interaction_count and
# purchase_count, exposed under the column name "correlation".
correlation_expr = corr("interaction_count", "purchase_count").alias("correlation")

# Selecting a lone aggregate yields a one-row DataFrame; nothing is executed
# until the result is collected.
correlation_result = combined_data.select(correlation_expr)

In [22]:
# Run the aggregation and bring its rows back to the driver.
correlation_result_collected = correlation_result.collect()

# The aggregate produces a single row; read the coefficient from it.
correlation_row = correlation_result_collected[0]
correlation_value = correlation_row.correlation

# Report the coefficient.
print(f"Korrelation zwischen Interaktionsanzahl und Kaufanzahl: {correlation_value}")

Korrelation zwischen Interaktionsanzahl und Kaufanzahl: 0.44961662830493343
