# Interaction information

## Data preparation

In [None]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as sf

# Let pandas display up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

## Watched interactions
Collect all watch events for series (train set + test set).

In [None]:
# One parquet partition per day: dayofmonth=1 up to and including 22 (March 2019).
paths = [
    'gs://mit-processed-events-prod.npo-data.nl/all/year=2019/month=3/dayofmonth={}'.format(day)
    for day in range(1, 23)
]
events = spark.read.parquet(*paths)

# Row count of the raw events that were loaded.
print(events.count())

In [None]:
# Aggregate streamWaypoint events per (month, day, session, stream, profile)
# and convert the number of waypoints into an estimated watch duration.

# Row-level predicates; a row is kept only when all of them hold.
is_clean_waypoint = (
    (sf.col('eventtype') == 'streamWaypoint')
    & ~sf.col('detectedduplicate')
    & ~sf.col('detectedcorruption')
    & (sf.col('topspinBrand') == 'npoportal')  # only npo-start
)
# have an account: profile id is present and not the empty string
has_profile = sf.col('npoprofileid').isNotNull() & (sf.col('npoprofileid') != '')

event_time = sf.col('timestamp')
waypoint_flag = sf.when(sf.col('eventtype') == 'streamWaypoint', 1).otherwise(0)

events_waypoint = (
    events
    .where(is_clean_waypoint & has_profile)
    # 'year' is not one of the grouping keys below, so it does not survive
    # the aggregation; only 'month' and 'day' do.
    .withColumn('year', sf.year(event_time))
    .withColumn('month', sf.month(event_time))
    .withColumn('day', sf.dayofmonth(event_time))
    .groupby('month', 'day', 'sessionId', 'streamid', 'npoprofileid')
    .agg(sf.sum(waypoint_flag).alias('waypoint_count'))
    # each waypoint is counted as 30 s, i.e. 30,000 ms
    .withColumn('waypoint_duration', sf.col('waypoint_count') * 30000)
)
print(events_waypoint.count())

In [None]:
# POMS metadata restricted to broadcasts that belong to a series; cached
# so later joins against it do not re-read the parquet files.
poms = (
    spark.read.parquet("gs://mit-processed-events-prod.npo-data.nl/poms-enriched/")
    .select('mid', 'type', 'duration', 'seriesRef')
    .where(sf.col("type") == "BROADCAST")
    .where(sf.col("seriesRef").isNotNull())
).cache()

# Attach seriesRef/duration to every watched stream and keep only the streams
# of which more than half was watched.
# NOTE(review): waypoint_duration is expressed in milliseconds; this assumes
# poms 'duration' uses the same unit — TODO confirm.
watched_fraction = sf.round(sf.col('waypoint_duration') / sf.col('duration'), 2)
events_join = (
    events_waypoint
    .join(poms, on=events_waypoint['streamid'] == poms['mid'], how='left')
    .drop('streamid')
    .withColumn('watch_proportion', watched_fraction)
    # Streams without a POMS match have a null duration, hence a null
    # proportion, and are removed by this comparison as well.
    .where(sf.col('watch_proportion') > 0.5)
)
print(events_join.count())

In [None]:
# The series presented in 'Aanbevolen voor jou' that were watched by users,
# one row per (month, day, profile, series, offer).
# NOTE(review): `watched_events_rank` is only assigned in a later cell of this
# notebook, so this cell raises a NameError when the notebook is executed top
# to bottom. The output folder is also named "february" while the cells above
# read month=3 — confirm whether this cell is a leftover and what it should
# select from.
final_events = (
    watched_events_rank
    .select('month', 'day', 'npoprofileid', 'seriesRef', 'recofferid')
    .distinct()
)
final_events.write.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/events/february/")

## Recommended and watch interactions
Collect all watch events for series that were recommended (recommended test set).

In [None]:
# Only the partition for 22 March 2019 (range(22, 23) yields the single day 22).
paths = [
    'gs://mit-processed-events-prod.npo-data.nl/all/year=2019/month=3/dayofmonth={}'.format(day)
    for day in range(22, 23)
]
events = spark.read.parquet(*paths)

# Row count of the raw events that were loaded.
print(events.count())

In [None]:
# All recommender events (offers and choices) of logged-in npo-start users,
# flattened to one row per recommended destination, restricted to the
# 'Aanbevolen voor jou' recommender and to the first five positions.
events_recommender = (
    events
    .filter(~sf.col('detectedduplicate'))
    .filter(~sf.col('detectedcorruption'))
    .filter(sf.col('topspinBrand') == 'npoportal') # only npo-start
    .filter(~sf.col('npoprofileid').isNull()) # have an account
    .filter(~(sf.col('npoprofileid') == ''))
    .filter(sf.col('rectype') == 'algorithm') # recommended by algorithm
    .select('partyid', 'sessionid', 'eventid', 'eventtype', 'streamid',
            'rectype', 'recpanel', 'recofferid', 'recdestinations',
            'npouserid', 'npoprofileid', 'nposubscription', 'environment')
    # Flatten the destinations exactly ONCE. Exploding contentId, index,
    # numberDisplayed and recommender one after another multiplies the rows
    # (n * n * n * n per event) and pairs every content id with every index
    # and recommender, so the rank derived below would be meaningless.
    # posexplode yields the position of each content id; the sibling arrays
    # are then read at that same position so the fields stay aligned.
    .select('*', sf.posexplode('recdestinations.contentId').alias('pos', 'contentid'))
    .withColumn('index', sf.col('recdestinations.index')[sf.col('pos')])
    .withColumn('numberdisplayed', sf.col('recdestinations.numberDisplayed')[sf.col('pos')])
    .withColumn('recommender', sf.col('recdestinations.recommender')[sf.col('pos')])
    .drop('pos')
    .filter(sf.col('recommender').startswith('ps-')) # aanbevolen voor jou
    .filter(sf.col('index') < 5 ) #k=5
    .withColumn('chosen', sf.col('index') + 1) # 1-based rank of the destination
)
print(events_recommender.count())

In [None]:
# Keep only the rows whose eventtype is 'offer' and persist them.
offer_events = events_recommender.where(sf.col('eventtype') == 'offer')
(
    offer_events
    .write
    .parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/events/rec_offered")
)

In [None]:
# Match every offered destination with a watched stream from the same session
# to obtain the series reference of what was offered and watched.
watched_streams = events_join.select('month', 'day', 'sessionId', 'mid', 'seriesRef')
same_session = offer_events.sessionid == events_join.sessionId
same_content = offer_events.contentid == events_join.mid
offer_events_join = offer_events.join(watched_streams, same_session & same_content)

# Watched series together with the rank ('chosen') at which they were offered;
# incomplete rows and exact duplicates are removed.
watched_events_rank = (
    offer_events_join
    .select('month', 'day', 'npoprofileid', 'seriesRef', 'chosen', 'recofferid')
    .dropna()
    .distinct()
)
watched_events_rank.write.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/events/recwatched_rank")

In [None]:
# The series presented in 'Aanbevolen voor jou' that were watched by users:
# same rows as watched_events_rank without the rank column, de-duplicated.
final_events = (
    watched_events_rank
    .select('month', 'day', 'npoprofileid', 'seriesRef', 'recofferid')
    .distinct()
)
final_events.write.parquet("gs://dataproc-jupyter-eileen.npo-data.nl/data/events/recwatched")