### Install & import dependencies

In [None]:
# Upgrade pip first, then install/upgrade the libraries imported by this notebook
# into the kernel's own environment (%pip targets the running kernel).
# NOTE(review): no versions are pinned, so a fresh run may pull newer releases
# than the ones this analysis was written against.
%pip install -q -U pip
%pip install -q -U pyspark
%pip install -q -U matplotlib
%pip install -q -U numpy

In [None]:
import pyspark
import numpy
from collections import Counter
from pyspark.sql.functions import col, array, array_contains, arrays_overlap, array, lit, collect_list
from pyspark import SparkContext
import matplotlib.pyplot as plt

### Initialize Spark

In [None]:
# Memory settings must be supplied *before* the driver JVM is started.
# SparkContext.setSystemProperty() itself boots the JVM (with default heap
# size) and only then sets the property, so 'spark.driver.memory' configured
# that way has no effect. Passing the values through SparkConf lets PySpark
# hand them to the launcher when the JVM is created.
spark_conf = (
    pyspark.SparkConf()
    .setAppName("AAP-15319")
    .set("spark.executor.memory", "16g")
    .set("spark.driver.memory", "16g")
)

# getOrCreate keeps this cell re-runnable: a second run reuses the active
# context instead of raising "Cannot run multiple SparkContexts at once".
# The configuration above only applies when no context exists yet, so restart
# the kernel to change memory settings.
sc = SparkContext.getOrCreate(conf=spark_conf)
spark = pyspark.sql.SparkSession(sc)

# Allow wide schemas to be rendered in full in debug/plan output.
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

### ℹ️ Debug: Print all spark config options

In [None]:
#sc._conf.getAll()

### Import telemetry dataset
`data/all.jsonl` is generated by:
```bash
aws s3 sync s3://$BUCKET/segment-logs/$PROJECTID/ data/raw 
find data/raw -type f | parallel --bar 'gzcat {} | jq -c "."' > data/all.jsonl
```

In [None]:
# Location of the concatenated, line-delimited telemetry dump.
all_jsonl_path = "./data/all.jsonl"

# Load every telemetry event into one DataFrame; the schema is inferred by Spark.
df_all = spark.read.json(all_jsonl_path)
#df_all = spark.read.json("./data/raw/**/*.gz")

### ℹ️ Debug: Print schema and distinct event types

In [None]:
#df_all.printSchema()
#df_all.select("event").distinct().collect()

### Isolate completion and inline suggestion feedback events

In [None]:
# Split the event stream by event type. The aliases let later cells refer to
# each side's columns unambiguously (e.g. "completions.properties...").
event_type = col("event")
completions = df_all.where(event_type == "completion").alias("completions")
feedback = df_all.where(event_type == "inlineSuggestionFeedback").alias("feedback")

### Find all completions that lack corresponding inline suggestion feedback

In [None]:
# Completions whose suggestionId never shows up in a feedback event.
# A left anti join keeps only the left-hand (completions) rows without a match.
same_suggestion = (
    col("completions.properties.suggestionId") == col("feedback.properties.suggestionId")
)
unmatched = completions.join(feedback, on=same_suggestion, how="leftanti")

# Drop events coming from test/internal groups.
# NOTE(review): arrays_overlap yields NULL when properties.groups is NULL, and a
# NULL predicate filters the row out, so events without any groups are excluded
# here as well (and end up counted with the test users downstream) - confirm
# this is intended.
excluded_groups = array(lit("test"), lit("lightspeed-internal"), lit("wca-users"))
in_excluded_group = arrays_overlap("properties.groups", excluded_groups)
users = unmatched.filter(~in_excluded_group)

# Find user errors
# user_errors = users.filter("completions.properties.response.status_code != 200")

# Requests that completed successfully (HTTP 200).
user_ok = users.where(col("completions.properties.response.status_code") == 200)

# Successful requests with no seat flag recorded and no "Commercial" group.
seat_flag_missing = col("completions.properties.rh_user_has_seat").isNull()
is_commercial = array_contains(user_ok["properties"]["groups"], "Commercial")
unseated_users = user_ok.where(seat_flag_missing & ~is_commercial)

In [None]:
# Each .count() is a Spark action; run them once and derive the rest arithmetically.
total = unmatched.count()
num_users = users.count()
num_ok_users = user_ok.count()
num_unseated_users = unseated_users.count()

# Every category is the difference between two successive filter stages.
num_test_users = total - num_users
num_errors = num_users - num_ok_users
num_seated_users = num_ok_users - num_unseated_users

# Whatever is not explained by the three known categories is the "mystery" slice.
tbd = total - (num_test_users + num_errors + num_seated_users)
counts = [num_test_users, num_errors, num_seated_users, tbd]

### Visualize breakdown of unmatched users

In [None]:
# Slice labels, in the same order as the entries of `counts`.
labels = (
    'Test users',
    'Errors',
    'Seated users',
    'Mystery',
)
fig, ax = plt.subplots()
pie = ax.pie(x=counts, labels=labels)

### Timestamps of the 5 most recent mystery events

In [None]:
# Newest events first; keep only the user id and the receive timestamp.
(
    unseated_users
    .select("userId", "receivedAt")
    .orderBy(col("receivedAt").desc())
    .limit(5)
    .collect()
)

### ℹ️ Debug: Write files to disk

In [None]:
# unseated_users.write.json("./unseated_users")

### 🕵️‍♀️ Search for patterns in the mystery events

In [None]:
## Of the ok user requests, which have:
# no prompt (the request carried a NULL prompt field)
no_prompt = unseated_users.filter("completions.properties.request.prompt IS NULL")

# post-processing anomalies
# postprocess = df_all.filter("event == 'postprocess'").alias("postprocess")
# postprocess_all = postprocess.join(user_ok, col("postprocess.properties.suggestionId") == col("completions.properties.suggestionId"))

# are power users
# user_hist = users.groupBy("completions.userId").count().orderBy("count", ascending=0).limit(20).collect()


In [None]:
# Pull the per-event durations and user ids to the driver as plain Python lists.
# collect_list is a global aggregate here, so each query returns exactly one row
# whose single column holds the list (NULL values are skipped by collect_list).
durations = unseated_users.agg(collect_list("properties.duration")).first()[0]
userids = unseated_users.agg(collect_list("userId")).first()[0]
#userids, bins = numpy.histogram(userids)
#dhist = plt.bar(durations, bins)


In [None]:
# Number of collected events per userId.
userhist = Counter(userids)
user_values = list(userhist.values())

# Evenly spaced positions between the smallest and largest per-user count,
# one position per user.
lowest_count = min(user_values)
highest_count = max(user_values)
userbins = numpy.linspace(lowest_count, highest_count, num=len(user_values))


In [None]:
# Largest number of collected events attributed to a single userId.
max(user_values)

In [None]:
# Bin the raw durations into numpy's default 10 equal-width bins over [0, 10];
# values outside that range are ignored by numpy.histogram.
# The counts go into their own name so the raw `durations` list is left intact:
# rebinding `durations` to the counts made this cell histogram its own output
# when it was run a second time.
duration_counts, bins = numpy.histogram(durations, range=(0.0, 10.0))

# One bar per bin, left edge aligned with the bin's lower bound.
dhist = plt.bar(bins[:-1], duration_counts, width=numpy.diff(bins), edgecolor="black", align="edge")

In [None]:
# Histogram of events-per-user: x is the number of events a user produced,
# y is how many users fall into that bin. Plotting `user_values` directly
# against evenly spaced positions drew one bar per user at an unrelated x
# location, which is not a distribution.
user_hist_counts, user_hist_edges = numpy.histogram(user_values)
uhist = plt.bar(
    user_hist_edges[:-1],
    user_hist_counts,
    width=numpy.diff(user_hist_edges),
    edgecolor="black",
    align="edge",
)
plt.xlabel("Events per user")
plt.ylabel("Number of users")