# Load data

In [1]:
packages = [
    f'org.apache.hadoop:hadoop-aws:3.3.1',
    'com.google.guava:guava:30.1.1-jre',
    'org.apache.httpcomponents:httpcore:4.4.14', 
    'com.google.inject:guice:4.2.2', 
    'com.google.inject.extensions:guice-servlet:4.2.2'
]

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

conf = SparkConf()\
    .setMaster("local")\
    .setAppName("pyspark-unittests")\
    .set("spark.sql.parquet.compression.codec", "snappy")\
    .set('spark.jars.packages', ','.join(packages))

sc = SparkContext.getOrCreate(conf)

23/09/13 11:00:24 WARN Utils: Your hostname, DESKTOP-HT1RH4E resolves to a loopback address: 127.0.1.1; using 172.26.69.25 instead (on interface eth0)
23/09/13 11:00:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/greg/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/greg/.ivy2/cache
The jars for the packages stored in: /home/greg/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.google.guava#guava added as a dependency
org.apache.httpcomponents#httpcore added as a dependency
com.google.inject#guice added as a dependency
com.google.inject.extensions#guice-servlet added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-896e62c7-8136-45d6-998b-38d1e44ba5b3;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.google.guava#guava;30.1.1-jre in central
	found com.google.guava#failureaccess;1.0.1 in central
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found org.checkerframework#checker-qual;3.8.0 in central
	found c

In [3]:
spark = SparkSession(sc)

In [4]:
import pandas as pd

In [5]:
pandas_df = pd.read_json("s3a://udacity-dsnd/sparkify/mini_sparkify_event_data.json", lines=True)

In [6]:
df = spark.createDataFrame(pandas_df)

# Process data for prediction

Filter out all not logged-in users

In [7]:
df = df.filter(df["userId"] != "")

In [8]:
df.createOrReplaceTempView("events")

In [9]:
df.printSchema()

root
 |-- ts: long (nullable = true)
 |-- userId: string (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- page: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- method: string (nullable = true)
 |-- status: long (nullable = true)
 |-- level: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- location: string (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- song: string (nullable = true)
 |-- length: double (nullable = true)



## Get average session length and average number of items per session

In [10]:
avg_session_len_per_user = spark.sql('''
    SELECT userId, 
        AVG(sessionLength) AS avg_session_length, 
        AVG(items) AS avg_items_per_session 
    FROM (
        SELECT userId,
            MAX(ts) - MIN(ts) AS sessionLength, 
            MAX(itemInSession) AS items FROM events
        GROUP BY userId, sessionId
    )
    GROUP BY userId
''')

## Get average time between sessions

In [11]:
import pandas as pd
from pyspark.sql.functions import last, udf, pandas_udf, PandasUDFType
from pyspark.sql.types import FloatType

In [12]:
session_ends = df.sort("ts").groupby(df.userId, df.sessionId).agg(last(df.ts).alias("end"))
session_ends.show(1)

23/09/13 11:01:31 WARN TaskSetManager: Stage 0 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:01:34 WARN TaskSetManager: Stage 1 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
[Stage 1:>                                                          (0 + 1) / 1]

+------+---------+-------------+
|userId|sessionId|          end|
+------+---------+-------------+
|300011|       60|1538587993000|
+------+---------+-------------+
only showing top 1 row



                                                                                

In [13]:
@pandas_udf(FloatType())
def time_bw_sessions(s: pd.Series) -> float:
    return s.diff().mean()

In [14]:
avg_time_bw_sessions = session_ends.sort("sessionId") \
    .groupby("userId").agg(time_bw_sessions("end").alias("avg_time_bw_sessions"))

## How much time per day is user spending on the platform

In [15]:
time_per_day = spark.sql('''
    SELECT events.userId, first(a.days), ROUND(COUNT(*) / first(a.days), 2) AS pages_per_day
    FROM events
    JOIN (
        SELECT userId, GREATEST(1, ROUND((MAX(ts) - MIN(ts)) / 3600 / 24 / 1000, 2)) AS days FROM events
        GROUP BY userId
    ) AS a ON events.userId = a.userId
    GROUP BY events.userId
''')

## Find how many times a user visits each page

In [16]:
pages_per_user_by_type = spark.sql('''
    SELECT userId, page, COUNT(*) AS page_visits
    FROM events
    GROUP BY userId, page
''')

In [17]:
pages_per_user_by_type = pages_per_user_by_type.groupby("userId").pivot("page").sum("page_visits").na.fill(0)

23/09/13 11:01:37 WARN TaskSetManager: Stage 7 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

## Get gender, level and is_mobile as 0/1. Get total time spent listening to music

In [18]:
user_data = spark.sql('''
    SELECT userId, 
        CASE WHEN first(gender) = 'F' THEN 1 ELSE 0 END AS gender,
        CASE WHEN last(level) = 'paid' THEN 1 ELSE 0 END AS paid,
        CASE WHEN first(userAgent) LIKE '%Mobi%' THEN 1 ELSE 0 END AS is_mobile,
        SUM(nanvl(length, 0)) AS listening_time
    FROM (SELECT * FROM events ORDER BY ts ASC)
    GROUP BY userId
''')

## Prepare final dataframe

In [19]:
final = pages_per_user_by_type.join(time_per_day, on="userId") \
    .join(avg_session_len_per_user, on="userId") \
    .join(user_data, on="userId") \
    .join(avg_time_bw_sessions, on="userId")

In [48]:
from pyspark.sql.functions import isnan, when, count, col

final.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in final.columns]).show()

23/09/11 16:28:43 WARN TaskSetManager: Stage 156 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:28:44 WARN TaskSetManager: Stage 157 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:28:45 WARN TaskSetManager: Stage 158 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:28:46 WARN TaskSetManager: Stage 159 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:28:47 WARN TaskSetManager: Stage 160 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:28:49 WARN TaskSetManager: Stage 161 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:28:50 WARN TaskSetManager: Stage 162 contains a task of very large size (74761 KiB). The maximum recommended task size is 10

+------+-----+----------+---------------+------+-------------------------+---------+-----+----+----+------+--------+-----------+-------------+--------+----------------+--------------+-----------+---------+-------+-----------+-------------+------------------+---------------------+------+----+---------+--------------+--------------------+
|userId|About|Add Friend|Add to Playlist|Cancel|Cancellation Confirmation|Downgrade|Error|Help|Home|Logout|NextSong|Roll Advert|Save Settings|Settings|Submit Downgrade|Submit Upgrade|Thumbs Down|Thumbs Up|Upgrade|first(days)|pages_per_day|avg_session_length|avg_items_per_session|gender|paid|is_mobile|listening_time|avg_time_bw_sessions|
+------+-----+----------+---------------+------+-------------------------+---------+-----+----+----+------+--------+-----------+-------------+--------+----------------+--------------+-----------+---------+-------+-----------+-------------+------------------+---------------------+------+----+---------+--------------+-----

We will handle missing values after the train/test split

# Modeling

Split into train and test

In [20]:
from pyspark.ml.feature import Imputer
# Split the data into training and test sets (30% held out for testing)
(train_df, test_df) = final.randomSplit([0.7, 0.3])

# impute missing values in avg_time_bw_sessions
imputer = Imputer(inputCol="avg_time_bw_sessions", outputCol="avg_time_bw_sessions")
model = imputer.fit(train_df)
train_df = model.transform(train_df)
test_df = model.transform(test_df)

23/09/13 11:01:40 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/09/13 11:01:40 WARN TaskSetManager: Stage 15 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:01:42 WARN TaskSetManager: Stage 16 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:01:43 WARN TaskSetManager: Stage 17 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:01:45 WARN TaskSetManager: Stage 18 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:01:47 WARN TaskSetManager: Stage 19 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:01:48 WARN TaskSetManager: Stage 20 contains a task of very large size (74761 KiB). The maxim

In [21]:
from pyspark.ml.feature import VectorAssembler

Including visits to the "Cancel" page would likely introduce data leakage. And we want to predict churn before the user cancels so let's drop those columns

In [22]:
feature_cols = list(set(final.columns) - set(["userId", "Cancel", "Cancellation Confirmation", "listening_time"]))

In [23]:
vec_assembler = VectorAssembler(outputCol="features")
vec_assembler.setInputCols(feature_cols)

VectorAssembler_98cedac9072e

In [24]:
trainingData = vec_assembler.transform(train_df).select("features", final["Cancellation Confirmation"].alias("label"))
testData = vec_assembler.transform(test_df).select("features", final["Cancellation Confirmation"].alias("label"))

## Gradient boosted trees

In [56]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxIter=5, maxDepth=5, labelCol="label", seed=42,
    leafCol="leafId")

model = gbt.fit(trainingData)

23/09/11 16:56:33 WARN TaskSetManager: Stage 343 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:56:34 WARN TaskSetManager: Stage 344 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:56:35 WARN TaskSetManager: Stage 345 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:56:37 WARN TaskSetManager: Stage 346 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:56:38 WARN TaskSetManager: Stage 347 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:56:39 WARN TaskSetManager: Stage 348 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:56:41 WARN TaskSetManager: Stage 349 contains a task of very large size (74761 KiB). The maximum recommended task size is 10

In [59]:
# Evaluate model on test instances and compute test error
predictions = model.transform(testData)
predictions_df = predictions.toPandas()

23/09/11 16:57:52 WARN TaskSetManager: Stage 509 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:57:54 WARN TaskSetManager: Stage 510 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:57:55 WARN TaskSetManager: Stage 511 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:57:56 WARN TaskSetManager: Stage 512 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:57:57 WARN TaskSetManager: Stage 513 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:57:59 WARN TaskSetManager: Stage 514 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 16:58:00 WARN TaskSetManager: Stage 515 contains a task of very large size (74761 KiB). The maximum recommended task size is 10

In [62]:
tn = (predictions_df["prediction"] + predictions_df["label"] == 0).sum()
tp = (predictions_df["prediction"] + predictions_df["label"] == 2).sum()
fn = (predictions_df["prediction"] < predictions_df["label"]).sum()
fp = (predictions_df["prediction"] > predictions_df["label"]).sum()
precision = tp / (tp+fp)
recall = tp / (tp+fn)
precision, recall

(0.4, 0.6)

Not the best results, let's try some cross validation and grid-searching.

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
import tempfile

In [65]:
gbt = GBTClassifier(maxIter=5, maxDepth=5, labelCol="label", seed=42,
    leafCol="leafId")
grid = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [5, 10, 50]) \
    .addGrid(gbt.maxDepth, [2, 5, 10]) \
    .build()
evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")
cv = CrossValidator(estimator=gbt, estimatorParamMaps=grid, evaluator=evaluator)

cvModel = cv.fit(trainingData)

23/09/11 17:01:22 WARN CacheManager: Asked to cache already cached data.) / 200]
23/09/11 17:01:22 WARN CacheManager: Asked to cache already cached data.
23/09/11 17:01:25 WARN TaskSetManager: Stage 785 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 17:01:26 WARN TaskSetManager: Stage 786 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 17:01:28 WARN TaskSetManager: Stage 787 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 17:01:29 WARN TaskSetManager: Stage 789 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 17:01:36 WARN TaskSetManager: Stage 791 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 18:06:11 WARN TaskSetManager: Stage 6334 contains a task of very large size (74761 KiB). The maximum recommended task

In [66]:
cvModel.bestModel.getMaxDepth()

2

In [67]:
cvModel.bestModel.getMaxIter()

50

In [69]:
# Evaluate model on test instances and compute test error
predictions = cvModel.bestModel.transform(testData)
predictions_df = predictions.toPandas()
tn = (predictions_df["prediction"] + predictions_df["label"] == 0).sum()
tp = (predictions_df["prediction"] + predictions_df["label"] == 2).sum()
fn = (predictions_df["prediction"] < predictions_df["label"]).sum()
fp = (predictions_df["prediction"] > predictions_df["label"]).sum()
precision = tp / (tp+fp)
recall = tp / (tp+fn)
precision, recall

23/09/11 19:14:43 WARN TaskSetManager: Stage 32694 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 19:14:45 WARN TaskSetManager: Stage 32695 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 19:14:46 WARN TaskSetManager: Stage 32696 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 19:14:48 WARN TaskSetManager: Stage 32697 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 19:14:49 WARN TaskSetManager: Stage 32698 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 19:14:50 WARN TaskSetManager: Stage 32699 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/11 19:14:52 WARN TaskSetManager: Stage 32700 contains a task of very large size (74761 KiB). The maximum recommended t

(0.8, 0.8)

Much better score when using a shallower tree. There will be a lot of variance on these models given the smaller dataset so we'll have to conduct a grid search on the full dataset as well.

## LinearSVM

In [26]:
from pyspark.ml.classification import LinearSVC

In [27]:
from pyspark.ml.feature import RobustScaler

In [28]:
r_scaler = RobustScaler(inputCol='features', outputCol="features_scaled")
robust_model = r_scaler.fit(trainingData)
trainScaled = robust_model.transform(trainingData)

23/09/13 11:01:58 WARN TaskSetManager: Stage 63 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:01:59 WARN TaskSetManager: Stage 64 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:01 WARN TaskSetManager: Stage 65 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:02 WARN TaskSetManager: Stage 66 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:03 WARN TaskSetManager: Stage 67 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:05 WARN TaskSetManager: Stage 68 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:06 WARN TaskSetManager: Stage 69 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.

In [29]:
trainScaled.schema

StructType([StructField('features', VectorUDT(), True), StructField('label', LongType(), True), StructField('features_scaled', VectorUDT(), True)])

In [30]:
svm = LinearSVC(regParam=0.1, featuresCol="features_scaled")
model = svm.fit(trainScaled)

23/09/13 11:02:11 WARN TaskSetManager: Stage 108 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:13 WARN TaskSetManager: Stage 109 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:14 WARN TaskSetManager: Stage 110 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:15 WARN TaskSetManager: Stage 111 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:17 WARN TaskSetManager: Stage 112 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:18 WARN TaskSetManager: Stage 113 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/13 11:02:20 WARN TaskSetManager: Stage 114 contains a task of very large size (74761 KiB). The maximum recommended task size is 10

In [57]:
testScaled = robust_model.transform(testData)

In [65]:
# Evaluate model on test instances and compute test error
predictions = model.transform(testScaled)
predictions_df = predictions.toPandas()
tn = (predictions_df["prediction"] + predictions_df["label"] == 0).sum()
tp = (predictions_df["prediction"] + predictions_df["label"] == 2).sum()
fn = (predictions_df["prediction"] < predictions_df["label"]).sum()
fp = (predictions_df["prediction"] > predictions_df["label"]).sum()
precision = tp / (tp+fp)
recall = tp / (tp+fn)
precision, recall

23/09/12 15:56:28 WARN TaskSetManager: Stage 2896 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/12 15:56:31 WARN TaskSetManager: Stage 2897 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/12 15:56:32 WARN TaskSetManager: Stage 2898 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/12 15:56:34 WARN TaskSetManager: Stage 2899 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/12 15:56:36 WARN TaskSetManager: Stage 2900 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/12 15:56:37 WARN TaskSetManager: Stage 2901 contains a task of very large size (74761 KiB). The maximum recommended task size is 1000 KiB.
23/09/12 15:56:39 WARN TaskSetManager: Stage 2902 contains a task of very large size (74761 KiB). The maximum recommended task siz

(0.6666666666666666, 0.46153846153846156)

In [32]:
svm = LinearSVC(regParam=0.1, featuresCol="features_scaled")
grid = ParamGridBuilder() \
    .addGrid(svm.regParam, [0.001, 0.01, 0.1]) \
    .build()
evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")
cv = CrossValidator(estimator=svm, estimatorParamMaps=grid, evaluator=evaluator, )

cvModel = cv.fit(trainScaled)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/greg/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/greg/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/greg/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


Py4JError: org does not exist in the JVM

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/greg/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/greg/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/greg/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


In [None]:
# Evaluate model on test instances and compute test error
predictions = cvModel.bestModel.transform(testScaled)
predictions_df = predictions.toPandas()
tn = (predictions_df["prediction"] + predictions_df["label"] == 0).sum()
tp = (predictions_df["prediction"] + predictions_df["label"] == 2).sum()
fn = (predictions_df["prediction"] < predictions_df["label"]).sum()
fp = (predictions_df["prediction"] > predictions_df["label"]).sum()
precision = tp / (tp+fp)
recall = tp / (tp+fn)
precision, recall