In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `erisk` package) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
from erisk.utils import get_spark
from pyspark.ml import PipelineModel
from pyspark.ml.functions import array_to_vector
from pyspark.sql import functions as F

# GCS locations of the TF-IDF feature parquet and the saved pipeline model.
dataset_path = "gs://dsgt-clef-erisk-2024/task1/parquet/combined_tfidf"
model_path = "gs://dsgt-clef-erisk-2024/task1/processed/baseline_nb_tfidf/model"

spark = get_spark()
# `tfidf` is read from parquet as an array column; array_to_vector turns it
# into an ML vector column (see the printed schema below).
df = spark.read.parquet(dataset_path).withColumn("tfidf", array_to_vector("tfidf"))
model = PipelineModel.load(model_path)

# Print the loaded schema, then leave the pipeline object as the cell's
# displayed value.
df.printSchema()
model

root
 |-- DOCNO: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashingtf: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- tfidf: vector (nullable = true)



PipelineModel_3b6d55b3d5dc

In [7]:
# Apply the fitted pipeline to the dataset. The transform appends a
# `tfidf_scaled` column plus `<target>_raw`, `<target>_probability` and
# `<target>_prediction` columns for each target (see the schema output).
predictions = model.transform(df)
predictions.printSchema()
# Show a single record, one field per line, with long values cut at 80 chars.
predictions.show(n=1, vertical=True, truncate=80)

root
 |-- DOCNO: string (nullable = true)
 |-- TEXT: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- dataset: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hashingtf: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- tfidf_scaled: vector (nullable = true)
 |-- target_1_raw: vector (nullable = true)
 |-- target_1_probability: vector (nullable = true)
 |-- target_1_prediction: double (nullable = false)
 |-- target_10_raw: vector (nullable = true)
 |-- target_10_probability: vector (nullable = true)
 |-- target_10_prediction: double (nullable = false)
 |-- target_11_raw: vector (nullable = true)
 |-- target_11_probability: vector (nullable = true)
 |-- target_11_prediction: double (nullable = false)
 |-- target_12_raw: vector (nullable = true)
 |-- target_12_probability: vector (nullable = true)
 |-- target_12_prediction: double (nullabl

24/03/24 23:21:12 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 276:>                                                        (0 + 1) / 1]

-RECORD 0-------------------------------------------------------------------------------------------------
 DOCNO                 | s_0_0_0                                                                          
 TEXT                  | 1.ye katiliyorum                                                                 
 filename              | s_0.trec                                                                         
 dataset               | train                                                                            
 words                 | [1.ye, katiliyorum]                                                              
 hashingtf             | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0... 
 tfidf                 | [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.16... 
 tfidf_scaled          | (256,[18,181],[0.25,0.009259259259259259])                                       
 target_1_raw          | [-1.35656427

                                                                                

In [13]:
from pyspark.sql import functions as F, Window
from pyspark.ml.functions import vector_to_array
from functools import reduce


def score_predictions(df, primary_key="DOCNO", k=1000):
    """Rank documents per target by probability score and keep the top ``k``.

    Every column of ``df`` whose name contains ``"_probability"`` is treated as
    a probability vector for one target. Element ``[1]`` of that vector is used
    as the document's score (presumably the positive-class probability of a
    binary classifier -- confirm against the model). Column names are expected
    to look like ``target_<n>_probability``: the second ``_``-separated token
    is parsed with ``int`` and emitted as ``symptom_number``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame holding ``primary_key`` and one or more ``*_probability`` vector
        columns.
    primary_key : str
        Name of the document identifier column. It is carried into the output
        and also used as a tie-breaker when scores are equal.
    k : int
        Maximum number of rows kept per target (rows with ``rank <= k``).

    Returns
    -------
    pyspark.sql.DataFrame
        Union over all targets with columns ``symptom_number``, ``Qo``,
        ``<primary_key>``, ``score`` and ``rank``. Note the column is named
        ``Qo`` (letter o) while its constant value is the string ``"Q0"``; the
        name is kept as-is so existing consumers of this schema keep working.

    Raises
    ------
    ValueError
        If ``df`` has no column containing ``"_probability"``.

    Notes
    -----
    The narrow score frame is cached and never unpersisted here, because the
    returned frame is lazy and still reads from it.
    """
    target_probs = [c for c in df.columns if "_probability" in c]
    if not target_probs:
        # Fail early with a clear message instead of the opaque
        # "reduce() of empty sequence" TypeError further down.
        raise ValueError(
            f"no '_probability' columns found in DataFrame; got columns: {list(df.columns)}"
        )

    # Keep only the key and one scalar score per target, and cache that narrow
    # frame: it is scanned once per target below.
    scores = df.select(
        primary_key,
        *[vector_to_array(c)[1].alias(c) for c in target_probs],
    ).cache()

    # Loop-invariant window spec. Ordering by the primary key after the score
    # makes row_number() deterministic when several documents share a score;
    # without it the rank (and the top-k cut) among ties is arbitrary.
    ranking = Window.partitionBy("symptom_number").orderBy(
        F.col("score").desc(), F.col(primary_key).asc()
    )

    top_docs = []
    for c in target_probs:
        symptom_number = int(c.split("_")[1])
        ordered = (
            scores.select(
                F.lit(symptom_number).alias("symptom_number"),
                F.lit("Q0").alias("Qo"),
                primary_key,
                F.col(c).alias("score"),
            )
            .withColumn("rank", F.row_number().over(ranking))
            .where(F.col("rank") <= k)
        )
        top_docs.append(ordered)

    # All per-target frames share the same column order, so a positional
    # union is sufficient.
    return reduce(lambda a, b: a.union(b), top_docs)


# Smoke-test the scoring on a small slice of the training split.
# The sample is seeded so that re-running the cell scores the same subset
# (given the same input files/partitioning); an unseeded sample yields a
# different table on every run.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42

train_sample = predictions.where(F.col("dataset") == "train").sample(
    fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED
)
scored = score_predictions(train_sample, k=10)
scored.printSchema()
scored.show()

root
 |-- symptom_number: integer (nullable = false)
 |-- Qo: string (nullable = false)
 |-- DOCNO: string (nullable = true)
 |-- score: double (nullable = true)
 |-- rank: integer (nullable = false)



                                                                                

+--------------+---+------------+--------------------+----+
|symptom_number| Qo|       DOCNO|               score|rank|
+--------------+---+------------+--------------------+----+
|             1| Q0|s_2742_853_1|   0.668868563480664|   1|
|             1| Q0|s_2742_607_1|  0.6023149557527278|   2|
|             1| Q0|s_2742_416_1| 0.34437959581525857|   3|
|             1| Q0|s_1122_880_1| 0.33972811282571225|   4|
|             1| Q0|s_2889_637_2|  0.1882093569129803|   5|
|             1| Q0|s_1122_316_1| 0.13894283229597693|   6|
|             1| Q0|s_1122_624_1| 0.09751266134483094|   7|
|             1| Q0|s_1122_325_1| 0.08145508119471051|   8|
|             1| Q0|s_2742_931_2| 0.05674612408963607|   9|
|             1| Q0|s_2742_931_1|0.052201574830414356|  10|
|            10| Q0|s_2742_607_1| 0.44152607244699116|   1|
|            10| Q0| s_1122_23_1|  0.3255970745790591|   2|
|            10| Q0|s_2742_853_1| 0.24475960791711163|   3|
|            10| Q0|s_1122_880_1|  0.191