In [1]:
!ls
!pwd

00-erisk25task1EDA.ipynb     04-eRiskAnalysisSelfReferential.ipynb
01-erisktokenestimate.ipynb  05-eRiskPySparkSelfRefFiltering.ipynb
03-pyterrier-test2.ipynb
/storage/home/hcoda1/6/dahumada3/clef/erisk-2025/user/dahumada3/notebooks


In [2]:
import sys

sys.path.append("/storage/home/hcoda1/6/dahumada3/clef/erisk-2025")

from erisk.spark import get_spark

spark = get_spark()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/26 23:24:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/26 23:24:32 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [3]:
# Set paths
LABELS_MAJ_PATH = "/storage/home/hcoda1/6/dahumada3/erisk_shared/raw/training_data/2023/g_qrels_majority_2.csv"
LABELS_CONS_PATH = "/storage/home/hcoda1/6/dahumada3/erisk_shared/raw/training_data/2023/g_rels_consenso.csv"
PARQUET_DIR = "/storage/home/hcoda1/6/dahumada3/erisk_shared/parquet/training_data/2023/partitions"

In [4]:
df = spark.read.parquet(PARQUET_DIR)
df.printSchema()
df.show(5, truncate=False)


[Stage 0:>                                                          (0 + 1) / 1]

                                                                                

root
 |-- DOCNO: string (nullable = true)
 |-- TEXT: string (nullable = true)



                                                                                

+------------+-------------------------------------------------------------------------------------+
|DOCNO       |TEXT                                                                                 |
+------------+-------------------------------------------------------------------------------------+
|s_2457_109_0|LADWP, Inyo agree to test run on well 395                                            |
|s_2457_110_0|State representatives recognize NIHDs District of Year designation                   |
|s_2457_111_0|Small plane crashes en route from Bishop to Nanaimo, BC (Canada)                     |
|s_2457_112_0|Audio and video production professionals, as an example: https://youtu.be/jv5HIrOrn2o|
|s_2457_113_0|Sure, its a professional powerhouse for audio and video production professionals.    |
+------------+-------------------------------------------------------------------------------------+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import length, avg, min, max

row_count = df.count()
col_count = len(df.columns)

print(f"Shape: ({row_count:,} rows, {col_count} columns)")

df_stats = df.withColumn("text_length", length("TEXT"))

summary = df_stats.agg(
    avg("text_length").alias("avg_length"),
    min("text_length").alias("min_length"),
    max("text_length").alias("max_length"),
)

summary.show()

Shape: (4,264,693 rows, 2 columns)




+-----------------+----------+----------+
|       avg_length|min_length|max_length|
+-----------------+----------+----------+
|86.27327078408692|         1|     39999|
+-----------------+----------+----------+




                                                                                

In [6]:
# Dataframe cleanup
from pyspark.sql.functions import col, trim
from pyspark.sql.functions import regexp_replace

df_clean = df.filter(trim(col("TEXT")) != "")
df_clean = df_clean.withColumn("TEXT", regexp_replace("TEXT", r"http\S+|www\S+", ""))
df_clean.select("TEXT").show(5, truncate=False)

print("Remaining rows:", df_clean.count())

+---------------------------------------------------------------------------------+
|TEXT                                                                             |
+---------------------------------------------------------------------------------+
|LADWP, Inyo agree to test run on well 395                                        |
|State representatives recognize NIHDs District of Year designation               |
|Small plane crashes en route from Bishop to Nanaimo, BC (Canada)                 |
|Audio and video production professionals, as an example:                         |
|Sure, its a professional powerhouse for audio and video production professionals.|
+---------------------------------------------------------------------------------+
only showing top 5 rows





Remaining rows: 4264560




                                                                                

In [7]:
SELF_REF_PATTERN = r"\b(i|me|my|mine|myself|i'm|i’ve|i'd|i’ll|i am|i was)\b"
df_with_flag = df_clean.withColumn("is_self_ref", col("TEXT").rlike(SELF_REF_PATTERN))

In [9]:
self_ref_df = df_with_flag.filter(col("is_self_ref"))

total_count = df_with_flag.count()
self_ref_count = self_ref_df.count()

print(f"Self-referential posts: {self_ref_count:,} out of {total_count:,}")



Self-referential posts: 434,789 out of 4,264,560




                                                                                

PySpark filters more sentences out than pandas, since PySpark doesn't support re.IGNORECASE directly, but you can lowercase the text before applying the regex

In [10]:
from pyspark.sql.functions import lower

SELF_REF_PATTERN = r"\b(i|me|my|mine|myself|i'm|i’ve|i'd|i’ll|i am|i was)\b"

df_with_flag = df_clean.withColumn(
    "is_self_ref", lower(col("TEXT")).rlike(SELF_REF_PATTERN)
)

In [11]:
self_ref_df = df_with_flag.filter(col("is_self_ref"))

total_count = df_with_flag.count()
self_ref_count = self_ref_df.count()

print(f"Self-referential posts: {self_ref_count:,} out of {total_count:,}")



Self-referential posts: 1,191,200 out of 4,264,560



                                                                                

In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

SELF_REF_WORDS = set(
    ["i", "me", "my", "mine", "myself", "i'm", "i’ve", "i'd", "i’ll", "i am", "i was"]
)


def fast_self_ref_ratio(text):
    if not text:
        return 0.0
    words = text.lower().split()
    if not words:
        return 0.0
    count = sum(1 for word in words if word in SELF_REF_WORDS)
    return count / len(words)


self_ref_ratio_udf = udf(fast_self_ref_ratio, DoubleType())

df_final = df_with_flag.withColumn("self_ref_ratio", self_ref_ratio_udf(col("TEXT")))

In [15]:
df_final.show(5)

+------------+--------------------+-----------+--------------+
|       DOCNO|                TEXT|is_self_ref|self_ref_ratio|
+------------+--------------------+-----------+--------------+
|s_2457_109_0|LADWP, Inyo agree...|      false|           0.0|
|s_2457_110_0|State representat...|      false|           0.0|
|s_2457_111_0|Small plane crash...|      false|           0.0|
|s_2457_112_0|Audio and video p...|      false|           0.0|
|s_2457_113_0|Sure, its a profe...|      false|           0.0|
+------------+--------------------+-----------+--------------+
only showing top 5 rows



In [16]:
import spacy

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # Faster

SELF_REF_WORDS = {
    "i",
    "me",
    "my",
    "mine",
    "myself",
    "i'm",
    "i’ve",
    "i'd",
    "i’ll",
    "i am",
    "i was",
}


def spacy_self_ref_ratio(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if not token.is_space]
    if not tokens:
        return 0.0
    self_ref_count = sum(1 for token in tokens if token in SELF_REF_WORDS)
    return self_ref_count / len(tokens)

ModuleNotFoundError: No module named 'spacy'