In [2]:
!conda list

Package                           Version
--------------------------------- ---------------
aiofiles                          22.1.0
aiosqlite                         0.18.0
anaconda-anon-usage               0.4.3
anaconda-client                   1.11.2
anaconda-cloud-auth               0.1.4
anaconda-navigator                2.5.0
anaconda-project                  0.11.1
anyio                             3.5.0
appdirs                           1.4.4
archspec                          0.2.3
argon2-cffi                       21.3.0
argon2-cffi-bindings              21.2.0
asttokens                         2.0.5
attrs                             22.1.0
Babel                             2.11.0
backcall                          0.2.0
backports.functools-lru-cache     1.6.4
backports.tempfile                1.0
backports.weakref                 1.0.post1
bash_kernel                       0.9.0
beautifulsoup4                    4.12.2
bleach                            4.1.0
boltons          

In [3]:
import pandas as pd
import os

# Set paths
LABELS_MAJ_PATH = "/storage/home/hcoda1/6/dahumada3/erisk_shared/raw/training_data/2023/g_qrels_majority_2.csv"
LABELS_CONS_PATH = "/storage/home/hcoda1/6/dahumada3/erisk_shared/raw/training_data/2023/g_rels_consenso.csv"
PARQUET_DIR = "/storage/home/hcoda1/6/dahumada3/erisk_shared/parquet/training_data/2023/partitions"

In [4]:
# Load CSVs
majority_labels = pd.read_csv(LABELS_MAJ_PATH)
consensus_labels = pd.read_csv(LABELS_CONS_PATH)

majority_labels.columns = ["query", "q0", "docid", "rel"]
consensus_labels.columns = ["query", "q0", "docid", "rel"]

# Preview
majority_labels.head()
# print("\n")
# consensus_labels.head()

Unnamed: 0,query,q0,docid,rel
0,1,0,s_405_1279_15,1
1,1,0,s_2519_356_0,0
2,1,0,s_2038_51_7,1
3,1,0,s_975_61_2,1
4,1,0,s_577_923_1,1


In [5]:
consensus_labels.head()

Unnamed: 0,query,q0,docid,rel
0,1,0,s_405_1279_15,1
1,1,0,s_2519_356_0,0
2,1,0,s_2038_51_7,1
3,1,0,s_975_61_2,0
4,1,0,s_577_923_1,1


In [6]:
# Load all parquet files into one DataFrame
parquet_files = [
    os.path.join(PARQUET_DIR, f)
    for f in os.listdir(PARQUET_DIR)
    if f.endswith(".parquet")
]
print(f"Found {len(parquet_files)} partition files.")

df = pd.concat([pd.read_parquet(f) for f in sorted(parquet_files)], ignore_index=True)
print("Combined shape:", df.shape)
df.head()

Found 2 partition files.
Combined shape: (4264693, 2)


Unnamed: 0,DOCNO,TEXT
0,s_1673_0_0,Ya that actually makes me suspicious.
1,s_1673_0_1,Bots dont do things they arent programmed to do.
2,s_1673_0_2,Why would this bot be accepting mod invites fr...
3,s_1673_0_3,And then suddenly remove themselves.
4,s_1673_0_4,Apprentice bot might not be just a bot...


In [7]:
df.describe()

Unnamed: 0,DOCNO,TEXT
count,4264693,4264693
unique,4264693,3624894
top,s_855_1999_0,The email shipped off your given account will ...
freq,1,12006


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4264693 entries, 0 to 4264692
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   DOCNO   object
 1   TEXT    object
dtypes: object(2)
memory usage: 65.1+ MB


In [9]:
df = df[df["TEXT"].notnull()]
df = df[df["TEXT"].str.strip().str.len() > 0]

print("After cleaning:")
print("Cleaned shape:", df.shape)
print("Unique texts:", df["TEXT"].nunique())

After cleaning:
Cleaned shape: (4264560, 2)
Unique texts: 3624863


In [10]:
import re

SELF_REF_PATTERNS = [
    r"\bI\b",
    r"\bI'm\b",
    r"\bI’m\b",
    r"\bI am\b",
    r"\bI was\b",
    r"\bI feel\b",
    r"\bI think\b",
    r"\bme\b",
    r"\bmy\b",
    r"\bmine\b",
    r"\bmyself\b",
    r"\bI've\b",
    r"\bI’ve\b",
]

self_ref_regex = re.compile("|".join(SELF_REF_PATTERNS), flags=re.IGNORECASE)

In [11]:
df["is_self_ref"] = df["TEXT"].apply(lambda x: bool(self_ref_regex.search(x)))

self_ref_df = df[df["is_self_ref"]].copy()
print(f"Self-referential sentences: {len(self_ref_df):,} out of {len(df):,}")

Self-referential sentences: 1,205,375 out of 4,264,560


In [12]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure tokenizer is ready
nltk.download("punkt")

# Define self-referential word lists
SELF_REFERENTIAL_WORDS = {
    "en": {
        "i",
        "me",
        "my",
        "mine",
        "myself",
        "i'm",
        "i’ve",
        "i'd",
        "i’ll",
        "i’d",
        "i’d",  # contractions
        "i’ve",
        "i'd",
        "i’ll",
        "i’ve",
        "i am",
        "i was",
    }
    # Extend with other languages if needed
}


# Function to calculate self-referential ratio
def calculate_self_referential_ratio(text, lang="en"):
    if lang not in SELF_REFERENTIAL_WORDS or not text:
        return 0.0

    words = word_tokenize(text.lower())
    total_words = len(words)

    if total_words == 0:
        return 0.0

    self_ref_words = SELF_REFERENTIAL_WORDS[lang]
    self_ref_count = sum(1 for word in words if word in self_ref_words)

    return self_ref_count / total_words

[nltk_data] Downloading package punkt to
[nltk_data]     /storage/home/hcoda1/6/dahumada3/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
nltk.data.path.append("/storage/home/hcoda1/6/dahumada3/nltk_data")
df["self_ref_ratio"] = df["TEXT"].apply(
    lambda x: calculate_self_referential_ratio(x, lang="en")
)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/storage/home/hcoda1/6/dahumada3/nltk_data'
    - '/storage/scratch1/6/dahumada3/venvs/erisk-2025/venv/nltk_data'
    - '/storage/scratch1/6/dahumada3/venvs/erisk-2025/venv/share/nltk_data'
    - '/storage/scratch1/6/dahumada3/venvs/erisk-2025/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/storage/home/hcoda1/6/dahumada3/nltk_data'
**********************************************************************


In [15]:
nltk.download("punkt_tab")
# Some issues with the latest nltk version see: https://github.com/nltk/nltk/issues/3293

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /storage/home/hcoda1/6/dahumada3/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

##### The next cell Takes too long on >4 Million posts

In [16]:
df["self_ref_ratio"] = df["TEXT"].apply(
    lambda x: calculate_self_referential_ratio(x, lang="en")
)

In [23]:
df[df["is_self_ref"]]

Unnamed: 0,DOCNO,TEXT,is_self_ref,self_ref_ratio
0,s_1673_0_0,Ya that actually makes me suspicious.,True,0.142857
7,s_1673_1_2,I checked his post history at one point and he...,True,0.066667
9,s_1673_1_4,The next two are my posts.,True,0.142857
10,s_1673_1_5,Night Mind is a youtuber I watch who specializ...,True,0.071429
11,s_1673_1_6,He doesnt have the most active sub but i thoug...,True,0.050000
...,...,...,...,...
4264557,s_855_1863_0,I am.. sadness,True,0.250000
4264561,s_855_1867_1,Source: http://pre04.deviantart.net/c6b6/th/p...,True,0.000000
4264575,s_855_1881_1,Source: http://pre14.deviantart.net/9bb6/th/p...,True,0.000000
4264577,s_855_1883_1,Source: http://pre12.deviantart.net/80fd/th/p...,True,0.000000


In [25]:
df[df["DOCNO"] == "s_855_1867_1"]

Unnamed: 0,DOCNO,TEXT,is_self_ref,self_ref_ratio
4264561,s_855_1867_1,Source: http://pre04.deviantart.net/c6b6/th/p...,True,0.0


In [26]:
print(df.loc[df["DOCNO"] == "s_855_1867_1", "TEXT"].values[0])

 Source: http://pre04.deviantart.net/c6b6/th/pre/i/2016/161/a/7/lighting_the_way___lux_by_zarory-da5m4js.jpgArtist: **Casper Hansen** from *Denmark*You can find more on him [here](http://zarory.deviantart.com)Software used: *Photoshop*


#### Looks like there's still some junk posts after filtering, If self ref ratio is zero these documents can be filtered out