# Saving the Reddit Data

First, we download the zst files (originally obtained from the Torrent) from Google Drive.

In [None]:
import gdown
import pandas as pd

# Google Drive share links, one per subreddit archive
google_drive_links = [
    "https://drive.google.com/file/d/1s5RrA78XXEwq0X2oyUnoSCocnez7HaAL/view?usp=share_link",
    "https://drive.google.com/file/d/1mOdseJhyoCe76mU3xnsSEjND7vkzg1ZY/view?usp=share_link",
    "https://drive.google.com/file/d/1ZvsSTim8nwKTMoxRljs6ANpFAJu9rlcv/view?usp=share_link"
]

# Subreddit names, listed in the same order as google_drive_links
subreddits = [
    "dsa", "thenewright", "statistics"
]

# The two lists are matched by position, so a length mismatch would pair a
# link with the wrong subreddit (or silently skip one); fail early instead.
if len(google_drive_links) != len(subreddits):
    raise ValueError("google_drive_links and subreddits must have the same length")

# Iterate over the pairs rather than hard-coded indices, so extending both
# lists is all that is needed to download another archive.
for link, subreddit in zip(google_drive_links, subreddits):
    # The file ID is the second-to-last "/"-separated segment of the share link
    file_id = link.split("/")[-2]

    # Direct-download URL built from the file ID
    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Each archive is saved in the working directory as "<subreddit>.zst"
    output_file = subreddit + ".zst"

    # Download the file (quiet=False keeps gdown's progress output)
    gdown.download(download_url, output_file, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1s5RrA78XXEwq0X2oyUnoSCocnez7HaAL
To: /content/dsa.zst
100%|██████████| 8.64M/8.64M [00:00<00:00, 29.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1mOdseJhyoCe76mU3xnsSEjND7vkzg1ZY
To: /content/thenewright.zst
100%|██████████| 17.3M/17.3M [00:00<00:00, 86.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZvsSTim8nwKTMoxRljs6ANpFAJu9rlcv
To: /content/statistics.zst
100%|██████████| 77.4M/77.4M [00:00<00:00, 106MB/s]


Now, we define functions that turn each zst file into a data frame we can analyse.

In [None]:
!pip install zstandard ndjson
import zstandard as zstd
import ndjson

def read_zst_file(file_path, max_window_size=2**31):
    """Decompress a Zstandard file and return its entire content as bytes.

    Args:
        file_path: Path of the ``.zst`` file to read.
        max_window_size: Largest decompression window, in bytes, that the
            decompressor will accept. The library default rejects frames
            written with a long window; the larger default here lets such
            archives be read as well, while anything that decoded before
            still decodes.

    Returns:
        The decompressed content of the file as one ``bytes`` object.

    Note:
        The whole decompressed payload is held in memory at once, so this is
        only suitable for archives that fit in RAM.
    """
    with open(file_path, "rb") as f:
        dctx = zstd.ZstdDecompressor(max_window_size=max_window_size)
        with dctx.stream_reader(f) as reader:
            # read() without a size drains the stream to the end
            return reader.read()

def read_ndjson_zst(file):
    """Read a Zstandard-compressed NDJSON file.

    Args:
        file: Path of the ``.zst`` file, passed straight to ``read_zst_file``.

    Returns:
        Whatever ``ndjson.loads`` produces for the decompressed content
        (presumably one parsed object per input line - confirm against the
        ndjson documentation).
    """
    return ndjson.loads(read_zst_file(file))

def extract_fields(data, fields):
    """Project every record in ``data`` onto the keys listed in ``fields``.

    Args:
        data: Iterable of mapping-like records.
        fields: Keys to keep; they appear in each result in this order.

    Returns:
        A list with one new dict per record, containing only ``fields``.

    Raises:
        KeyError: If a dict record lacks one of the requested fields.
    """
    trimmed = []
    for record in data:
        row = {}
        for name in fields:
            row[name] = record[name]
        trimmed.append(row)
    return trimmed


# The download step saves each archive as "<subreddit>.zst" in the working
# directory, so read from the same relative location instead of assuming the
# Colab-specific "/content/" prefix.
paths = [subreddit + ".zst" for subreddit in subreddits]
data_list = []

for path in paths:
    # Parse the archive, then keep only the columns used in the analysis
    records = read_ndjson_zst(path)
    rows = extract_fields(records, ["author", "body", "controversiality", "score", "created_utc"])

    data = pd.DataFrame(rows)
    # Drop comments whose body is the literal "[deleted]" placeholder
    data = data[data['body'] != '[deleted]']
    # Order rows by score (ascending, the pandas default)
    data = data.sort_values(by="score")

    # data_list ends up with one frame per subreddit, in the order of `subreddits`
    data_list.append(data)

Collecting zstandard
  Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting ndjson
  Downloading ndjson-0.3.1-py2.py3-none-any.whl.metadata (3.2 kB)
Downloading zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ndjson-0.3.1-py2.py3-none-any.whl (5.3 kB)
Installing collected packages: ndjson, zstandard
Successfully installed ndjson-0.3.1 zstandard-0.23.0


Lastly, we download each data frame as a gzip-compressed CSV file.

In [None]:
from google.colab import files

# Save every subreddit's frame as "<subreddit>.csv.gz" (gzip, no index column)
# and pass the written file to Colab's files.download helper.
for subreddit, frame in zip(subreddits, data_list):
  archive_name = f"{subreddit}.csv.gz"
  frame.to_csv(archive_name, index=False, compression='gzip')
  files.download(archive_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Add fake labels using ChatGPT
(based on the file from lab5)

In [None]:
from openai import OpenAI

import os
from getpass import getpass

# Keep the OpenAI API key out of the notebook: read it from the OPENAI_API_KEY
# environment variable, and fall back to a prompt (input is not echoed) when
# the variable is unset or empty.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "),
)
def classify_username_gender(username):
  """Ask the chat model to guess a gender category for a username.

  Sends one user message to the module-level ``client`` using the
  ``gpt-4o-mini`` model, requesting a single completion of at most 10 tokens.

  Args:
    username: The username to classify; it is interpolated into the prompt.

  Returns:
    The model's reply with surrounding whitespace and a trailing period
    removed, lower-cased. The prompt asks for one of ``male``, ``female``,
    ``queer`` or ``other``, but the reply is not validated against that list.
  """
  prompt = f"Classify the username '{username}' into one of the following categories: male, female, queer, other. Respond with only one word and try to avoid the other category."
  chat_completion = client.chat.completions.create(
      messages=[
          {
              "role": "user",
              "content": prompt,
          }
      ],
      model="gpt-4o-mini",
      n=1,
      max_tokens=10
  )

  # Normalise the reply so that "Male", "male." and "male\n" all count as the
  # same category instead of creating spurious extra labels.
  reply = chat_completion.choices[0].message.content
  return reply.strip().rstrip(".").lower()

def classify_comment(comment):
  """Ask the chat model whether a comment contains hate speech.

  Sends one user message to the module-level ``client`` using the
  ``gpt-4o-mini`` model, requesting a single completion of at most 10 tokens.

  Args:
    comment: The comment text; it is interpolated into the prompt between
      triple quotes.

  Returns:
    The model's reply with surrounding whitespace and a trailing period
    removed, lower-cased. The prompt asks for ``HATE`` or ``NON_HATE``, so the
    expected values are ``"hate"`` and ``"non_hate"``, but the reply is not
    validated against that list.
  """
  prompt = f"Classify the comment '''{comment}''' according to the presence of hate speech into one of the following categories: HATE, NON_HATE. Respond with only one word"
  chat_completion = client.chat.completions.create(
      messages=[
          {
              "role": "user",
              "content": prompt,
          }
      ],
      model="gpt-4o-mini",
      n=1,
      max_tokens=10
  )

  # The label is later compared with exact string equality, so strip stray
  # whitespace and a trailing period ("HATE." / "hate\n") before lower-casing.
  reply = chat_completion.choices[0].message.content
  return reply.strip().rstrip(".").lower()

from tqdm.notebook import tqdm

# NOTE(review): df_dsa_label is not defined anywhere in the visible notebook;
# presumably it is the "dsa" frame (or a sample of it) - confirm and define it
# explicitly so the notebook survives Restart & Run All.

# Classify each distinct author only once: this avoids paying for repeated API
# calls and guarantees that one author always gets the same gender label.
gender_by_author = {}
genders = []
for author in tqdm(df_dsa_label["author"]):
  if author not in gender_by_author:
    gender_by_author[author] = classify_username_gender(author)
  genders.append(gender_by_author[author])

hate_labels = []
for com in tqdm(df_dsa_label["body"]):
  hate_labels.append(classify_comment(com))
df_dsa_label["label_value"] = hate_labels
df_dsa_label["gender"] = genders
# Save the full frame with the raw string labels before they are binarised
df_dsa_label.to_csv("df_dsa_label_true_full.csv.gz", compression="gzip")

# Binarise in place: score becomes "score > 0.5", label_value becomes
# "reply was exactly 'hate'", and the comment body is duplicated as "text".
df_dsa_label["score"] = df_dsa_label["score"] > 0.5
df_dsa_label["label_value"] = df_dsa_label["label_value"] == "hate"
df_dsa_label["text"] = df_dsa_label["body"]
# Export only the four columns used downstream
df_dsa_label.loc[:,["score", "label_value", "gender", "text"]].to_csv("df_dsa_label_true.csv.gz", compression="gzip")
