### Imports

In [1]:
import gzip
import json
import regex  # Use 'regex' module instead of 're' for better Unicode support
import unicodedata

import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from torch.nn.functional import softmax
import torch

### Data

In [2]:
## GET LABELS
train_classification_df = pd.read_csv("training-dataset-labels.csv")

# Rename columns to match the original code's structure
train_classification_df = train_classification_df.rename(columns={'username': 'user_id', 'label': 'category'})

# Unify labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)

# Convert the DataFrame into a dictionary mapping user_id to category
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

## GET INPUT DATA
train_data_path = "training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)
    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile

# Remove labels with no inputs username2profile_train.loc["username"]:
len_before = len(username2_category)
username2_category = {k: v for k, v in username2_category.items() if k in username2posts_train}
len_after = len(username2_category)
print(f"Removed {len_before - len_after} labels with no input data.")

train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

Removed 120 labels with no input data.


In [3]:
# Load the usernames from "test-classification-round3.dat"
round_usernames = []
with open("test-classification-round3.dat", "r") as f:
  for line in f:
    round_usernames.append(line.strip())
  
len_before = len(round_usernames)
print(f"Loaded {len_before} usernames from 'test-classification-round3.dat'.")

Loaded 1000 usernames from 'test-classification-round3.dat'.


In [4]:
test_count = 0
train_count = 0
not_found = 0

round_usernames_to_classify = []
round_usernames_classified = {}

for username in round_usernames:
  if username in test_profile_df.username.values:
    round_usernames_to_classify.append(username)
    test_count += 1
  if username in train_profile_df.username.values:
    round_usernames_classified[username] = username2_category[username]
    train_count += 1

print(f"Found {test_count} usernames in test data and {train_count} usernames in train data.")
print(f"Found {not_found} usernames in neither train nor test data.")

Found 713 usernames in test data and 287 usernames in train data.
Found 0 usernames in neither train nor test data.


### Preprocessing

In [5]:
def preprocess_text(text: str) -> str:
    # 1) Normalize the text to NFC so that Turkish characters are properly composed
    text = unicodedata.normalize("NFC", text)
    
    # 2) Remove or replace escape sequences (both actual and literal)
    #    This handles actual newlines (\n) and also the literal two-character sequence '\n'
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    text = text.replace('\t', ' ')
    text = regex.sub(r'\\n', ' ', text)  # if you have literal "\n" in your data

    # 3) Remove URLs (http, https, www)
    text = regex.sub(r'http\S+|www\S+|https\S+', '', text)

    # 4) Remove extra spaces
    text = regex.sub(r'\s+', ' ', text).strip()

    return text


In [6]:
username2posts_test_preprocessed = dict()
for username, posts in username2posts_test.items():
    username2posts_test_preprocessed[username] = [
        {**post, "caption": preprocess_text(post["caption"])} 
        for post in posts if "caption" in post and post["caption"]
    ]

In [7]:
round = []

for username in round_usernames_to_classify:
    if username not in username2posts_test_preprocessed:
        print(f"Warning: {username} has no posts with captions.")
    else:
        posts = username2posts_test_preprocessed[username]
        for post in posts:
            round.append({
                "username": username,
                "caption": post["caption"]
            })

        if username2profile_test[username]["biography"]:
            round.append({"username": username, "caption": preprocess_text(username2profile_test[username]["biography"]) })

df_round = pd.DataFrame(round)

### Evaluate the model

In [8]:
# username2posts_train_preprocessed = dict()
# for username, posts in username2posts_train.items():
#     username2posts_train_preprocessed[username] = [
#         {**post, "caption": preprocess_text(post["caption"]), "class": username2_category[username]} 
#         for post in posts if "caption" in post and post["caption"]
#     ]

In [9]:
# merged_train_df = pd.merge(
#     train_profile_df,               # has column "id"
#     train_classification_df,        # has column "user_id"
#     left_on="username",                   # train_profile_df.id
#     right_on="user_id",            # train_classification_df.user_id
#     how="inner"                     # only keep matching rows
# )

In [10]:
# captions = []

# for username, posts in username2posts_train_preprocessed.items():
#     for post in posts:
#         if "caption" in post and post["caption"]:
#             captions.append({"caption": post["caption"], "class": post["class"], "username": username})

In [11]:
# label_list = ['tech', 'food', 'health and lifestyle', 'travel', 'sports', 'fashion', 'entertainment', 'mom and children', 'art', 'gaming']
# label2id = {lbl: i for i, lbl in enumerate(label_list)}
# id2label = {i: lbl for i, lbl in enumerate(label_list)}

In [12]:
# df_train_captions = pd.DataFrame(captions)
# df_train_captions["label_id"] = df_train_captions["class"].map(label2id)

# dataset = Dataset.from_pandas(df_train_captions[["caption", "label_id", "username"]])

# dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
# train_dataset = dataset_split["train"]
# eval_dataset = dataset_split["test"]

In [13]:
# model_path = "./checkpoint-9869"

# model = AutoModelForSequenceClassification.from_pretrained(model_path)
# tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

In [14]:
# def tokenize_function(examples):
#     return tokenizer(
#         examples["caption"],
#         truncation=True,
#         padding="max_length",
#         max_length=128
#     )

# def rename_label_column(examples):
#     examples["labels"] = examples["label_id"]
#     return examples

In [15]:
# tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
# tokenized_eval_dataset = tokenized_eval_dataset.map(rename_label_column, batched=True)

In [16]:
# # Extract logits and labels
# logits = predictions.predictions
# labels = predictions.label_ids

# # Compute predictions from logits
# predicted_labels = torch.argmax(torch.tensor(logits), axis=1)

# # Compute accuracy
# accuracy = accuracy_metric.compute(predictions=predicted_labels, references=labels)["accuracy"]

# # Print the accuracy
# print(f"Accuracy of the model: {accuracy * 100:.2f}%")

### Prediction

In [17]:
model_path = "./model"

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [18]:
def tokenize_function(examples):
    return tokenizer(
        examples["caption"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

def rename_label_column(examples):
    examples["labels"] = examples["label_id"]
    return examples

In [19]:
label_list = ['tech', 'food', 'health and lifestyle', 'travel', 'sports', 'fashion', 'entertainment', 'mom and children', 'art', 'gaming']
label2id = {lbl: i for i, lbl in enumerate(label_list)}
id2label = {i: lbl for i, lbl in enumerate(label_list)}

In [24]:
# Create a Dataset from df_round
ds_infer = Dataset.from_pandas(df_round)

# Tokenize
ds_infer = ds_infer.map(tokenize_function, batched=True)

# Define a Trainer for prediction
trainer = Trainer(model=model, tokenizer=tokenizer)

# Predict
preds = trainer.predict(ds_infer)



Map:   0%|          | 0/24574 [00:00<?, ? examples/s]

  trainer = Trainer(model=model, tokenizer=tokenizer)


  0%|          | 0/3072 [00:00<?, ?it/s]

In [27]:
pred_label_ids = np.argmax(preds.predictions, axis=1)
df_round["predicted_class"] = [label_list[i] for i in pred_label_ids]

post_probs = softmax(torch.tensor(preds.predictions), dim=-1).numpy()
df_round["probs"] = list(post_probs)

In [29]:
def avg_probs(probs_list):
    # probs_list is a list of arrays, each array is shape [num_labels, ]
    # We can stack them and take mean
    arr = np.vstack(probs_list)  # shape [num_posts_for_user, num_labels]
    return arr.mean(axis=0)      # shape [num_labels,]

In [30]:
df_round_res = (
    df_round
    .groupby("username")["probs"]
    .apply(lambda x: avg_probs(x))
    .reset_index(name="avg_probs")
)

In [34]:
df_round_res["user_predicted_category"] = df_round_res["avg_probs"].apply(lambda x: label_list[np.argmax(x)])

In [47]:
round_usernames_classified.update(df_round_res.set_index("username")["user_predicted_category"].to_dict())

In [51]:
with open("prediction-classification-round3.json", "w") as f:
    json.dump(round_usernames_classified, f, indent=2)