### Imports

In [16]:
import gzip
import json
from collections import Counter, defaultdict
import regex  # Use 'regex' module instead of 're' for better Unicode support
import unicodedata

import numpy as np
import pandas as pd
from langdetect import detect, DetectorFactory, LangDetectException
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.nn.functional import softmax
import torch

### Data Loading

In [17]:
## GET LABELS
train_classification_df = pd.read_csv("training-dataset-labels.csv")

# Rename columns to match the original code's structure
train_classification_df = train_classification_df.rename(columns={'username': 'user_id', 'label': 'category'})

# Unify labels. The vectorised .str accessor leaves a missing label as NaN
# instead of raising TypeError the way apply(str.lower) does.
train_classification_df["category"] = train_classification_df["category"].str.lower()

# Dictionary mapping user_id (the username) to its category
username2_category = train_classification_df.set_index("user_id")["category"].to_dict()

## GET INPUT DATA
train_data_path = "training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()

# Decode explicitly as UTF-8: without it, text mode falls back to the platform
# default encoding, which can garble or reject Turkish characters and emoji.
with gzip.open(train_data_path, "rt", encoding="utf-8") as fh:
    for line in fh:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline) in the JSONL file
            continue
        sample = json.loads(line)
        profile = sample["profile"]
        username = profile["username"]
        if username in username2_category:
            # Labelled account -> training data
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            # Unlabelled account -> test data
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

# Drop labels whose username never appeared in the input file
len_before = len(username2_category)
username2_category = {k: v for k, v in username2_category.items() if k in username2posts_train}
len_after = len(username2_category)
print(f"Removed {len_before - len_after} labels with no input data.")

# One row per profile; the username stays available as a regular column
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

Removed 120 labels with no input data.


### Preprocessing

In [18]:
def preprocess_text(text: str) -> str:
    """Clean a raw caption or biography into a single normalised line.

    Steps, in order:
      1. NFC-normalise so that composed characters (e.g. Turkish letters
         written as base letter + combining mark) become single code points.
      2. Turn literal backslash-n sequences into spaces. Real newlines,
         carriage returns and tabs are handled by the whitespace collapse.
      3. Remove URLs that start with ``http://``, ``https://`` or ``www.``.
         The scheme separator / dot and the leading word boundary are
         required so ordinary words such as "awwww" are left untouched.
      4. Collapse whitespace runs to one space and strip both ends.

    Args:
        text: Raw text. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""

    # 1) Canonical composition
    text = unicodedata.normalize("NFC", text)

    # 2) Literal two-character "\n" left over from escaped data
    text = regex.sub(r'\\n', ' ', text)

    # 3) URLs (must run after step 2 so "…com\nfoo" does not swallow "foo")
    text = regex.sub(r'\b(?:https?://|www\.)\S+', '', text)

    # 4) Whitespace of any kind (\n, \r, \t, repeated spaces) -> single space
    text = regex.sub(r'\s+', ' ', text).strip()

    return text


In [19]:
# Training posts: keep only posts with a non-empty caption, clean the caption
# and attach the account's category as "class".
username2posts_train_preprocessed = {
    username: [
        {**post, "caption": preprocess_text(post["caption"]), "class": username2_category[username]}
        for post in posts
        if post.get("caption")
    ]
    for username, posts in username2posts_train.items()
}

# Test posts: same cleaning, no label. This must read from the *test* mapping;
# iterating the training mapping here would silently duplicate training data.
username2posts_test_preprocessed = {
    username: [
        {**post, "caption": preprocess_text(post["caption"])}
        for post in posts
        if post.get("caption")
    ]
    for username, posts in username2posts_test.items()
}

In [5]:
# # Add a column for post captions from username2posts_train
# train_profile_df['captions'] = train_profile_df['username'].apply(
#     lambda username: ' '.join(
#         [post['caption'] for post in username2posts_train.get(username, []) if 'caption' in post and post['caption']]
#     )
# )

# # Combine biography and captions
# train_profile_df['biography'] = train_profile_df['biography'].fillna('')
# train_profile_df['combined_raw_text'] = train_profile_df['biography'] + ' ' + train_profile_df['captions']

# # Drop rows with missing combined text
# train_profile_df = train_profile_df.dropna(subset=['combined_raw_text'])

# def preprocess_text0(text: str):
#     # 1) Normalize text so that Turkish casefolding doesn't split into multiple chars
#     text = unicodedata.normalize("NFC", text)

#     # 2) Lower-case (casefold) the text
#     text = text.casefold()

#     # Remove URLs
#     text = regex.sub(r'http\S+|www\S+|https\S+', '', text, flags=regex.MULTILINE)

#     # IMPORTANT: Include \p{M} to allow combining diacritics (otherwise "i + dot" can break words)
#     # This pattern keeps letters (L), combining marks (M), numbers (N), whitespace, #, @, emoji, etc.
#     text = regex.sub(r'[^\p{L}\p{M}\p{N}\s#@\p{So}\p{Sk}\p{Sm}\p{Emoji}]+', ' ', text)

#     # Remove standalone numbers (optional)
#     text = regex.sub(r'\s\d+\s', ' ', text)

#     # Remove extra whitespaces
#     text = regex.sub(r'\s+', ' ', text).strip()

#     return text

# train_profile_df['combined_processed_text'] = train_profile_df['combined_raw_text'].apply(preprocess_text0)

In [None]:
# Preview the first rows of the label table
train_classification_df.head()

In [None]:
# Count of non-null rows per category in the label table (class balance check)
train_classification_df.groupby("category").count()

In [20]:
# Column names of the label table after the rename
train_classification_df.columns

Index(['user_id', 'category'], dtype='object')

In [9]:
# print("Number of users in the training set: ", len(train_profile_df))
# print("Number of users in the test set: ", len(test_profile_df))

In [21]:
# A notebook cell renders only its last expression, so two separate lines would
# silently drop the first one. Return both column indexes together instead.
train_profile_df.columns, test_profile_df.columns

Index(['username', 'id', 'full_name', 'biography', 'category_name',
       'post_count', 'follower_count', 'following_count',
       'is_business_account', 'is_private', 'is_verified',
       'highlight_reel_count', 'bio_links', 'entities', 'ai_agent_type',
       'fb_profile_biolink', 'restricted_by_viewer', 'country_block',
       'eimu_id', 'external_url', 'fbid', 'has_clips',
       'hide_like_and_view_counts', 'is_professional_account',
       'is_supervision_enabled', 'is_guardian_of_viewer',
       'is_supervised_by_viewer', 'is_supervised_user', 'is_embeds_disabled',
       'is_joined_recently', 'business_address_json',
       'business_contact_method', 'business_email', 'business_phone_number',
       'business_category_name', 'overall_category_name', 'category_enum',
       'is_verified_by_mv4b', 'is_regulated_c18', 'profile_pic_url',
       'should_show_category', 'should_show_public_contacts',
       'show_account_transparency_details', 'profile_picture_base64'],
      dtyp

In [11]:
# username2posts_train['sercevdernegi']

In [12]:
# train_profile_df.loc[0, 'combined_raw_text']

In [13]:
# train_profile_df.loc[0, 'combined_processed_text']

In [14]:
# train_profile_df.loc[0, 'combined_processed_text']

In [22]:
# Attach the category to every training profile. Profiles carry the account
# name in "username"; the label table carries it in "user_id" (renamed from
# "username" when the labels were loaded). The inner join keeps matches only.
merged_train_df = train_profile_df.merge(
    train_classification_df,
    how="inner",
    left_on="username",
    right_on="user_id",
)

In [23]:
# Distinct categories in order of first appearance; the position is the class id
label_list = merged_train_df['category'].unique()
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

In [24]:
# Show the category names; their order defines the class ids used for training
label_list

array(['tech', 'food', 'health and lifestyle', 'travel', 'sports',
       'fashion', 'entertainment', 'mom and children', 'art', 'gaming'],
      dtype=object)

In [None]:
# Columns available after joining profiles with labels
merged_train_df.columns

### EDA

In [None]:
# Preview the first rows of the label table
train_classification_df.head()

In [None]:
# Preview the first rows of the training profile table
train_profile_df.head()

In [None]:
# Keys (usernames) of the preprocessed test-post mapping
username2posts_test_preprocessed.keys()

In [None]:
# Inspect the preprocessed posts of one account
# (raises KeyError if this username is not a key of the mapping)
username2posts_train_preprocessed["deparmedya"]


In [None]:
# Count of non-null rows per category in the label table
train_classification_df.groupby("category").count()

In [22]:
# Fixed seed for the language detector
DetectorFactory.seed = 0


def detect_language_distribution(texts):
    """Count how often each detected language occurs in ``texts``.

    Args:
        texts: Iterable of strings passed one by one to ``detect``.

    Returns:
        Counter mapping language code to number of texts. Texts for which
        ``detect`` raises ``LangDetectException`` are counted as 'unknown'.
    """
    def _language_of(text):
        try:
            return detect(text)
        except LangDetectException:
            return 'unknown'

    return Counter(_language_of(text) for text in texts)

In [23]:
def classify_emoji_content(text: str) -> int:
    """Classify a text by how it uses emojis.

    Args:
        text: The text to inspect.

    Returns:
      0 -> if text has no emojis
      1 -> if text has emojis AND other non-emoji characters
      2 -> if text consists only of emojis (whitespace between emojis and the
           invisible variation selectors / zero-width joiners that accompany
           emojis are ignored)
    """
    # \p{Emoji} also matches ASCII digits, '#' and '*' (keycap bases); a text
    # containing only those has no real emoji.
    emoji_chars = [
        ch for ch in regex.findall(r'\p{Emoji}', text)
        if ch not in "0123456789#*"
    ]
    if not emoji_chars:
        return 0

    # Remove everything invisible before comparing. Stripping only the ends
    # would classify "<emoji> <emoji>" or an emoji followed by U+FE0F as mixed
    # text even though the reader sees nothing but emojis.
    visible_text = regex.sub("[\\s\ufe0e\ufe0f\u200d]+", '', text)

    if visible_text == ''.join(emoji_chars):
        return 2

    return 1

In [24]:
# Accumulators filled in a single pass over the preprocessed training posts
captions = []           # one record per non-empty caption
missing_captions = 0    # posts whose caption is absent or empty at this point
empty_captions = 0      # subset of the above that do have a "caption" key

emoji_usage = [0, 0, 0]  # [no_emojis, mixed_text, only_emojis]
emoji_usage_per_category = defaultdict(Counter)

emojis = Counter()
emojis_per_category = defaultdict(Counter)

dates = Counter()

# NOTE(review): the posts iterated here were already filtered on the *raw*
# caption, so the missing/empty counters can only pick up captions that became
# empty during preprocessing -- confirm that this is the intended statistic.
for username, posts in username2posts_train_preprocessed.items():
    for post in posts:
        caption = post.get("caption")
        if not caption:
            missing_captions += 1
            if "caption" in post:
                empty_captions += 1
            continue

        category = username2_category[username]
        captions.append({"caption": caption, "class": post["class"], "username": username})

        usage_class = classify_emoji_content(caption)
        emoji_usage[usage_class] += 1
        emoji_usage_per_category[category][usage_class] += 1

        # Characters 5..9 of the timestamp; presumably "MM-DD" of an ISO-like
        # date string -- TODO confirm the timestamp format
        dates[post["timestamp"][5:10]] += 1

        for symbol in regex.findall(r'\p{Emoji}', caption):
            # Digits, '#' and '*' match \p{Emoji} but are not counted as emojis
            if symbol in "0123456789#*":
                continue
            emojis[symbol] += 1
            emojis_per_category[category][symbol] += 1




In [None]:
# Distribution of emoji content in captions
labels = ["No emojis", "Mixed text", "Only emojis"]

plt.figure(figsize=(6, 4))
plt.bar(labels, emoji_usage, color=["steelblue", "salmon", "seagreen"])
plt.title("Emoji Distribution")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# Most Common Emojis
emojis.most_common(20)

In [None]:
# Most Common Emojis per Category
for category, counter in emojis_per_category.items():
    print(f"Category: {category}")
    print(counter.most_common(10))
    print()

In [None]:
# Distribution of emoji content in captions per category
# for category, emoji_counts in emojis_per_category.items():
#     total = sum(emoji_counts.values())
#     print(f"Category: {category}, Total: {total}")
#     print("Emoji Distribution:", {labels[i]: count/total for i, count in emoji_counts.items()})

# Plot the distribution of emojis per category
num_categories = len(emoji_usage_per_category)
fig, axes = plt.subplots((num_categories + 2) // 3, 3, figsize=(15, 15))
axes = axes.flatten()  # Flatten the axes array for easy indexing

for i, (category, emoji_counts) in enumerate(emoji_usage_per_category.items()):
    total = sum(emoji_counts.values())
    ax = axes[i]
    ax.bar(labels, [count/total for count in emoji_counts.values()], color=["steelblue", "salmon", "seagreen"])
    ax.set_title(f"Category: {category}")
    ax.set_ylabel("Proportion")

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()


In [None]:
# Summary of the caption counters gathered during the EDA pass
for description, value in (
    ("Number of captions:", len(captions) + missing_captions),
    ("Number of posts with missing captions:", missing_captions),
    ("Number of posts with empty captions:", empty_captions),
):
    print(description, value)

In [30]:
# Detect language distribution in captions
caption_lang_dist = detect_language_distribution([item["caption"] for item in captions])
caption_lang_dist_df = pd.DataFrame(
        caption_lang_dist.items(), 
        columns=['language', 'count']
    ).sort_values(by='count', ascending=False)

# print(caption_lang_dist_df)

In [None]:
# Bar chart of caption languages; log scale keeps rare languages visible
ax = caption_lang_dist_df.plot.bar(x='language', y='count', legend=False, logy=True)
ax.set_xlabel("Language")
ax.set_ylabel("Count (Log Scale)")
ax.set_title("Language Distribution of Captions")
ax.figure.tight_layout()
plt.show()

In [None]:
# The 30 most frequent date keys, ordered by their first two characters ("Month")
# and by everything after the separator ("Day")
dates_df = (
    pd.DataFrame(list(dates.most_common(30)), columns=['Date', 'Count'])
    .assign(
        Month=lambda frame: frame["Date"].str[:2],
        Day=lambda frame: frame["Date"].str[3:],
    )
    .sort_values(by=["Month", "Day"])
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(dates_df['Date'], dates_df['Count'], s=100, color='b', alpha=0.7)
ax.set_title("Most Common Dates in Posts")
ax.set_xlabel("Dates")
ax.set_ylabel("Count")
ax.tick_params(axis="x", labelrotation=45)
ax.grid(True, linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
# Every date key with its post count, ordered by "Month" then "Day".
# This replaces the top-30 frame built for the previous figure.
dates_df = (
    pd.DataFrame(list(dates.items()), columns=['Date', 'Count'])
    .assign(
        Month=lambda frame: frame["Date"].str[:2],
        Day=lambda frame: frame["Date"].str[3:],
    )
    .sort_values(by=["Month", "Day"])
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(dates_df['Date'], dates_df['Count'], s=100, color='b', alpha=0.7)
ax.set_title("Date Distribution in Posts")
ax.set_xlabel("Dates")
ax.set_ylabel("Count")
ax.set_xticks([])  # too many dates to label individually
ax.grid(True, linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

#### Biographies

In [None]:
# use merged_train_df to get the biographs and its category
bio_cat_list = (
    merged_train_df[['username','biography','category']]
    .dropna(subset=['biography'])
    .to_dict(orient='records')
)

for row in bio_cat_list:
    processed_bio = preprocess_text(row['biography'])
    captions.append({
        "caption": processed_bio,
        "class": row['category'],
        "username": row['username']
    })

### BERT

In [43]:
# One row per collected text, with the category translated to its integer id
df_train_captions = pd.DataFrame(captions).assign(
    label_id=lambda frame: frame["class"].map(label2id)
)

model_columns = ["caption", "label_id", "username"]
dataset = Dataset.from_pandas(df_train_captions[model_columns])

# 80/20 split with a fixed seed.
# NOTE(review): the split is per row, so texts of the same username can end up
# in both parts -- confirm this is acceptable for the evaluation.
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]

In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier
model_name = "dbmdz/bert-base-turkish-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list))

# Store the id <-> category mappings on the model config for readability
model.config.id2label, model.config.label2id = id2label, label2id

In [36]:
def tokenize_function(examples):
    """Tokenize the "caption" field of a batch with the notebook's tokenizer.

    Every caption is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping with a "caption" entry.

    Returns:
        Whatever the tokenizer returns for these captions.
    """
    encoding_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["caption"], **encoding_options)

def rename_label_column(examples):
    """Expose the integer class ids under the key "labels".

    The values of "label_id" are copied to "labels"; "label_id" itself is
    kept. The batch is modified in place and returned.
    """
    label_ids = examples["label_id"]
    examples["labels"] = label_ids
    return examples

In [None]:
# Tokenize both splits batch-wise with the same function
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Add the "labels" column to both tokenized splits
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(rename_label_column, batched=True)
    for split in (tokenized_train_dataset, tokenized_eval_dataset)
)

In [None]:
# Check which columns the tokenized training split now carries
tokenized_train_dataset.column_names

In [None]:
# Training configuration: 3 epochs, batch size 8, evaluation and checkpoint at
# the end of every epoch, best checkpoint restored when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The held-out split is passed as eval_dataset for the per-epoch evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

trainer.train()

In [None]:
# Save the trained model and tokenizer locally
save_directory = "./saved_model"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")