In [60]:
import numpy as np
import pandas as pd
import gzip
import json
from pprint import pprint
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import joblib

train_classification_df = pd.read_csv("./train-classification.csv")
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

train_data_path = "./training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()
username2posts_test = dict()
username2profile_test = dict()

with gzip.open(train_data_path, "rt", errors="ignore") as fh:
    for line in fh:
        sample = json.loads(line)
        profile = sample["profile"]
        username = profile["username"]
        if username in username2_category:
            username2posts_train[username] = sample["posts"]
            username2profile_train[username] = profile
        else:
            username2posts_test[username] = sample["posts"]
            username2profile_test[username] = profile

train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

total_captions = 0
for posts in username2posts_train.values():
    for post in posts:
        if "caption" in post and isinstance(post["caption"], str) and post["caption"].strip():
            total_captions += 1
print(f"Total number of valid captions in username2posts_train: {total_captions}")

Total number of valid captions in username2posts_train: 91285


In [61]:
# Filter captions with a length of at least 10 characters in username2posts_train
for username, posts in username2posts_train.items():
    username2posts_train[username] = [
        post for post in posts if "caption" in post and isinstance(post["caption"], str) and len(post["caption"].strip()) >= 10
    ]

total_captions_filtered = sum(
    len(posts) for posts in username2posts_train.values()
)
print(f"Total number of valid captions in username2posts_train after filtering: {total_captions_filtered}")


Total number of valid captions in username2posts_train after filtering: 89690


In [None]:
model_name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

In [None]:
def get_embeddings(captions, batch_size=32):
    embeddings = []
    with tqdm(total=len(captions), desc="Generating Embeddings", unit="caption") as pbar:
        for i in range(0, len(captions), batch_size):
            batch_captions = captions[i:i+batch_size]
            inputs = tokenizer(batch_captions, padding=True, truncation=True, return_tensors="pt")
            inputs = {key: value.to(device) for key, value in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

            embeddings.append(batch_embeddings)
            pbar.update(len(batch_captions))

    return np.vstack(embeddings)

In [None]:
train_embeddings = []
train_labels = []

all_captions = []
all_labels = []

for username, posts in tqdm(username2posts_train.items(), desc="Collecting Captions"):
    category = username2_category[username]
    for post in posts:
        if post and isinstance(post, dict):  # Ensure post is valid
            caption = post.get("caption", "")
            if caption and isinstance(caption, str):  # Ensure caption is a string
                caption = caption.strip()
                if caption:
                    all_captions.append(caption)
                    all_labels.append(category)

print(f"Total number of valid captions for training: {len(all_captions)}")

Collecting Captions: 100%|██████████| 2741/2741 [00:00<00:00, 8637.03it/s]

Total number of valid captions for training: 91285





In [9]:
train_embeddings = get_embeddings(all_captions)
train_labels = np.array(all_labels)

print(f"Shape of train_embeddings: {train_embeddings.shape}")
print(f"Shape of train_labels: {train_labels.shape}")

Generating Embeddings: 100%|██████████| 89690/89690 [14:16<00:00, 104.68caption/s]

Shape of train_embeddings: (89690, 1024)
Shape of train_labels: (89690,)





In [21]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)

# Convert embeddings and labels to torch tensors
X = torch.FloatTensor(train_embeddings)  # shape: [N, embedding_dim]
y = torch.LongTensor(train_labels_encoded)  # shape: [N]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

class CaptionDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = CaptionDataset(X_train, y_train)
val_dataset = CaptionDataset(X_val, y_val)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim

class AdvancedMLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, dropout_rate=0.5):
        super(AdvancedMLPClassifier, self).__init__()

        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Dropout(p=dropout_rate)
        )

        self.layer2 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate * 0.5)
        )

        self.layer3 = nn.Sequential(
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.BatchNorm1d(hidden_dim // 4),
            nn.GELU(),
            nn.Dropout(p=dropout_rate * 0.3)
        )

        self.output_layer = nn.Linear(hidden_dim // 4, num_classes)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.output_layer(x)
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Hyperparameters
input_dim = X.shape[1]  # Embedding dimension
hidden_dim = 512
num_classes = len(label_encoder.classes_)
dropout_rate = 0.4

model = AdvancedMLPClassifier(input_dim, hidden_dim, num_classes, dropout_rate)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)

    # Learning rate scheduler step
    scheduler.step()

    model.eval()
    val_loss = 0.0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(batch_y.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {avg_train_loss:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}, "
          f"Learning Rate: {scheduler.get_last_lr()[0]:.6f}")


Using device: cuda
Epoch [1/10], Train Loss: 1.2504, Val Loss: 1.0714, Learning Rate: 0.000976
Epoch [2/10], Train Loss: 1.0995, Val Loss: 0.9886, Learning Rate: 0.000905
Epoch [3/10], Train Loss: 1.0137, Val Loss: 0.9362, Learning Rate: 0.000794
Epoch [4/10], Train Loss: 0.9412, Val Loss: 0.8975, Learning Rate: 0.000655
Epoch [5/10], Train Loss: 0.8728, Val Loss: 0.8667, Learning Rate: 0.000500
Epoch [6/10], Train Loss: 0.8135, Val Loss: 0.8477, Learning Rate: 0.000345
Epoch [7/10], Train Loss: 0.7628, Val Loss: 0.8275, Learning Rate: 0.000206
Epoch [8/10], Train Loss: 0.7206, Val Loss: 0.8187, Learning Rate: 0.000095
Epoch [9/10], Train Loss: 0.6895, Val Loss: 0.8134, Learning Rate: 0.000024
Epoch [10/10], Train Loss: 0.6729, Val Loss: 0.8175, Learning Rate: 0.000000


In [47]:
print("Classification report on validation set:")
print(classification_report(all_targets, all_preds, target_names=label_encoder.classes_))

Classification report on validation set:
                      precision    recall  f1-score   support

                 art       0.71      0.54      0.61      1198
       entertainment       0.66      0.56      0.61      2048
             fashion       0.74      0.79      0.76      1937
                food       0.80      0.86      0.83      3349
              gaming       0.89      0.38      0.54        86
health and lifestyle       0.65      0.75      0.69      3367
    mom and children       0.71      0.53      0.61       995
              sports       0.72      0.64      0.68       715
                tech       0.74      0.82      0.78      2331
              travel       0.83      0.73      0.77      1912

            accuracy                           0.73     17938
           macro avg       0.74      0.66      0.69     17938
        weighted avg       0.73      0.73      0.72     17938



In [48]:
# Combine train and validation datasets
full_dataset = torch.utils.data.ConcatDataset([train_dataset, val_dataset])

combined_loader = torch.utils.data.DataLoader(full_dataset, batch_size=64, shuffle=True)

model = AdvancedMLPClassifier(input_dim, hidden_dim, num_classes, dropout_rate)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch_X, batch_y in combined_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(combined_loader)

    # Update scheduler
    scheduler.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {avg_train_loss:.4f}, "
          f"Learning Rate: {scheduler.get_last_lr()[0]:.6f}")

test_dataset = torch.utils.data.ConcatDataset([train_dataset, val_dataset])
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        outputs = model(batch_X)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(batch_y.cpu().numpy())

print("Classification Report:")
print(classification_report(all_targets, all_preds, target_names=label_encoder.classes_))

Epoch [1/10], Train Loss: 1.2091, Learning Rate: 0.000976
Epoch [2/10], Train Loss: 1.0434, Learning Rate: 0.000905
Epoch [3/10], Train Loss: 0.9572, Learning Rate: 0.000794
Epoch [4/10], Train Loss: 0.8845, Learning Rate: 0.000655
Epoch [5/10], Train Loss: 0.8261, Learning Rate: 0.000500
Epoch [6/10], Train Loss: 0.7666, Learning Rate: 0.000345
Epoch [7/10], Train Loss: 0.7191, Learning Rate: 0.000206
Epoch [8/10], Train Loss: 0.6774, Learning Rate: 0.000095
Epoch [9/10], Train Loss: 0.6475, Learning Rate: 0.000024
Epoch [10/10], Train Loss: 0.6296, Learning Rate: 0.000000
Classification Report:
                      precision    recall  f1-score   support

                 art       0.86      0.74      0.80      5988
       entertainment       0.77      0.73      0.75     10238
             fashion       0.85      0.88      0.86      9687
                food       0.90      0.91      0.91     16747
              gaming       0.93      0.55      0.69       428
health and lifestyle   

In [49]:
embedding_model_name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name).to(device).eval()

In [50]:
test_usernames = []
with open("test-classification-round3.dat", "r") as f:
    test_usernames = [line.strip() for line in f]

test_usernames = [u for u in test_usernames if u in username2posts_test or username2posts_train]
print(f"Number of valid test usernames found: {len(test_usernames)}")

Number of valid test usernames found: 1000


In [52]:
import re
from collections import Counter
from tqdm import tqdm
import json
import torch

# Helper function to check if a caption is meaningful
def is_meaningful_caption(caption):
    # Remove emojis and other non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', caption)
    # Check if there are any alphabetic characters
    return bool(re.search(r'[a-zA-Z]', text))

# Classify each post and perform hard voting for that user
def classify_user_by_posts(username, posts, tokenizer, embedding_model, model, device, batch_size=32):
    captions = []
    for post in posts:
        if "caption" in post and isinstance(post["caption"], str):
            caption = post["caption"].strip()
            if is_meaningful_caption(caption):  # Filter out meaningless captions
                captions.append(caption)

    if not captions:
        return None  # No meaningful captions available for this user

    # Predict category for each post
    predictions = []
    with torch.no_grad():
        for i in range(0, len(captions), batch_size):
            batch_captions = captions[i:i + batch_size]
            inputs = tokenizer(batch_captions, padding=True, truncation=True, return_tensors="pt")
            inputs = {key: value.to(device) for key, value in inputs.items()}

            outputs = embedding_model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).to(device)

            # Classify using the trained model
            logits = model(batch_embeddings)
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)

    # Perform hard voting
    category_counts = Counter(predictions)
    majority_category = category_counts.most_common(1)[0][0]

    return label_encoder.inverse_transform([majority_category])[0]

# Classify each user using hard voting
output_dict = {}
for username in tqdm(test_usernames, desc="Classifying users via hard voting"):
    if username in username2posts_test:
        posts = username2posts_test[username]
    elif username in username2posts_train:
        posts = username2posts_train[username]
    predicted_category = classify_user_by_posts(username, posts, tokenizer, embedding_model, model, device)
    if predicted_category:
        output_dict[username] = predicted_category

with open("test_predictions_round3.json", "w") as json_file:
    json.dump(output_dict, json_file, indent=4)

Classifying users via hard voting: 100%|██████████| 1000/1000 [05:08<00:00,  3.24it/s]


In [53]:
def log_mse_like_counts(y_true, y_pred):
  """
  Calculate the Log Mean Squared Error (Log MSE) for like counts (log(like_count + 1)).

  Parameters:
  - y_true: array-like, actual like counts
  - y_pred: array-like, predicted like counts

  Returns:
  - log_mse: float, Log Mean Squared Error
  """
  # Ensure inputs are numpy arrays
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)

  # Log transformation: log(like_count + 1)
  log_y_true = np.log1p(y_true)
  log_y_pred = np.log1p(y_pred)

  # Compute squared errors
  squared_errors = (log_y_true - log_y_pred) ** 2

  # Return the mean of squared errors
  return np.mean(squared_errors)

In [54]:
from datetime import datetime

def extract_post_features(post, user_avg_likes):
    features = {}

    # 1) Caption length
    caption = post.get("caption") or ""
    features["caption_length"] = len(caption)

    # 2) Number of hashtags
    # Count occurrences of # in caption
    features["num_hashtags"] = caption.count("#")

    # 3) Comments count
    comments_count = post.get("comments_count", 0) or 0
    features["comments_count"] = comments_count

    # 4) Media type
    media_type = post.get("media_type", "").upper()
    features["is_video"] = 1 if media_type == "VIDEO" else 0
    features["is_image"] = 1 if media_type == "IMAGE" else 0

    # 5) Day of week & hour of the day from timestamp
    timestamp_str = post.get("timestamp", "")
    if timestamp_str:
        dt = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
        features["day_of_week"] = dt.weekday()  # Monday=0, Sunday=6
        features["hour_of_day"] = dt.hour
    else:
        features["day_of_week"] = -1
        features["hour_of_day"] = -1

    # 6) average likes for the user
    features["user_avg_likes"] = user_avg_likes

    return features

In [55]:
def compute_user_average_likes(username2posts):
    user_avg_likes = {}
    for username, posts in username2posts.items():
        total_likes = 0
        num_posts = 0
        for post in posts:
            like_count = post.get("like_count", 0) or 0
            total_likes += like_count
            num_posts += 1
        avg_likes = total_likes / num_posts if num_posts > 0 else 0
        user_avg_likes[username] = avg_likes
    return user_avg_likes

user_train_avg_likes = compute_user_average_likes(username2posts_train)

train_rows = []
train_targets = []

for uname, posts in username2posts_train.items():
    avg_likes_for_user = user_train_avg_likes.get(uname, 0)

    for post in posts:
        features = extract_post_features(post, avg_likes_for_user)
        true_likes = post.get("like_count", 0) or 0

        train_rows.append(features)
        train_targets.append(true_likes)

train_df = pd.DataFrame(train_rows)
train_y = np.array(train_targets)

print("Training features shape:", train_df.shape)
print("Training target size:", train_y.shape)
train_df.head()

Training features shape: (89690, 8)
Training target size: (89690,)


Unnamed: 0,caption_length,num_hashtags,comments_count,is_video,is_image,day_of_week,hour_of_day,user_avg_likes
0,41,0,0,0,1,6,9,11.848485
1,76,2,1,1,0,1,19,11.848485
2,30,1,0,1,0,0,21,11.848485
3,58,1,1,1,0,0,21,11.848485
4,39,2,0,1,0,0,21,11.848485


In [56]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_df, train_y, test_size=0.2, random_state=42)

In [57]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=42
)

model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

mse_val = mean_squared_error(y_val, y_val_pred)
print("Validation MSE:", mse_val)

log_mse_val = log_mse_like_counts(y_val, y_val_pred)
print("Validation Log MSE:", log_mse_val)

Validation MSE: 817244974.8120159
Validation Log MSE: 0.9167332623255651


In [58]:
def calculate_overall_average_like_count(username2posts):
    total_likes = 0
    total_posts = 0
    for posts in username2posts.values():
        for post in posts:
            like_count = post.get("like_count", 0) or 0
            total_likes += like_count
            total_posts += 1
    return total_likes / total_posts if total_posts > 0 else 0


# Calculate root of the overall average like count from training data
overall_avg_likes =  calculate_overall_average_like_count(username2posts_train)
print(f"Overall average like count: {overall_avg_likes}")

Overall average like count: 5085.267822499722


In [59]:
import json
from tqdm import tqdm

test_file_path = "/content/test-regression-round3.jsonl"
output_file_path = "/content/output_test-regression-round3.json"

def predict_like_count_rf(model, post):
    user_avg_likes = user_train_avg_likes.get(post.get("username"), overall_avg_likes)
    features = extract_post_features(post, user_avg_likes)
    features_df = pd.DataFrame([features])
    return model.predict(features_df)[0]

id_like_count_map = {}

with open(test_file_path, "rt") as test_file:
    total_lines = sum(1 for _ in test_file)

with open(test_file_path, "rt") as test_file:
    for line in tqdm(test_file, total=total_lines, desc="Processing test posts"):
        sample = json.loads(line)
        post_id = sample.get("id")
        pred_val = predict_like_count_rf(model, sample)

        if post_id is not None:
            id_like_count_map[post_id] = int(pred_val)

with open(output_file_path, "wt") as output_file:
    json.dump(id_like_count_map, output_file, indent=4)

print(f"Output saved to {output_file_path}")

Processing test posts: 100%|██████████| 3000/3000 [00:30<00:00, 98.25it/s]

Output saved to /content/output_test-regression-round3.json



