In [None]:
import json
import numpy as np

# Function to predict like count
def predict_like_count(username, current_post=None):
    """Predict a like count as the mean like count of the user's posts.

    The user's posts are looked up in ``username2posts_train`` first, and in
    ``username2posts_test`` only when the user is absent from the former.

    Args:
        username: Key identifying the user in the post dictionaries.
        current_post: Optional post dict. Posts whose ``"id"`` equals
            ``current_post["id"]`` are left out of the average.

    Returns:
        The mean like count as a float (``0.0`` when no post remains after
        the exclusion), or ``-1`` when the user is in neither dictionary.
    """

    def get_avg_like_count(posts: list):
        # Drop every post that shares its id with the held-out post.
        kept = [
            entry
            for entry in posts
            if current_post is None or entry["id"] != current_post["id"]
        ]
        if not kept:
            return 0.0

        running_total = 0.0
        for entry in kept:
            likes = entry.get("like_count", 0)
            # A stored None counts as zero likes, same as a missing key.
            running_total += 0 if likes is None else likes
        return running_total / len(kept)

    # The second dictionary is consulted only if the first lookup misses.
    if username in username2posts_train:
        return get_avg_like_count(username2posts_train[username])
    if username in username2posts_test:
        return get_avg_like_count(username2posts_test[username])

    print(f"No data available for {username}")
    return -1


# Log MSE calculation function
def log_mse_like_counts(y_true, y_pred):
    """Return the mean squared error between log1p-transformed like counts.

    Args:
        y_true: Array-like of observed like counts.
        y_pred: Array-like of predicted like counts.

    Returns:
        The mean of ``(log1p(y_true) - log1p(y_pred)) ** 2``, with negative
        inputs on either side replaced by 0 before the transform.
    """

    def _clamped_log1p(values):
        # Negative entries are raised to 0 before taking log(1 + x).
        return np.log1p(np.maximum(np.array(values), 0))

    residual = _clamped_log1p(y_true) - _clamped_log1p(y_pred)
    return np.mean(np.square(residual))


# Train Dataset Evaluation
# Every training post is scored against a prediction made with that same
# post passed as current_post.
y_like_count_train_true, y_like_count_train_pred = [], []

for uname, posts in username2posts_train.items():
    for post in posts:
        y_like_count_train_pred.append(predict_like_count(uname, post))
        # A missing or falsy like_count is recorded as 0.
        y_like_count_train_true.append(post.get("like_count", 0) or 0)

train_log_mse = log_mse_like_counts(y_like_count_train_true, y_like_count_train_pred)
print(f"Log MSE Train: {train_log_mse}")


# Test Dataset Prediction
# Reads the test samples (one JSON object per line), predicts a like count
# for each sample's username, and writes an {id: predicted_like_count}
# mapping to output_path as JSON.
path = "prediction-regression-round2"
output_path = "/content/test-like-counts.json"

output_dict = {}  # Dictionary to store id: predicted_like_count pairs

# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default encoding.
with open(path, "rt", encoding="utf-8") as fh:
    for line in fh:
        # A blank line (e.g. a trailing newline at end of file) is not a JSON
        # document and would make json.loads raise.
        if not line.strip():
            continue
        sample = json.loads(line)

        # Predict like count
        pred_val = predict_like_count(sample["username"])

        # predict_like_count returns -1 for usernames it has no posts for; a
        # like count cannot be negative, so fall back to 0 in that case.
        output_dict[sample["id"]] = max(int(pred_val), 0)

# Write the output to a JSON file in the specified format
with open(output_path, "w", encoding="utf-8") as of:
    json.dump(output_dict, of, indent=4)