In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be
# read and written by the cells below (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Cell [2]

# Ensure Hugging Face transformers, sentencepiece and xgboost are installed.
# %pip (rather than !pip) installs into the environment of the running kernel;
# the explanation lives on its own line so it is never passed along to pip.
%pip install -q transformers sentencepiece xgboost

import json
import gzip
import os

import pandas as pd
import numpy as np

# For advanced NLP
import torch
from transformers import AutoTokenizer, AutoModel

# For regression
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Pick the compute device: use CUDA when PyTorch reports one, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cpu


In [None]:
# Cell [3]

# Project folder on the mounted Google Drive that holds the dataset files.
BASE_DIR = "/content/drive/MyDrive/412_project_round2"
FILENAME = "training-dataset.jsonl.gz"  # gzipped JSONL; NOTE(review): the earlier "or ..." hint repeated this exact name, presumably "training-dataset.jsonl" was meant

# Full path of the training file that is passed to read_jsonl.
FILE_PATH = os.path.join(BASE_DIR, FILENAME)

def read_jsonl(file_path):
    """
    Load a JSON Lines file into a list, one parsed object per non-blank line.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the file to read. A name ending in ".gz" is opened with
        gzip; anything else is opened as plain text. Both are decoded as UTF-8.

    Returns
    -------
    list
        The decoded JSON values in file order. Blank / whitespace-only lines
        are skipped.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    OSError
        If the file cannot be opened.
    """
    # Accept pathlib.Path (and other path-like objects) as well as plain strings.
    path = os.fspath(file_path)

    # Text mode makes gzip.open and the built-in open behave the same way
    # (both yield decoded str lines), so one parsing loop serves both formats.
    opener = gzip.open if path.endswith(".gz") else open

    data_list = []
    with opener(path, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                data_list.append(json.loads(line))
    return data_list

# Read data: load every record of the training file into memory as a list.
data_entries = read_jsonl(FILE_PATH)
print("Number of entries:", len(data_entries))

# Peek first item to inspect the record layout.
# NOTE(review): this prints one complete raw record (username, biography, links, ...)
# into the notebook output; consider clearing that output before sharing.
print("First item sample:", data_entries[0])

Number of entries: 5415
First item sample: {'profile': {'username': 'deparmedya', 'id': '3170700063', 'full_name': 'Depar Medya', 'biography': '#mediaplanning #mediabuying #sosyalmedya', 'category_name': 'Local business', 'post_count': None, 'follower_count': 1167, 'following_count': 192, 'is_business_account': True, 'is_private': False, 'is_verified': False, 'highlight_reel_count': 6, 'bio_links': '"[{\'title\': \'\', \'lynx_url\': \'https://l.instagram.com/?u=http%3A%2F%2Fwww.deparmedya.com%2F&e=AT31HasgARgPeShR04-FuJmNiVuIQs9QexXBXhEPSCA0mp7gmyAofqi8YDvLKAqZ0zRdO4QFsllkZeR9cLqkyC747QgTHmoA\', \'url\': \'http://www.deparmedya.com/\', \'link_type\': \'external\'}]"', 'entities': '#mediaplanning #mediabuying #sosyalmedya', 'ai_agent_type': None, 'fb_profile_biolink': None, 'restricted_by_viewer': None, 'country_block': False, 'eimu_id': '110133017051179', 'external_url': 'http://www.deparmedya.com/', 'fbid': '17841403241896337', 'has_clips': True, 'hide_like_and_view_counts': False, 'i

In [None]:
# Cell [4]

import pandas as pd
import numpy as np

def _numeric_or_zero(value):
    """Return ``value`` unchanged when it is an int/float, otherwise 0."""
    return value if isinstance(value, (int, float)) else 0


# Column layout of the per-profile frame (one row per account).
text_cols = ["username", "biography", "category_name", "entities"]
numeric_cols = ["post_count", "follower_count", "sum_like_count", "sum_comments_count"]

rows = []
for item in data_entries:
    # dict.get(key, default) only falls back when the key is ABSENT; a key that
    # is present with a JSON null comes back as None. Coalesce with `or` so a
    # null profile / posts entry does not crash the loop.
    profile = item.get('profile') or {}
    posts   = item.get('posts') or []

    # Sum likes/comments across all posts; non-numeric values count as 0.
    total_likes    = sum(_numeric_or_zero(p.get("like_count", 0)) for p in posts)
    total_comments = sum(_numeric_or_zero(p.get("comments_count", 0)) for p in posts)

    rows.append({
        "username":        profile.get("username", ""),
        "biography":       profile.get("biography", ""),
        "category_name":   profile.get("category_name", ""),
        "entities":        profile.get("entities", ""),
        # Numeric from profile (may still be None here; coerced below).
        "post_count":      profile.get("post_count", 0),
        "follower_count":  profile.get("follower_count", 0),
        # aggregated from posts
        "sum_like_count":      total_likes,
        "sum_comments_count":  total_comments,
    })

# Passing the column list keeps the frame well-formed even if there are no rows.
df = pd.DataFrame(rows, columns=text_cols + numeric_cols)

# Convert numeric columns properly: anything unparsable or missing becomes 0.
# NOTE(review): the sample record shows 'post_count' as None, so this column may
# end up constant 0 — confirm it carries any signal.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# Fill missing text with empty string (text columns only; numerics are done).
df[text_cols] = df[text_cols].fillna("")

print("DataFrame shape:", df.shape)
df.head(3)

DataFrame shape: (5415, 8)


Unnamed: 0,username,biography,category_name,entities,post_count,follower_count,sum_like_count,sum_comments_count
0,deparmedya,#mediaplanning #mediabuying #sosyalmedya,Local business,#mediaplanning #mediabuying #sosyalmedya,0.0,1167,404.0,12
1,beyazyakaliyiz,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,Personal blog,Beyaz yakalıların dünyasına hoşgeldiniz 😀😀😀,0.0,1265,1126.0,33
2,kafesfirin,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,Brand,📍Söğütözü📍FTZ AVM\n🛒Ankara macro▲center v...,0.0,11997,1103.0,13


In [None]:
# Cell [5]

import torch
from transformers import AutoTokenizer, AutoModel

# If you haven't installed earlier, uncomment:
# !pip install transformers sentencepiece

# Re-derive the device here so this cell does not depend on the setup cell's value.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model_name = "dbmdz/bert-base-turkish-cased"  # or another Turkish model

print("Loading tokenizer/model:", model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the encoder, move it to the chosen device and switch it to eval mode.
bert_model = AutoModel.from_pretrained(model_name)
bert_model = bert_model.to(device).eval()

def get_bert_embedding(text):
    """
    Turn a piece of text into one fixed-size vector with the loaded BERT model.

    The text is tokenized (truncated / padded to 128 tokens), run through
    ``bert_model`` without gradient tracking, and the hidden state at
    position 0 (the [CLS] token) is returned as a NumPy array on the CPU,
    with size-1 dimensions squeezed away.
    """
    # NOTE(review): padding a single sequence out to max_length looks like
    # avoidable compute — confirm before changing, it is kept as-is here.
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        # First element of the model output is the last hidden state,
        # indexed as [batch, seq_len, hidden_dim].
        hidden_states = bert_model(ids, attention_mask=mask)[0]

    # Keep only the vector at sequence position 0 ([CLS]).
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.squeeze().cpu().numpy()

Loading tokenizer/model: dbmdz/bert-base-turkish-cased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [None]:
# Cell [6]

# One input string per profile: biography, category and entities joined by spaces.
text_list = [
    str(bio) + " " + str(category) + " " + str(ents)
    for bio, category, ents in zip(df['biography'], df['category_name'], df['entities'])
]

# Embed every string one at a time, reporting progress every 500 items.
all_embeddings = []
total_texts = len(text_list)
for done, text in enumerate(text_list, start=1):
    all_embeddings.append(get_bert_embedding(text))
    if done % 500 == 0:
        print(f"Processed {done} / {total_texts}")

# Stack the per-text vectors into one matrix with a row per profile.
text_embeddings = np.vstack(all_embeddings)
print("Embeddings shape:", text_embeddings.shape)

Processed 500 / 5415
Processed 1000 / 5415
Processed 1500 / 5415
Processed 2000 / 5415
Processed 2500 / 5415
Processed 3000 / 5415
Processed 3500 / 5415
Processed 4000 / 5415
Processed 4500 / 5415
Processed 5000 / 5415
Embeddings shape: (5415, 768)


In [None]:
# Cell [7] (Modified for Task 2: Predict sum_like_count in log10 scale)

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Log-transform the count features to reduce skew: each raw column gets a
# "log_"-prefixed twin holding log1p of its values.
raw_count_cols = ["follower_count", "post_count", "sum_comments_count"]
numeric_cols_log = ["log_" + raw_name for raw_name in raw_count_cols]
for raw_name, log_name in zip(raw_count_cols, numeric_cols_log):
    df[log_name] = np.log1p(df[raw_name])

# Model input X: the three log-count columns followed by the BERT text
# embeddings built in Cell [6].
X_numeric_log = df[numeric_cols_log].values
X_full_log = np.hstack([X_numeric_log, text_embeddings])

# Task 2 target is sum_like_count, scored on a log10 scale:
# y_log10 = log10(sum_like_count + 1).
y_log10 = np.log10(df['sum_like_count'] + 1)

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train_log10, y_val_log10 = train_test_split(
    X_full_log,
    y_log10,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted regressor fitted directly on the log10 target.
xgb_settings = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "random_state": 42,
}
model_xgb = xgb.XGBRegressor(**xgb_settings)
model_xgb.fit(X_train, y_train_log10)

# Validation predictions, still in log10 space.
y_pred_log10 = model_xgb.predict(X_val)

In [None]:
# Cell [8] (Modified to measure MSE in log10 space)

from sklearn.metrics import mean_squared_error, r2_score

# 1) Validation metrics, all computed in log10 space.
mse_log10 = mean_squared_error(y_val_log10, y_pred_log10)
rmse_log10 = np.sqrt(mse_log10)
# 2) R^2 in the same log10 space.
r2_log10 = r2_score(y_val_log10, y_pred_log10)

print("XGBoost - MSE (log10 scale):", mse_log10)
print("XGBoost - RMSE (log10 scale):", rmse_log10)
print("R^2 Score (log10 scale):", r2_log10)

# 3) Side-by-side look at the first few validation rows, both in log10 space
#    and mapped back to like counts via 10^(value) - 1.
val_count = 10  # number of rows to show
compare_data = [
    {
        "actual_log10":     actual,
        "predicted_log10":  predicted,
        "actual_likecount": (10 ** actual) - 1,
        "pred_likecount":   (10 ** predicted) - 1,
    }
    for actual, predicted in (
        (y_val_log10.iloc[pos], y_pred_log10[pos]) for pos in range(val_count)
    )
]

compare_df = pd.DataFrame(compare_data)
print(compare_df)

XGBoost - MSE (log10 scale): 0.2280680956928837
XGBoost - RMSE (log10 scale): 0.4775647554969731
R^2 Score (log10 scale): 0.7962530085856991
   actual_log10  predicted_log10  actual_likecount  pred_likecount
0      2.939020         2.933951             868.0      857.916883
1      3.964024         4.046859            9204.0    11138.334985
2      3.115943         3.057752            1305.0     1141.226866
3      6.196782         5.638585         1573192.0   435094.522240
4      3.792532         3.641668            6201.0     4380.951884
5      5.098637         5.315547          125497.0   206797.311977
6      2.380211         2.929917             239.0      849.976046
7      2.419956         2.281993             262.0      190.422468
8      2.505150         2.870869             319.0      741.795729
9      3.271609         3.487288            1868.0     3070.059542


In [None]:
# Cell [10]: Regression Prediction & JSON Output

import json
import gzip
import os

# Input and output locations for the regression round, both inside BASE_DIR:
# the test file is expected to be "test-regression-round3.jsonl" (plain JSONL),
# and the predictions are written next to it as a JSON file.
TEST_REGRESSION_FILE = os.path.join(BASE_DIR, "test-regression-round3.jsonl")
REGRESSION_OUTPUT_FILE = os.path.join(BASE_DIR, "prediction-regression-round3.json")

# 1) Read the test set from .jsonl
def read_jsonl_normal(file_path):
    """
    Parse an uncompressed, UTF-8 JSON Lines file.

    Every line is stripped of surrounding whitespace; lines that end up empty
    are ignored and the rest are decoded with ``json.loads``. Returns the
    decoded objects as a list in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (raw_line.strip() for raw_line in f)
        return [json.loads(text) for text in stripped_lines if text]

test_entries = read_jsonl_normal(TEST_REGRESSION_FILE)
print("Number of regression test entries:", len(test_entries))

# 2) Build the input rows for the XGB model, mirroring the training layout:
#    a text field for the BERT embedding plus three numeric features.
#    The test file has per-post fields ("caption", "comments_count", "id", ...)
#    and no "follower_count" / "post_count" / "sum_comments_count", so
#    "comments_count" stands in for sum_comments_count and the other two are
#    placeholders set to 0.
# NOTE(review): model_xgb was fitted on per-profile rows whose target is the sum
# of likes over all of a profile's posts, while every row here is a single post
# with follower/post counts forced to 0 — predictions may be systematically off;
# confirm this mapping is intended.

def _to_float_or_zero(value):
    """Convert ``value`` to float; None or unparsable values become 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


test_rows = []
for item in test_entries:
    # .get's default only applies when the key is missing; a caption that is
    # present but null would otherwise be embedded as the literal text "None".
    caption = item.get("caption")

    test_rows.append({
        "post_id": item.get("id", ""),
        "text_for_bert": "" if caption is None else str(caption),
        # placeholders for alignment
        "follower_count": 0.0,
        "post_count": 0.0,
        # A null / non-numeric count must not abort the whole run.
        "sum_comments_count": _to_float_or_zero(item.get("comments_count", 0)),
    })

test_df = pd.DataFrame(test_rows)
print("Test DataFrame shape:", test_df.shape)

# 3) Apply the same log1p transforms as in training, producing
#    log_follower_count, log_post_count and log_sum_comments_count.
for raw_name in ("follower_count", "post_count", "sum_comments_count"):
    test_df["log_" + raw_name] = np.log1p(test_df[raw_name])

# 4) BERT embedding for every test text, with a progress line every 100 rows.
test_embeddings = []
total_rows = len(test_df)
for done, text_input in enumerate(test_df['text_for_bert'], start=1):
    test_embeddings.append(get_bert_embedding(text_input))
    if done % 100 == 0:
        print(f"Processed embeddings for {done} / {total_rows}")

# Stack the per-row vectors into one matrix (one row per test entry).
test_embeddings = np.vstack(test_embeddings)

# 5) Final feature matrix: log-count columns first, then the embeddings.
numeric_cols_log = ["log_follower_count", "log_post_count", "log_sum_comments_count"]
X_test_numeric_log = test_df[numeric_cols_log].values
X_test_full = np.hstack([X_test_numeric_log, test_embeddings])

print("Test X shape:", X_test_full.shape)

# 6) Predict with the trained XGB model (model_xgb). Its output is in log10
#    space, so map it back with 10^(pred) - 1 and clip at zero so no
#    prediction is negative.
y_test_pred_log10 = model_xgb.predict(X_test_full)
y_test_pred_original = np.maximum((10 ** y_test_pred_log10) - 1, 0.0)

# 7) JSON payload: { post_id (as string) : predicted like count (rounded int) }
pred_dict = {
    str(pid): int(round(pred))
    for pid, pred in zip(test_df['post_id'], y_test_pred_original)
}

# 8) Write the predictions to disk as indented UTF-8 JSON.
with open(REGRESSION_OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(pred_dict, f, ensure_ascii=False, indent=2)

print(f"Regression output saved to: {REGRESSION_OUTPUT_FILE}")

Number of regression test entries: 3000
Test DataFrame shape: (3000, 5)
Processed embeddings for 100 / 3000
Processed embeddings for 200 / 3000
Processed embeddings for 300 / 3000
Processed embeddings for 400 / 3000
Processed embeddings for 500 / 3000
Processed embeddings for 600 / 3000
Processed embeddings for 700 / 3000
Processed embeddings for 800 / 3000
Processed embeddings for 900 / 3000
Processed embeddings for 1000 / 3000
Processed embeddings for 1100 / 3000
Processed embeddings for 1200 / 3000
Processed embeddings for 1300 / 3000
Processed embeddings for 1400 / 3000
Processed embeddings for 1500 / 3000
Processed embeddings for 1600 / 3000
Processed embeddings for 1700 / 3000
Processed embeddings for 1800 / 3000
Processed embeddings for 1900 / 3000
Processed embeddings for 2000 / 3000
Processed embeddings for 2100 / 3000
Processed embeddings for 2200 / 3000
Processed embeddings for 2300 / 3000
Processed embeddings for 2400 / 3000
Processed embeddings for 2500 / 3000
Processed em

In [None]:
import json
# Sanity-check a saved prediction file by loading it back from Drive.
# (The classification output, prediction-classification-round2.json in the same
# folder, can be inspected the same way.)
# NOTE(review): this reads the *round2* regression file, while the cell above
# writes prediction-regression-round3.json — confirm which file is meant.
with open("/content/drive/MyDrive/412_project_round2/prediction-regression-round2.json", "r", encoding="utf-8") as f:
    regression_results = json.load(f)

# Show the size and a short preview instead of dumping every prediction into
# the notebook output.
print("Number of predictions:", len(regression_results))
print("First entries:", dict(list(regression_results.items())[:10]))

{'17903451397703117': 197, '17896404506845900': 1248, '17853971531941549': 1325, '18362044393058713': 2259, '17999365834969022': 876, '18020868037872253': 214, '18013287148601185': 127, '18019742449567154': 635, '18116761582332028': 641, '18006379492926193': 286, '17998975151218109': 122, '18025264618764141': 399, '17906977549563245': 138, '18013521889517401': 231, '17954387045050844': 488, '17953476020069513': 1289, '18030225580657632': 340, '17884877039682929': 136, '17979533717351528': 197, '17975764259290937': 140, '17881555487485739': 1864, '17985820520306917': 819, '18036121630589876': 180, '17877635093918025': 151, '17997401021123841': 462, '17984682878229994': 1728, '18112176310316293': 160, '18096068665372342': 212, '17958706499341475': 528, '18220807072247882': 177, '17907682292811483': 2228, '17967366122058393': 1143, '18274091311094151': 149, '18001100212690547': 805, '17854358829013795': 144, '18244254190214896': 172, '17968601219466127': 234, '17980627964589025': 399, '17