In [1]:
import numpy as np
import pandas as pd
import gzip
import json

from pprint import pprint
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
import optuna
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to /Users/eren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# 1) Read train_classification.csv more carefully
train_classification_df = pd.read_csv("train-classification.csv")

# If "Unnamed: 0" is your user_id (and there's no user_id column yet), rename it
if "Unnamed: 0" in train_classification_df.columns and "user_id" not in train_classification_df.columns:
    train_classification_df.rename(columns={"Unnamed: 0": "user_id"}, inplace=True)
    
# Rename "label" to "category"
train_classification_df.rename(columns={"label": "category"}, inplace=True)

# 2) Read & unify the 3 annotation files in a loop
annotation_files = [
    "annotated_users_CS412-70f170887437.csv",
    "annotated_users_CS412-1210cecaf90d.csv",
    "annotated_users_CS412-2fa73b22df12.csv",
]

all_annotated = []
for ann_file in annotation_files:
    df_temp = pd.read_csv(ann_file)
    
    # Rename "Unnamed: 0" to "user_id" if needed
    if "Unnamed: 0" in df_temp.columns and "user_id" not in df_temp.columns:
        df_temp.rename(columns={"Unnamed: 0": "user_id"}, inplace=True)
    
    # Rename influencerCategory to category
    if "influencerCategory" in df_temp.columns:
        df_temp.rename(columns={"influencerCategory": "category"}, inplace=True)
    
    # Drop rows missing a category
    df_temp.dropna(subset=["category"], inplace=True)
    
    # Lowercase the category
    df_temp["category"] = df_temp["category"].str.lower()
    
    # Drop any unwanted columns that might exist
    for col in ["url", "influencerMention", "accountType"]:
        if col in df_temp.columns:
            df_temp.drop(columns=col, inplace=True)
    
    df_temp.reset_index(drop=True, inplace=True)
    all_annotated.append(df_temp)

annotated_users_df = pd.concat(all_annotated, ignore_index=True)

# 3) Combine with your train_classification_df
train_classification_df = pd.concat([train_classification_df, annotated_users_df],
                                    ignore_index=True)

# 4) Unify labels (make sure everything is lowercase, etc.)
train_classification_df["category"] = train_classification_df["category"].str.lower()

# 5) Remove duplicate user_ids if they appear multiple times
#    keep="last" keeps the final row of any duplicates
train_classification_df.drop_duplicates(subset="user_id", keep="last", inplace=True)
train_classification_df.reset_index(drop=True, inplace=True)

# 6) Create your mapping of user_id → category
username2_category = train_classification_df.set_index("user_id")["category"].to_dict()

In [3]:
train_data_path = "training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile

In [4]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

In [5]:
print(train_profile_df)

                   username           id                       full_name  \
0                deparmedya   3170700063                     Depar Medya   
1                kafesfirin    266439571                     KAFES FIRIN   
2                  vimerang   2367195567                        Vimerang   
3         mustafa_yalcinn38   9606564254                 Mustafa Yalçın   
4      zorluenergysolutions   8155780357    ZES (Zorlu Energy Solutions)   
...                     ...          ...                             ...   
2824    tatlidunyasidergisi   5726272058          Tatlı Dünyası Dergisi   
2825  yerindengelsin.com.tr   4755951986  YerindenGelsin Gurme Şarküteri   
2826       woodtechistanbul  39207633113                        WoodTech   
2827               netsvadi   2219868064                       Nets Vadi   
2828          herbisiatolye   3282895910   HERBİŞİ BİTKİ ATÖLYESİ   

                                              biography      category_name  \
0        

In [6]:
print(train_profile_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2829 entries, 0 to 2828
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   username                           2829 non-null   object
 1   id                                 2829 non-null   object
 2   full_name                          2801 non-null   object
 3   biography                          2653 non-null   object
 4   category_name                      2302 non-null   object
 5   post_count                         282 non-null    object
 6   follower_count                     2829 non-null   object
 7   following_count                    2829 non-null   object
 8   is_business_account                2829 non-null   object
 9   is_private                         2829 non-null   object
 10  is_verified                        2829 non-null   object
 11  highlight_reel_count               2829 non-null   object
 12  bio_li

In [7]:
print(username2posts_train['deparmedya'])

[{'caption': 'Cumhuriyetimizin 100.yılı kutlu olsun♾️🇹🇷', 'comments_count': 0, 'id': '17990918969458720', 'like_count': 6, 'media_type': 'IMAGE', 'media_url': 'https://scontent-sof1-2.cdninstagram.com/v/t51.29350-15/396342908_267936919574308_4264417069827989599_n.jpg?_nc_cat=107&ccb=1-7&_nc_sid=c4dd86&_nc_ohc=IynXuQSoOT8AX9RSy20&_nc_ht=scontent-sof1-2.cdninstagram.com&edm=AL-3X8kEAAAA&oh=00_AfA8OKAM0MY9tqg6dw8C8I5TJp4SHPBp-VlNXrFAh2agqg&oe=6563581C', 'timestamp': '2023-10-29 09:12:30'}, {'caption': 'Oriflame Duologi Lansmanı #isveçtengelengüzellik #oriflameilesaçbakımdevrimi', 'comments_count': 1, 'id': '18219250732221045', 'like_count': 22, 'media_type': 'VIDEO', 'media_url': 'https://scontent-sof1-2.cdninstagram.com/o1/v/t16/f1/m82/FB43CFCD94D54CB5DC585E1777FB28B8_video_dashinit.mp4?efg=eyJ2ZW5jb2RlX3RhZyI6InZ0c192b2RfdXJsZ2VuLmNsaXBzLnVua25vd24tQzMuNzIwLmRhc2hfYmFzZWxpbmVfMV92MSJ9&_nc_ht=scontent-sof1-2.cdninstagram.com&_nc_cat=111&vs=3488236348085620_3611354976&_nc_vs=HBksFQIYT2lnX

In [8]:
import pandas as pd
import numpy as np

# Drop columns that are entirely NaN
train_profile_df.dropna(axis=1, how='all', inplace=True)

def try_convert_boolean(series: pd.Series) -> pd.Series:
    """
    Attempt to convert a string/object series to a boolean (True/False).
    We treat strings like 'true', 'false', '1', '0' (any case) as booleans.
    If conversion is not perfect, return the original series.
    """
    # Define acceptable string representations
    bool_map = {
        'true': True, 'false': False,
        '1': True, '0': False
    }
    non_null = series.dropna()

    # Check if all non-null values are in {true, false, 1, 0}
    unique_vals = set(str(x).strip().lower() for x in non_null)
    if unique_vals.issubset(bool_map.keys()):
        # Convert via map
        return series.apply(
            lambda x: bool_map[str(x).strip().lower()] if pd.notna(x) else np.nan
        ).astype('boolean')  # pandas boolean dtype (can hold NA)
    else:
        return series  # leave unchanged

def try_convert_numeric(series: pd.Series) -> pd.Series:
    """
    Attempt to convert a string/object series to numeric (float).
    If everything becomes NaN, revert to original. Otherwise keep numeric.
    """
    converted = pd.to_numeric(series, errors='coerce')
    # If *all* non-null entries became NaN, revert
    if converted.notna().sum() == 0:
        return series
    return converted

# Go column by column
for col in train_profile_df.columns:
    # First, try boolean
    temp_bool = try_convert_boolean(train_profile_df[col])
    if temp_bool.dtype == 'boolean':
        train_profile_df[col] = temp_bool
        continue  # done with this column

    # If not boolean, try numeric
    temp_numeric = try_convert_numeric(train_profile_df[col])
    if pd.api.types.is_numeric_dtype(temp_numeric):
        train_profile_df[col] = temp_numeric
    else:
        # remains object/string if numeric conversion fails
        train_profile_df[col] = train_profile_df[col].astype('object')
        
# At this point:
# - Some columns may be bool (dtype='boolean')
# - Some columns may be float64/int64
# - Some columns may remain object
#
# Next we can refine int vs float:
# If a column is float but all values are integers, convert to int
for col in train_profile_df.select_dtypes(include=['float']):
    # Check if all float values are integral
    no_na = train_profile_df[col].dropna()
    if (no_na == no_na.round()).all():
        train_profile_df[col] = train_profile_df[col].astype('Int64')  # nullable integer

# Now we might have dtypes = boolean / Int64 / float / object

In [9]:
# Impute missing values:
# floats → mean
# ints   → mode
# bool   → mode (True/False)
# object → 'unknown'

# 2a) float columns
float_cols = train_profile_df.select_dtypes(include=['float64'])
for c in float_cols:
    mean_val = float_cols[c].mean()
    train_profile_df[c] = train_profile_df[c].fillna(mean_val)

# 2b) int columns (nullable Int64)
int_cols = train_profile_df.select_dtypes(include=['Int64', 'int64'])
for c in int_cols:
    mode_val = train_profile_df[c].mode(dropna=True)
    if len(mode_val) > 0:
        mode_val = mode_val.iloc[0]
        train_profile_df[c] = train_profile_df[c].fillna(mode_val)
    else:
        # if there's literally no non-null values, fallback to 0 or something
        train_profile_df[c] = train_profile_df[c].fillna(0)

# 2c) boolean columns (pandas 'boolean' dtype)
bool_cols = train_profile_df.select_dtypes(include=['boolean'])
for c in bool_cols:
    mode_val = train_profile_df[c].mode(dropna=True)
    if len(mode_val) > 0:
        mode_val = mode_val.iloc[0]
        train_profile_df[c] = train_profile_df[c].fillna(mode_val)
    else:
        # fallback if all NaN
        train_profile_df[c] = train_profile_df[c].fillna(False)

# 2d) object (string) columns
obj_cols = train_profile_df.select_dtypes(include=['object'])
for c in obj_cols:
    train_profile_df[c] = train_profile_df[c].fillna("unknown")

# Done with basic cleaning & type conversion
print(train_profile_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2829 entries, 0 to 2828
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   username                           2829 non-null   object 
 1   id                                 2829 non-null   int64  
 2   full_name                          2829 non-null   object 
 3   biography                          2829 non-null   object 
 4   category_name                      2829 non-null   object 
 5   post_count                         2829 non-null   Int64  
 6   follower_count                     2829 non-null   int64  
 7   following_count                    2829 non-null   int64  
 8   is_business_account                2829 non-null   boolean
 9   is_private                         2829 non-null   boolean
 10  is_verified                        2829 non-null   boolean
 11  highlight_reel_count               2829 non-null   int64

In [10]:
import pandas as pd
import numpy as np
import json
import gzip

#############################################
# 1) Build a training dict (instead of a DataFrame)
#############################################

train_dict = {}

def get_avg_like_count_excluding(posts, exclude_post_id):
    total_likes = 0.0
    count = 0
    for p in posts:
        if p["id"] == exclude_post_id:
            continue
        like_count = p.get("like_count", 0) or 0
        total_likes += like_count
        count += 1
    return total_likes / count if count > 0 else 0.0

for username, posts_list in username2posts_train.items():
    # Retrieve the user-level info from train_profile_df
    row_df = train_profile_df[train_profile_df["username"] == username]
    
    if len(row_df) == 1:
        follower_count = row_df["follower_count"].iloc[0]
        is_business_account = row_df["is_business_account"].iloc[0]
        is_verified = row_df["is_verified"].iloc[0]
    else:
        # fallback if missing or multiple
        follower_count = 0
        is_business_account = False
        is_verified = False

    # We'll build up a list of post dicts for this username
    user_posts_for_training = []
    
    for post in posts_list:
        post_id = post.get("id", "")
        comments_count = post.get("comments_count", 0) or 0
        media_type = post.get("media_type", "IMAGE")
        caption = post.get("caption", "") or ""
        like_count = post.get("like_count", 0)  # target

        avg_like_excl = get_avg_like_count_excluding(posts_list, post_id)

        # Additional example features
        caption_length = len(caption)
        is_video = 1 if media_type == "VIDEO" else 0

        # Collect post-level + user-level features + target
        post_info = {
            "post_id": post_id,
            "comments_count": comments_count,
            "follower_count": float(follower_count),
            "is_business_account": 1 if is_business_account else 0,
            "is_verified": 1 if is_verified else 0,
            "avg_like_excl": avg_like_excl,
            "caption_length": caption_length,
            "is_video": is_video,
            "like_count": like_count  # The target
        }
        user_posts_for_training.append(post_info)
    
    # Finally store under train_dict
    train_dict[username] = user_posts_for_training

# Now we have a dictionary that is conceptually a "train dataset".
print(f"Number of users in train_dict: {len(train_dict)}")

# For demonstration, let's pick one username and print their data:
some_username = next(iter(train_dict.keys()))
print(f"Example: train_dict[{some_username}] =")
for post_obj in train_dict[some_username][:3]:  # first 3 posts
    print(post_obj)

Number of users in train_dict: 2829
Example: train_dict[deparmedya] =
{'post_id': '17990918969458720', 'comments_count': 0, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_like_excl': 11.705882352941176, 'caption_length': 41, 'is_video': 0, 'like_count': 6}
{'post_id': '18219250732221045', 'comments_count': 1, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_like_excl': 11.235294117647058, 'caption_length': 76, 'is_video': 1, 'like_count': 22}
{'post_id': '18311380465102328', 'comments_count': 0, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_like_excl': 11.323529411764707, 'caption_length': 30, 'is_video': 1, 'like_count': 19}


In [11]:
print(train_dict['deparmedya'])

[{'post_id': '17990918969458720', 'comments_count': 0, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_like_excl': 11.705882352941176, 'caption_length': 41, 'is_video': 0, 'like_count': 6}, {'post_id': '18219250732221045', 'comments_count': 1, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_like_excl': 11.235294117647058, 'caption_length': 76, 'is_video': 1, 'like_count': 22}, {'post_id': '18311380465102328', 'comments_count': 0, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_like_excl': 11.323529411764707, 'caption_length': 30, 'is_video': 1, 'like_count': 19}, {'post_id': '18089518138361507', 'comments_count': 1, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_like_excl': 11.323529411764707, 'caption_length': 58, 'is_video': 1, 'like_count': 19}, {'post_id': '18012743929758497', 'comments_count': 0, 'follower_count': 1167.0, 'is_business_account': 1, 'is_verified': 0, 'avg_lik

In [12]:
train_rows = []
for user, post_list in train_dict.items():
    train_rows.extend(post_list)
train_df = pd.DataFrame(train_rows)

In [13]:
train_df['like_count'] = train_df['like_count'].fillna(train_df['like_count'].mean())

In [14]:
print(train_df)

                 post_id  comments_count  follower_count  is_business_account  \
0      17990918969458720               0          1167.0                    1   
1      18219250732221045               1          1167.0                    1   
2      18311380465102328               0          1167.0                    1   
3      18089518138361507               1          1167.0                    1   
4      18012743929758497               0          1167.0                    1   
...                  ...             ...             ...                  ...   
97818  17929553231561932               1         11805.0                    1   
97819  18193569232231187               7         11805.0                    1   
97820  17956886519293484               3         11805.0                    1   
97821  17987965657627821               2         11805.0                    1   
97822  17970220879906281               2         11805.0                    1   

       is_verified  avg_lik

In [15]:
train_df["log_like_count"] = np.log1p(train_df["like_count"])  

In [16]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97823 entries, 0 to 97822
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   post_id              97823 non-null  object 
 1   comments_count       97823 non-null  int64  
 2   follower_count       97823 non-null  float64
 3   is_business_account  97823 non-null  int64  
 4   is_verified          97823 non-null  int64  
 5   avg_like_excl        97823 non-null  float64
 6   caption_length       97823 non-null  int64  
 7   is_video             97823 non-null  int64  
 8   like_count           97823 non-null  float64
 9   log_like_count       97823 non-null  float64
dtypes: float64(4), int64(5), object(1)
memory usage: 7.5+ MB


In [17]:
#############################################
# 2) Train-validation split
#############################################
X = train_df.drop(["like_count", "log_like_count", "post_id"], axis=1)
y_log = train_df["log_like_count"]

X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [18]:
#############################################
# 3) Fit the model
#############################################
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on validation

# Predict in log-space
y_val_pred_log = model.predict(X_val)

# Convert back to real counts
y_val_pred = np.expm1(y_val_pred_log)
y_val_true = np.expm1(y_val)  # or if you still have the original like_count
mse = mean_squared_error(y_val, y_val_pred_log)
rmse = np.sqrt(mse)
print(f"RMSE on validation = {rmse:.2f}")

RMSE on validation = 1.82


In [19]:
#############################################
# 4) Build test dataframe & predict
#############################################

test_path = "test-regression-round2.jsonl"  # or .gz, adapt your path
test_rows = []

with open(test_path, "rt") as f:  # if it's not gzipped, just open(...)
    for line in f:
        sample = json.loads(line)
        
        # Extract fields
        post_id = sample["id"]
        username = sample["username"]
        comments_count = sample.get("comments_count", 0)
        media_type = sample.get("media_type", "IMAGE")
        caption = sample.get("caption", "") or ""

        # Gather user-level data from train_profile_df or test_profile_df
        # If your test set of users is also in username2profile_test, for example:
        row_df = test_profile_df[test_profile_df["username"] == username]
        
        if len(row_df) == 1:
            follower_count = row_df["follower_count"].values[0]
            is_business_account = row_df["is_business_account"].values[0]
            is_verified = row_df["is_verified"].values[0]
        else:
            # fallback
            follower_count = 0
            is_business_account = False
            is_verified = False

        # Compute average likes from username2posts_train or username2posts_test
        # For a brand new post, we exclude the post in question from the average.
        # If user is found in train or test dictionary:
        
        # We can reuse your predict_like_count function, 
        # or do it inline here similarly:
        if username in username2posts_train:
            posts_list = username2posts_train[username]
        elif username in username2posts_test:
            posts_list = username2posts_test[username]
        else:
            posts_list = []
        
        def get_avg_like_count_excluding(posts, current_post_id):
            total_likes = 0.0
            count = 0
            for p in posts:
                if p["id"] == current_post_id:
                    continue
                like_count = p.get("like_count", 0) or 0
                total_likes += like_count
                count += 1
            return (total_likes / count) if count > 0 else 0.0
        
        avg_like_excl = get_avg_like_count_excluding(posts_list, post_id)

        # Same features as training
        test_rows.append({
            "post_id": post_id,
            "username": username,
            "comments_count": comments_count,
            "follower_count": float(follower_count),
            "is_business_account": 1 if is_business_account else 0,
            "is_verified": 1 if is_verified else 0,
            "avg_like_excl": avg_like_excl,
            "caption_length": len(caption),
            "is_video": 1 if media_type == "VIDEO" else 0
        })

test_df = pd.DataFrame(test_rows)

# We'll keep post_id for the final output, but drop it when feeding to the model
X_test = test_df.drop(["post_id", "username"], axis=1)

test_preds = model.predict(X_test)

In [20]:
# 1) Build a dictionary: { post_id: predicted_like_count, ... }
predictions_dict = {}
for i, row in test_df.iterrows():
    post_id = row["post_id"]
    predicted_like = test_preds[i]
    # Round or cast to int if desired:
    predicted_like_rounded = int(round(predicted_like))
    
    predictions_dict[post_id] = predicted_like_rounded

# 2) Dump that dictionary to a JSON file
output_path = "my_submission.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(predictions_dict, f, ensure_ascii=False, indent=2)

print(f"Saved predictions to {output_path}")

Saved predictions to my_submission.json
