In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from tag_recommender.utils.text import split_tags

tqdm.pandas()

In [2]:
# read dataset
df = pd.read_csv(
    "../data/full_dataset.csv",
    usecols=["type", "lang", "is_reblog", "tags", "root_tags"],
)

# Split tags strings into tag arrays
df["root_tags"] = df["root_tags"].fillna("").progress_apply(split_tags)
df["tags"] = df["tags"].fillna("").progress_apply(split_tags)

# Count the number of tags in each row
df["root_tags_count"] = df["root_tags"].progress_apply(len)
df["tags_count"] = df["tags"].progress_apply(len)

# Fill NaN values in 'is_reblog' column with 0 to show that these posts are not re-blogged
df["is_reblog"] = df["is_reblog"].fillna(0)

# Convert 'lang' column to 'en' and 'other'
df["lang_type"] = df.lang.apply(lambda s: "en" if s == "en_US" else "other")
# Convert 'type' column to 'photo' and 'other'
df["type_bucket"] = df["type"].apply(lambda s: s if s == "photo" else "other")

df.head()

100%|██████████| 1764542/1764542 [00:02<00:00, 652337.87it/s]
100%|██████████| 1764542/1764542 [00:02<00:00, 739086.31it/s] 
100%|██████████| 1764542/1764542 [00:00<00:00, 2778285.81it/s]
100%|██████████| 1764542/1764542 [00:00<00:00, 2779466.93it/s]


Unnamed: 0,type,lang,is_reblog,tags,root_tags,root_tags_count,tags_count,lang_type,type_bucket
0,photo,en_US,1.0,"[me too sadie, sadie sink, stranger things cast]","[sadie sink, sadiesinkedit, femaledaily, daily...",15,3,en,photo
1,regular,en_US,0.0,"[it speaks, r: the end of nihility, r: on a ga...",[],0,3,en,other
2,video,en_US,0.0,[Youtube],[],0,1,en,other
3,photo,en_US,1.0,"[alchemy of souls, i miss them already]","[alchemy of souls, alchemy of souls: light and...",26,2,en,photo
4,photo,en_US,1.0,[tokyo revengers],"[manila mikey, manjiro sano, tokyo manji gang,...",21,1,en,photo


In [3]:
# Remove rows with no tags
df = df[(df["root_tags_count"] > 0) & (df["tags_count"] > 0)].reset_index(drop=True)

In [4]:
hashtags_counts = (
    pd.concat([df["root_tags"], df["tags"]]).explode().dropna().value_counts()
)
hashtags_counts

art                                                               94867
my art                                                            71525
fanart                                                            47753
*                                                                 32523
mine                                                              29928
                                                                  ...  
i have a huge crush on this man                                       1
disneyfever                                                           1
linmanuelmirandaedit                                                  1
i love the idea of fairies and garden sprites getting along <3        1
the way his expression changes😂                                       1
Name: count, Length: 1717221, dtype: int64

In [5]:
# Around 85% of the hashtags appear up to 3 times.
(hashtags_counts <= 3).sum() / len(hashtags_counts)

0.8421164194940547

In [6]:
df["root_tags_popularity"] = df["root_tags"].progress_apply(
    lambda x: sum([hashtags_counts.get(tag, 0) for tag in x])
)
df["tags_popularity"] = df["tags"].progress_apply(
    lambda x: sum([hashtags_counts.get(tag, 0) for tag in x])
)

df["total_popularity"] = df["root_tags_popularity"] + df["tags_popularity"]

100%|██████████| 1166609/1166609 [00:11<00:00, 100285.61it/s]
100%|██████████| 1166609/1166609 [00:04<00:00, 247472.16it/s]


In [7]:
df.sample(20)

Unnamed: 0,type,lang,is_reblog,tags,root_tags,root_tags_count,tags_count,lang_type,type_bucket,root_tags_popularity,tags_popularity,total_popularity
590953,photo,en_US,1.0,"[black sails, babygirl both might be true]","[black sails crack, black sails, captain james...",10,2,en,photo,53697,1190,54887
1096729,regular,en_US,1.0,"[gods menu, or levanter, stray kids]","[stray kids, skz]",2,3,en,other,6918,4341,11259
1000951,photo,en_US,1.0,"[he has so much transgender swag, all elite wr...","[aew, all elite wrestling, aew dark elevation,...",5,4,en,photo,2063,1990,4053
573593,photoset,en_US,1.0,[knitting],"[art, craft, sculpture, glass, knitting]",5,1,en,other,97556,833,98389
173570,photo,es_ES,1.0,[pokemon],"[pokemon, pokemon sv, pokemon scarlet and viol...",4,1,other,photo,30881,25169,56050
210751,regular,en_US,1.0,[it’s not a big deal or anything don’t worry b...,[obligatory disclaimer: it’s nothing immoral l...,5,4,en,other,11,4,15
920640,photo,en_US,1.0,[fr art],"[flight rising, fr art, frfanart, fr dragons]",4,1,en,photo,1037,207,1244
621715,photo,en_US,1.0,[think i reblogged this already however. Look ...,[fob],1,1,en,photo,4008,1,4009
5183,photo,en_US,1.0,"[louis, with fans, cabo mexico, 14.01.23]","[louis tomlinson, photo, with fans, 14jan23]",4,4,en,photo,7403,807,8210
888561,photo,en_US,1.0,"[gayy, genshin impact, jeanlisa]","[genshin, genshin impact, jean gunnhildr, lisa...",6,3,en,photo,11980,9284,21264


In [8]:
for i in ["type", "lang", "is_reblog", "root_tags_count", "tags_count"]:
    print(df[i].value_counts(normalize=True).reset_index())
    print()

           type  proportion
0         photo    0.685705
1       regular    0.197783
2         video    0.055555
3      photoset    0.023339
4          note    0.022961
5         audio    0.004437
6         quote    0.004129
7          link    0.003539
8  conversation    0.002552

     lang  proportion
0   en_US    0.918699
1   es_ES    0.018607
2   de_DE    0.015366
3   fr_FR    0.010360
4   pt_BR    0.010043
5   it_IT    0.007027
6   pl_PL    0.006270
7   ru_RU    0.004571
8   nl_NL    0.003262
9   ja_JP    0.002468
10  pt_PT    0.001138
11  tr_TR    0.000961
12  ko_KR    0.000558
13  zh_CN    0.000254
14  zh_HK    0.000189
15  zh_TW    0.000180
16  id_ID    0.000047

   is_reblog  proportion
0        1.0         1.0

    root_tags_count  proportion
0                 1    0.096127
1                 2    0.094242
2                 5    0.091547
3                 3    0.088365
4                 4    0.082057
5                 6    0.074375
6                 7    0.066293
7              

In [9]:
def stratified_split(
    df, stratify_cols, train_size=0.8, val_size=0.1, test_size=0.10, random_state=42
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:

    # First split: Train and Remaining (Validation + Test)
    df_train, df_remaining = train_test_split(
        df, stratify=df[stratify_cols], train_size=train_size, random_state=random_state
    )

    # Calculate the proportion of validation set from remaining (Validation + Test)
    val_test_ratio = val_size / (val_size + test_size)

    # Second split: Validation and Test from the remaining data
    df_val, df_test = train_test_split(
        df_remaining,
        stratify=df_remaining[stratify_cols],
        test_size=val_test_ratio,
        random_state=random_state,
    )

    return df_train, df_val, df_test

In [10]:
from tag_recommender.utils.general import generate_labels

# Define bins for 'tags_count' and 'root_tags_count'

max_tags_count = max(df["root_tags_count"].max(), df["tags_count"].max())
tag_bins = [0, 1, 2, 3, 4, 5, 10, 15, 20, 25, max_tags_count]
labels = generate_labels(tag_bins)

print(tag_bins)
print(labels)

[0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 31]
['0-0', '1-1', '2-2', '3-3', '4-4', '5-9', '10-14', '15-19', '20-24', '25+']


In [11]:
from tag_recommender.process.process import bucketize_col

# Create bucketized columns
df["tags_count_bucket"] = bucketize_col(df, "tags_count", tag_bins, labels)
df["root_tags_count_bucket"] = bucketize_col(df, "root_tags_count", tag_bins, labels)

In [12]:
root_hashtag_counts = df.root_tags.explode().value_counts()
root_hashtag_counts

root_tags
my art      70708
art         50663
fanart      36311
*           32192
mine        29274
            ...  
왕이보             1
舞者王一博           1
演員王一博           1
王一博歌手           1
series 7        1
Name: count, Length: 924416, dtype: int64

In [13]:
# calculate total popularity score for each root array
df["root_tags_popularity"] = df.root_tags.apply(
    lambda tags: sum([root_hashtag_counts[tag] for tag in tags])
)

In [14]:
# Calculate the percentiles for the popularity
popularity_percentiles = df["root_tags_popularity"].quantile([0.25, 0.5, 0.75])
popularity_percentiles

0.25      719.0
0.50     5106.0
0.75    25068.0
Name: root_tags_popularity, dtype: float64

In [15]:
# Define bins for 'root_tags_popularity'
popularity_bins = [
    0,
    int(popularity_percentiles[0.25]),
    int(popularity_percentiles[0.5]),
    int(popularity_percentiles[0.75]),
    int(df["root_tags_popularity"].max()),
]

popularity_labels = generate_labels(popularity_bins)

print(popularity_bins)
print(popularity_labels)

[0, 719, 5106, 25068, 285350]
['0-718', '719-5105', '5106-25067', '25068+']


In [16]:
# Create bucketized column for 'root_tags_popularity'
df["root_tags_popularity_bucket"] = bucketize_col(
    df, "root_tags_popularity", popularity_bins, popularity_labels
)

In [17]:
# Trying to Perform the split using many columns, but it's failing due to the minimum number of samples in the validation set
# ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
# Also tried with lang_bucket, is_reblog etc, but it's still failing
df_train, df_val, df_test = stratified_split(
    df,
    stratify_cols=[
        "tags_count_bucket",
        "root_tags_count_bucket",
        "type_bucket",
        "root_tags_popularity_bucket",
    ],
)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [18]:
df_training, df_validation, df_testing = stratified_split(
    df, stratify_cols=["tags_count_bucket", "root_tags_count_bucket", "type_bucket"]
)

In [19]:
df_training.shape[0] / len(df), df_validation.shape[0] / len(df), df_testing.shape[
    0
] / len(df)

(0.7999998285629547, 0.10000008571852266, 0.10000008571852266)

In [20]:
df_training

Unnamed: 0,type,lang,is_reblog,tags,root_tags,root_tags_count,tags_count,lang_type,type_bucket,root_tags_popularity,tags_popularity,total_popularity,tags_count_bucket,root_tags_count_bucket,root_tags_popularity_bucket
1029143,photo,en_US,1.0,"[ueueueueue, it’s about the hands.jpeg, hua ch...","[dianxia statue, dianxia, tgcf xie lian, mxtx ...",22,5,en,photo,187056,1663,237461,4-4,20-24,25068+
807183,photo,en_US,1.0,"[sambucky, bucky barnes, sam wilson, simply sa...","[sambucky, I have to, and i have no regret, sa...",9,5,en,photo,852,2881,4212,4-4,5-9,719-5105
289372,note,en_US,1.0,"[NAURRRRRRRR, THIS IS NOT ALLOWED, warrior nun...","[warrior nun, ava x beatrice, avatrice, ava si...",9,5,en,other,6597,5082,14963,4-4,5-9,5106-25067
921796,photo,en_US,1.0,"[art, historical costume]","[jewellery, René Lalique, Art Nouveau, m]",4,2,en,photo,2334,94884,97914,1-1,3-3,719-5105
266194,regular,en_US,1.0,[;prosecutor!verse],"[;prosecutor!verse, hello!, I just went with a...",3,1,en,other,9,3,23,0-0,2-2,0-718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124218,photo,en_US,1.0,[nsft],"[aesthetic, black beauty, beauty, black girls ...",23,1,en,photo,77953,1414,128815,0-0,20-24,25068+
1131421,photo,en_US,1.0,"[;sadposting, ;r, ;q]",[very fitting thank u co star],1,3,en,photo,1,21,22,2-2,0-0,0-718
1117904,photo,en_US,1.0,"[god the way eddie keeps LOOKING at him, these...","[stedit, strangerthingsedit, steddieedit, sted...",29,7,en,photo,30345,31744,68356,5-9,25+,25068+
168726,photo,en_US,1.0,"[sheith, queue]","[sheith, realitieszine, shiro, keith, leftovers]",5,2,en,photo,403,9173,9842,1-1,4-4,0-718
