In [1]:
import pandas as pd
from tqdm import tqdm

from tag_recommender.utils.text import to_snake_case, to_snake_case_boosted

# Register tqdm's pandas integration so that `.progress_apply` (used in later
# cells) is available on Series / GroupBy objects and shows a progress bar.
tqdm.pandas()

In [2]:
# Load only the two tag columns from the full dataset.
df = pd.read_csv("../data/full_dataset.csv", usecols=["tags", "root_tags"])

# Both columns hold comma-separated tag strings; turn each into a list of tags.
# Missing values are filled with "" first, so those rows become [""] (a single
# empty tag) rather than NaN.
for tag_column in ("root_tags", "tags"):
    df[tag_column] = df[tag_column].fillna("").str.split(",")

ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [3]:
# Flatten the tag lists of both columns into one long Series, keep each distinct
# tag once, and store the result as a single-column frame named `initial_tag`.
tags_df = (
    pd.concat([df[column].explode() for column in ("tags", "root_tags")])
    .drop_duplicates()
    .to_frame(name="initial_tag")
)

In [4]:
# Normalize every raw tag in two ways, one result column per method.
initial_tags = tags_df["initial_tag"]

# 'basic': plain snake_case conversion
tags_df["basic_norm"] = initial_tags.progress_apply(to_snake_case)

# 'boosted': additionally removes emojis and accents, handles unicode characters, etc.
tags_df["advanced_norm"] = initial_tags.progress_apply(to_snake_case_boosted)

100%|███████████████████████████████████| 2401924/2401924 [00:08<00:00, 276403.08it/s]
100%|████████████████████████████████████| 2401924/2401924 [00:39<00:00, 61155.70it/s]


In [5]:
# Number of distinct raw tags, before any normalization is applied
tags_df.shape[0]

2401924

In [6]:
# Showcase the difference between the two normalization methods by sampling
# rows where their results disagree.
# A fixed `random_state` makes the displayed sample reproducible: without it,
# every re-run (e.g. Restart & Run All) would show different rows.
norms_differ = tags_df["basic_norm"] != tags_df["advanced_norm"]
tags_df[norms_differ].sample(10, random_state=42)

Unnamed: 0,initial_tag,basic_norm,advanced_norm
1038124,the dialogue is probably unnessessary but I di...,the_dialogue_is_probably_unnessessary_but_i_di...,the_dialogue_is_probably_unnessessary_but_i_di...
1185332,what is he riding on in the water somehow what...,what_is_he_riding_on_in_the_water_somehow_what...,what_is_he_riding_on_in_the_water_somehow_what
1064770,@rose thanks for the idea fjjdkdkd,@rose_thanks_for_the_idea_fjjdkdkd,rose_thanks_for_the_idea_fjjdkdkd
859215,itziar verría’s flowers,itziar_verría’s_flowers,itziar_verrias_flowers
1182961,this is so cOOOOLLL,this_is_so_c_o_o_o_o_l_l_l,this_is_so_c_oooolll
1539342,reshiram.... weird little dog,reshiram...._weird_little_dog,reshiram_weird_little_dog
630313,//pls feel free to reply if you wanna make thi...,//pls_feel_free_to_reply_if_you_wanna_make_thi...,pls_feel_free_to_reply_if_you_wanna_make_this_...
949323,SPEAKING OF MY LUCAS,s_p_e_a_k_i_n_g_o_f_m_y_l_u_c_a_s,speaking_of_my_lucas
1401349,Leti who paid you and how much?,leti_who_paid_you_and_how_much?,leti_who_paid_you_and_how_much
699975,like i want to punch him but in like a homoero...,like_i_want_to_punch_him_but_in_like_a_homoero...,like_i_want_to_punch_him_but_in_like_a_homoero...


In [7]:
# Distinct tags remaining after basic normalization, plus their share (%) of
# all raw tags.
basic_norm_unique = tags_df["basic_norm"].nunique()
basic_norm_pct = round(100 * basic_norm_unique / len(tags_df), 3)

(basic_norm_unique, basic_norm_pct)

(2306598, 96.031)

In [8]:
# Distinct tags remaining after boosted normalization, plus their share (%) of
# all raw tags.
advanced_norm_unique = tags_df["advanced_norm"].nunique()
advanced_norm_pct = round(100 * advanced_norm_unique / len(tags_df), 3)

(advanced_norm_unique, advanced_norm_pct)

(2179675, 90.747)

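We select the method that normalizes more aggressively (the boosted one, which leaves fewer unique tags) and inspect which initial tags collapse into each normalized tag.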

In [9]:
# For every boosted-normalized tag, collect the sorted list of distinct raw
# tags that map onto it.
grouped_initial = tags_df.groupby("advanced_norm")["initial_tag"]
norm2initial = grouped_initial.progress_apply(
    lambda variants: sorted(set(variants))
).reset_index()

# Count how many raw variants each normalized tag absorbs, then give the three
# columns their final names (assigned positionally).
norm2initial["n_initial"] = norm2initial["initial_tag"].map(len)
norm2initial.columns = ["normalized_tag", "initial_tags", "initial_count"]

100%|███████████████████████████████████| 2179675/2179675 [00:13<00:00, 156887.51it/s]


In [10]:
# Rank normalized tags by how many raw variants they absorb (largest first) and
# renumber the rows 0..n-1 so a row label equals its rank.
# - Reassigning instead of using `inplace=True` keeps the step a plain
#   expression chain with no hidden in-place mutation.
# - `kind="mergesort"` is a stable sort: tags with equal counts keep their
#   existing relative order, so rank-based lookups give the same row on every
#   run (the default quicksort leaves the order of ties unspecified).
norm2initial = norm2initial.sort_values(
    "initial_count", ascending=False, kind="mergesort"
).reset_index(drop=True)

In [11]:
# Print the full, untruncated list of raw variants for the row labelled 8
# (the `omg` group in the recorded run).
print(norm2initial.loc[8, "initial_tags"])

['!! omg', '!!!!! omg!!!!!!', '!!!!!!!!OMG', '!!!!!omg!', '( omg )', '*OMG', '..... Omg', ':omg', 'O.M.G.', 'OMG', 'OMG .', 'OMG ;;;', 'OMG ???', 'OMG ????', 'OMG 💀💀💀💀', 'OMG 😂😂😂😂😂😂', 'OMG 😭', 'OMG 😭😭😭', 'OMG 🥹', 'OMG!', 'OMG!!', 'OMG!!!', 'OMG!!!!', 'OMG!!!!!', 'OMG!!!!!!!!!', 'OMG!!!!!!!!!!!!!', 'OMG!!!!!!!!!!!!!!!!', 'OMG.', 'OMG..', 'OMG...', 'OMG...!!!!!!', 'OMG.....', 'OMG?', 'OMG?!?!!?!', 'OMG?!?!?!?!?!?!?!?!?!?', 'OMG??', 'OMG???', 'OMG????', 'OMG?????', 'OMG??????', 'OMG???????', 'OMG??????????', 'OMG…', 'OMG………..', 'OMG😭😭😭', 'OMG😭😭😭😭😭😭', 'OMG🥺🥺🥺', 'OMG🥺🥺🥺🥺', 'Omg', 'Omg 😭', 'Omg 😭😭', 'Omg 🤣🤣🤣🤣🤣🤣🤣', 'Omg!!', 'Omg!!!', 'Omg!!!!', 'Omg...', 'Omg... ❤️', 'Omg....', 'Omg:)', 'Omg?', 'Omg????', 'o.m.g.', 'omg', 'omg !!', 'omg !!!!', 'omg !!!!!!!!!!', 'omg !!!!!!!!!!!!!!!!!', 'omg ..', 'omg ....', 'omg .......', "omg :')", 'omg :(', 'omg :((', 'omg ;-;', 'omg ;;♥', 'omg ;_;', 'omg ?', 'omg ….', 'omg 👁️👁️', 'omg 👑', 'omg 💀💀', 'omg 💙🖤', 'omg 😂', 'omg 😂😂', 'omg 😂😂😂', 'omg 😂🙊', 'omg 😂🤣'

In [12]:
# Show the first 20 rows: the normalized tags with the most raw variants
norm2initial.iloc[:20]

Unnamed: 0,normalized_tag,initial_tags,initial_count
0,,"[, !, ! ! 💗💙, ! ! 💙💜, ! !!, ! ? ! ?, ! ★, ! 👀,...",6956
1,yeah,"[!!! yeah!!, !!!! yeah, !!!!!! yeah!!!!!, !!!!...",323
2,ooc,"[! OOC., & ooc, & ooc., &&. ooc, &&ooc, &. oo...",233
3,oh,"[!!oh, 'OH! 👀👀👀', 'oh, 'oh :(', ( OH??????????...",228
4,yes,"[! yes, !! yes!!!, !!! YES, !!! yes, !!!! yes,...",198
5,prev,"[!! prev, !!!!! prev, (< prev), (<-prev), (<pr...",194
6,me,"[(ME), (me), **** me, *me, - me, . me ., ...",184
7,oh_my_god,"[...oh my god, OH MY GOD, OH MY GOD, OH MY G...",180
8,omg,"[!! omg, !!!!! omg!!!!!!, !!!!!!!!OMG, !!!!!om...",178
9,them,"[!!! THEM!!!, !!!! them, !!!!! THEM, !!!!! the...",177
