In [1]:
import pandas as pd
from tqdm import tqdm

from tag_recommender.utils.text import to_snake_case, to_snake_case_boosted

# Register tqdm's pandas integration so the `progress_apply` calls used
# further down in this notebook are available and report their progress.
tqdm.pandas()

In [2]:
# Load only the two tag columns from the full dataset.
df = pd.read_csv("../data/full_dataset.csv", usecols=["tags", "root_tags"])

# Both columns hold comma-separated tag strings; turn each into a list of tags.
# Missing values are replaced by "" first, so they become [""] after the split.
df = df.assign(
    root_tags=df["root_tags"].fillna("").str.split(","),
    tags=df["tags"].fillna("").str.split(","),
)

In [3]:
# Flatten the tag lists of both columns into one long series (one tag per
# row), keep the first occurrence of every distinct tag, and store the result
# as a single-column frame named "initial_tag".
tags_df = (
    pd.concat([df[column].explode() for column in ("tags", "root_tags")])
    .drop_duplicates()
    .to_frame(name="initial_tag")
)

In [4]:
# Normalize every initial tag in two ways and keep both results side by side:
#   - basic_norm:    plain snake_case conversion
#   - advanced_norm: "boosted" variant that also removes emojis and accents
#                    and handles unicode characters etc.
# The callables run in the order written, so the basic progress bar is shown
# first and the boosted one second.
tags_df = tags_df.assign(
    basic_norm=lambda frame: frame["initial_tag"].progress_apply(to_snake_case),
    advanced_norm=lambda frame: frame["initial_tag"].progress_apply(
        to_snake_case_boosted
    ),
)

100%|██████████| 2401924/2401924 [00:08<00:00, 273039.30it/s]
100%|██████████| 2401924/2401924 [00:39<00:00, 60661.29it/s]


In [5]:
# Number of distinct tags before any normalization is applied
tags_df.shape[0]

2401924

In [6]:
# Showcase the difference between the two normalization methods: keep only the
# tags for which they disagree and display ten of them.
# The fixed seed makes the displayed sample identical on every re-run; without
# it the table changes each time the cell is executed.
tags_df[tags_df["basic_norm"] != tags_df["advanced_norm"]].sample(
    10, random_state=42
)

Unnamed: 0,initial_tag,basic_norm,advanced_norm
20284,I CAN HEAR IT IN HIS VOICE,i_c_a_n_h_e_a_r_i_t_i_n_h_i_s_v_o_i_c_e,i_can_hear_it_in_his_voice
1259604,victoire!!!,victoire!!!,victoire
75934,and the worst part is that I’m not even sure l...,and_the_worst_part_is_that_i’m_not_even_sure_l...,and_the_worst_part_is_that_im_not_even_sure_li...
839840,this is why i usually don’t care about canon t...,this_is_why_i_usually_don’t_care_about_canon_t...,this_is_why_i_usually_dont_care_about_canon_th...
1658845,áudio,áudio,audio
1005631,once art!,once_art!,once_art
1572561,Gerald <3,gerald_<3,gerald_3
1460676,yokluğunda,yokluğunda,yoklugunda
101210,too horrible to the point i don't wanna post t...,too_horrible_to_the_point_i_don't_wanna_post_t...,too_horrible_to_the_point_i_dont_wanna_post_th...
631955,there's a few adaptations that do show him bei...,there's_a_few_adaptations_that_do_show_him_bei...,theres_a_few_adaptations_that_do_show_him_bein...


In [7]:
# Distinct values left after the basic normalization, shown together with
# that count as a percentage of all initial tags (rounded to 3 decimals).
basic_norm_unique = tags_df["basic_norm"].nunique()

(basic_norm_unique, round(100 * basic_norm_unique / tags_df.shape[0], 3))

(2306598, 96.031)

In [8]:
# Distinct values left after the advanced normalization, shown together with
# that count as a percentage of all initial tags (rounded to 3 decimals).
advanced_norm_unique = tags_df["advanced_norm"].nunique()

(advanced_norm_unique, round(100 * advanced_norm_unique / tags_df.shape[0], 3))

(2178634, 90.704)

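The advanced normalization collapses more tags than the basic one (90.7% vs. 96.0% unique values remaining), so we use it to inspect which initial tags are mapped to each normalized tag.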

In [9]:
# For every advanced-normalized tag, collect the sorted list of distinct
# initial tags that map to it, one row per normalized tag.
norm2initial = (
    tags_df.groupby("advanced_norm")["initial_tag"]
    .progress_apply(lambda group: sorted(set(group)))
    .reset_index()
    .rename(
        columns={"advanced_norm": "normalized_tag", "initial_tag": "initial_tags"}
    )
)
# Number of initial tags collapsed into each normalized tag.
norm2initial["initial_count"] = norm2initial["initial_tags"].map(len)

100%|██████████| 2178634/2178634 [00:14<00:00, 152304.44it/s]


In [10]:
# Rank the normalized tags by how many distinct initial tags collapse into
# them, largest first, and renumber the rows so the index equals the rank.
# Reassigning instead of using `inplace=True` keeps the step a single chained
# expression and avoids mutating the frame behind the reader's back.
norm2initial = norm2initial.sort_values(
    "initial_count", ascending=False
).reset_index(drop=True)

In [11]:
# Print every initial spelling that is normalized to "omg".
# The row is selected by its normalized tag rather than by a hard-coded
# position, so the cell keeps showing the same tag if the ranking changes.
print(
    norm2initial.loc[norm2initial["normalized_tag"] == "omg", "initial_tags"].iloc[0]
)

['!! omg', '!!!!! omg!!!!!!', '!!!!!!!!OMG', '!!!!!omg!', '( omg )', '*OMG', '..... Omg', ':omg', 'O.M.G.', 'OMG', 'OMG .', 'OMG ;;;', 'OMG ???', 'OMG ????', 'OMG 💀💀💀💀', 'OMG 😂😂😂😂😂😂', 'OMG 😭', 'OMG 😭😭😭', 'OMG 🥹', 'OMG!', 'OMG!!', 'OMG!!!', 'OMG!!!!', 'OMG!!!!!', 'OMG!!!!!!!!!', 'OMG!!!!!!!!!!!!!', 'OMG!!!!!!!!!!!!!!!!', 'OMG.', 'OMG..', 'OMG...', 'OMG...!!!!!!', 'OMG.....', 'OMG?', 'OMG?!?!!?!', 'OMG?!?!?!?!?!?!?!?!?!?', 'OMG??', 'OMG???', 'OMG????', 'OMG?????', 'OMG??????', 'OMG???????', 'OMG??????????', 'OMG…', 'OMG………..', 'OMG😭😭😭', 'OMG😭😭😭😭😭😭', 'OMG🥺🥺🥺', 'OMG🥺🥺🥺🥺', 'Omg', 'Omg 😭', 'Omg 😭😭', 'Omg 🤣🤣🤣🤣🤣🤣🤣', 'Omg!!', 'Omg!!!', 'Omg!!!!', 'Omg...', 'Omg... ❤️', 'Omg....', 'Omg:)', 'Omg?', 'Omg????', 'o.m.g.', 'omg', 'omg !!', 'omg !!!!', 'omg !!!!!!!!!!', 'omg !!!!!!!!!!!!!!!!!', 'omg ..', 'omg ....', 'omg .......', "omg :')", 'omg :(', 'omg :((', 'omg ;-;', 'omg ;;♥', 'omg ;_;', 'omg ?', 'omg ….', 'omg 👁️👁️', 'omg 👑', 'omg 💀💀', 'omg 💙🖤', 'omg 😂', 'omg 😂😂', 'omg 😂😂😂', 'omg 😂🙊', 'omg 😂🤣'

In [12]:
# First 20 rows of the normalized-tag -> initial-tags mapping
norm2initial.iloc[:20]

Unnamed: 0,normalized_tag,initial_tags,initial_count
0,,"[, !, ! ! 💗💙, ! ! 💙💜, ! !!, ! ? ! ?, ! ★, ! 👀,...",6956
1,yeah,"[!!! yeah!!, !!!! yeah, !!!!!! yeah!!!!!, !!!!...",323
2,ooc,"[! OOC., & ooc, & ooc., &&. ooc, &&ooc, &. oo...",233
3,oh,"[!!oh, 'OH! 👀👀👀', 'oh, 'oh :(', ( OH??????????...",228
4,yes,"[! yes, !! yes!!!, !!! YES, !!! yes, !!!! yes,...",198
5,prev,"[!! prev, !!!!! prev, (< prev), (<-prev), (<pr...",194
6,me,"[(ME), (me), **** me, *me, - me, . me ., ...",184
7,oh_my_god,"[...oh my god, OH MY GOD, OH MY GOD, OH MY G...",180
8,omg,"[!! omg, !!!!! omg!!!!!!, !!!!!!!!OMG, !!!!!om...",178
9,them,"[!!! THEM!!!, !!!! them, !!!!! THEM, !!!!! the...",177
