In [1]:
import pandas as pd
from tqdm import tqdm

from tag_recommender.utils.text import to_snake_case, to_snake_case_boosted

# Register tqdm's pandas integration so that `progress_apply` (used in the
# normalization and grouping cells below) is available and shows a progress bar.
tqdm.pandas()

In [2]:
# Load only the two tag columns from the full dataset.
df = pd.read_csv("../data/full_dataset.csv", usecols=["tags", "root_tags"])

# Each column stores its tags as one comma-separated string; turn it into a list.
# NOTE: missing values are filled with "" first, so they end up as [""]
# (a list holding one empty tag), not as an empty list.
for tag_column in ("root_tags", "tags"):
    df[tag_column] = df[tag_column].fillna("").str.split(",")

In [3]:
# Pool every tag and root tag into a single column (one row per list element)
# and keep only the first occurrence of each distinct value.
tags_df = (
    pd.concat([df[tag_source].explode() for tag_source in ("tags", "root_tags")])
    .drop_duplicates()
    .to_frame(name="initial_tag")
)

In [4]:
# Apply two kinds of normalization to every initial tag: 'basic' and 'boosted'.
raw_tags = tags_df["initial_tag"]

# basic: plain snake_case conversion
tags_df["basic_norm"] = raw_tags.progress_apply(to_snake_case)

# boosted: additionally removes emojis and accents, handles unicode characters, etc.
tags_df["advanced_norm"] = raw_tags.progress_apply(to_snake_case_boosted)

100%|█| 2401924/2401924 [00:08<00:00, 277926.10
100%|█| 2401924/2401924 [00:39<00:00, 61306.50i


In [5]:
# Number of distinct initial tags, before any normalization is applied.
tags_df.shape[0]

2401924

In [6]:
# Showcase tags on which the two normalization methods disagree.
# The sample is seeded so the displayed rows stay the same across
# Restart & Run All; without a seed, every run shows a different table.
tags_df[tags_df.basic_norm != tags_df.advanced_norm].sample(10, random_state=42)

Unnamed: 0,initial_tag,basic_norm,advanced_norm
547636,i'm surprised how little analysis there is abo...,i'm_surprised_how_little_analysis_there_is_abo...,im_surprised_how_little_analysis_there_is_abou...
101109,star: renegade,star:_renegade,star_renegade
242587,TONIGHT,t_o_n_i_g_h_t,tonight
1287962,(yes this is the original),(yes_this_is_the_original),yes_this_is_the_original
559693,oh that is a good one <3,oh_that_is_a_good_one_<3,oh_that_is_a_good_one_3
1407768,and his STUPID obsession with the Faith and hi...,and_his_s_t_u_p_i_d_obsession_with_the_faith_a...,and_his_stupid_obsession_with_the_faith_and_hi...
372306,I LOVE THE PATTERN BOOB,i_l_o_v_e_t_h_e_p_a_t_t_e_r_n_b_o_o_b,i_love_the_pattern_boob
1723423,BATTLE ROYALE BAYBAY,b_a_t_t_l_e_r_o_y_a_l_e_b_a_y_b_a_y,battle_royale_baybay
1503810,부산유흥 링크,부산유흥_링크,부산유흥_링크
1493664,second of all [i am silenced],second_of_all_[i_am_silenced],second_of_all_i_am_silenced


In [7]:
# Distinct tags left after the basic normalization ...
basic_norm_unique = tags_df.basic_norm.nunique()

# ... shown together with their share (in %) of all initial tags.
basic_norm_unique, round(100 * basic_norm_unique / tags_df.shape[0], 3)

(2306598, 96.031)

In [8]:
# Distinct tags left after the boosted normalization ...
advanced_norm_unique = tags_df.advanced_norm.nunique()

# ... shown together with their share (in %) of all initial tags.
advanced_norm_unique, round(100 * advanced_norm_unique / tags_df.shape[0], 3)

(2178634, 90.704)

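We select the method that normalizes more aggressively (the advanced one, which leaves fewer unique tags) and inspect which initial tags collapse into each normalized tag.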

In [9]:
# For every boosted-normalized tag, collect the sorted list of distinct
# initial tags that collapse into it.
norm2initial = (
    tags_df.groupby("advanced_norm")["initial_tag"]
    .progress_apply(lambda variants: sorted(set(variants)))
    .reset_index()
    .rename(columns={"advanced_norm": "normalized_tag", "initial_tag": "initial_tags"})
)

# Number of distinct initial spellings behind each normalized tag.
norm2initial["initial_count"] = norm2initial["initial_tags"].map(len)

100%|█| 2178634/2178634 [00:14<00:00, 151995.24


In [10]:
# Order normalized tags by how many initial spellings they absorb (largest first)
# and renumber the rows so that the index reflects that ranking.
norm2initial = norm2initial.sort_values("initial_count", ascending=False).reset_index(
    drop=True
)

In [11]:
# Print the full list of initial spellings for the row labelled 8
# (printed rather than displayed so the list is not truncated).
print(norm2initial.loc[8, "initial_tags"])

['!! omg', '!!!!! omg!!!!!!', '!!!!!!!!OMG', '!!!!!omg!', '( omg )', '*OMG', '..... Omg', ':omg', 'O.M.G.', 'OMG', 'OMG .', 'OMG ;;;', 'OMG ???', 'OMG ????', 'OMG 💀💀💀💀', 'OMG 😂😂😂😂😂😂', 'OMG 😭', 'OMG 😭😭😭', 'OMG 🥹', 'OMG!', 'OMG!!', 'OMG!!!', 'OMG!!!!', 'OMG!!!!!', 'OMG!!!!!!!!!', 'OMG!!!!!!!!!!!!!', 'OMG!!!!!!!!!!!!!!!!', 'OMG.', 'OMG..', 'OMG...', 'OMG...!!!!!!', 'OMG.....', 'OMG?', 'OMG?!?!!?!', 'OMG?!?!?!?!?!?!?!?!?!?', 'OMG??', 'OMG???', 'OMG????', 'OMG?????', 'OMG??????', 'OMG???????', 'OMG??????????', 'OMG…', 'OMG………..', 'OMG😭😭😭', 'OMG😭😭😭😭😭😭', 'OMG🥺🥺🥺', 'OMG🥺🥺🥺🥺', 'Omg', 'Omg 😭', 'Omg 😭😭', 'Omg 🤣🤣🤣🤣🤣🤣🤣', 'Omg!!', 'Omg!!!', 'Omg!!!!', 'Omg...', 'Omg... ❤️', 'Omg....', 'Omg:)', 'Omg?', 'Omg????', 'o.m.g.', 'omg', 'omg !!', 'omg !!!!', 'omg !!!!!!!!!!', 'omg !!!!!!!!!!!!!!!!!', 'omg ..', 'omg ....', 'omg .......', "omg :')", 'omg :(', 'omg :((', 'omg ;-;', 'omg ;;♥', 'omg ;_;', 'omg ?', 'omg ….', 'omg 👁️👁️', 'omg 👑', 'omg 💀💀', 'omg 💙🖤', 'omg 😂', 'omg 😂😂', 'omg 😂😂😂', 'omg 😂🙊', 'omg 😂🤣'

In [12]:
# The 20 normalized tags that absorb the most initial spellings.
norm2initial.iloc[:20]

Unnamed: 0,normalized_tag,initial_tags,initial_count
0,,"[, !, ! ! 💗💙, ! ! 💙💜, ! !!, ! ? ! ?, ! ★, ! 👀,...",6956
1,yeah,"[!!! yeah!!, !!!! yeah, !!!!!! yeah!!!!!, !!!!...",323
2,ooc,"[! OOC., & ooc, & ooc., &&. ooc, &&ooc, &. oo...",233
3,oh,"[!!oh, 'OH! 👀👀👀', 'oh, 'oh :(', ( OH??????????...",228
4,yes,"[! yes, !! yes!!!, !!! YES, !!! yes, !!!! yes,...",198
5,prev,"[!! prev, !!!!! prev, (< prev), (<-prev), (<pr...",194
6,me,"[(ME), (me), **** me, *me, - me, . me ., ...",184
7,oh_my_god,"[...oh my god, OH MY GOD, OH MY GOD, OH MY G...",180
8,omg,"[!! omg, !!!!! omg!!!!!!, !!!!!!!!OMG, !!!!!om...",178
9,them,"[!!! THEM!!!, !!!! them, !!!!! THEM, !!!!! the...",177


In [16]:
# Character length of each normalized tag, used to inspect very short tags.
norm2initial["length"] = norm2initial["normalized_tag"].map(len)

In [24]:
# Single-character normalized tags, in the current row order (first 30).
norm2initial.loc[norm2initial["length"].eq(1)].head(30)

Unnamed: 0,normalized_tag,initial_tags,initial_count,length
24,3,"[! <3, !! <3, !!! :3, !!! <3, !!!! <3, !!!!! <...",111,1
44,o,"[!! :O, !!!!! :O, (*o * ), (o) ), (⁠゜⁠o⁠゜⁠;...",88,1
60,i,"[*i, .i, :I, I, I -, I -----, I ., I 😭😭😭😭😭😭😭, ...",70,1
77,q,"[!! q, &. q, ( q. ), (q), * q ♡ ·, *q, + q., -...",63,1
81,a,"[*A*, *a, +a, .a, /a, :A, :A:, ; A ;, ; A ; !!...",61,1
99,d,"[! :D, !!! :D, -d, /d, : D, :""D, :'D, :) -d, :...",55,1
122,m,"[**m, *m, - M, -M, -m, .m, ///m, /m, :m, ;m, ;...",49,1
149,c,"[*c, .c, //c, /c, :'C, :)c, :C, :c, :| c, ;c, ...",43,1
165,s,"[( /s? ), (/s), (s), *s, .s, /////S, /s, :s, S...",41,1
189,0,"[!!!!!!!!! :0, $0, $:@0-):, *0*, .0, 0, 0 :(, ...",38,1


In [29]:
# Two-character normalized tags, in the current row order (first 30).
norm2initial.loc[norm2initial["length"].eq(2)].head(30)

Unnamed: 0,normalized_tag,initial_tags,initial_count,length
3,oh,"[!!oh, 'OH! 👀👀👀', 'oh, 'oh :(', ( OH??????????...",228,2
6,me,"[(ME), (me), **** me, *me, - me, . me ., ...",184,2
13,im,"[( im, I'M, I'M ., I'M @)!__(""!&&, I'M-, I'M.....",131,2
25,he,"[!!!!! HE, !!HE!!, .......he, ......he, :) he,...",111,2
36,ic,"[( IC. ), ( ic ; ), ( ic. ), ( ic; ), (.ic), (...",95,2
43,no,"[''NO'', ''no'', '...no?', (no), ***no, ... no...",89,2
54,ok,"[''ok :('', 'ok', .... ok, ......... ok, ........",72,2
56,hi,"[!!!!!!!!!!!!!!!!!!!!!!HI, (hi), . hi., .........",72,2
128,um,"[......um, ...um, UM, UM 👀, UM!!!!, UM!!!!!!!!...",48,2
129,ah,"[( ah. ), ....ah, ...ah, .Ah, :')))))))) ah, >...",47,2


In [31]:
# Three-character normalized tags, in the current row order (first 50).
norm2initial.loc[norm2initial["length"].eq(3)].head(50)

Unnamed: 0,normalized_tag,initial_tags,initial_count,length
2,ooc,"[! OOC., & ooc, & ooc., &&. ooc, &&ooc, &. oo...",233,3
4,yes,"[! yes, !! yes!!!, !!! YES, !!! yes, !!!! yes,...",198,3
8,omg,"[!! omg, !!!!! omg!!!!!!, !!!!!!!!OMG, !!!!!om...",178,3
11,txt,"[* ♡ txt., ***txt, **txt, *.txt, *txt, . txt, ...",154,3
16,art,"[! art, !art, ( * art ), ( art . 🎨 ), ( art. )...",125,3
17,him,"[!!!!!!! HIM, (HIM!!!!), * / him 💋💅, *him, ......",122,3
27,she,"[!!!!! she🥹, *she, . 💚💚💚 she!!!, .... she, // ...",107,3
33,wow,"[!!!! WOW, !!!!!!!!! WOW, *wow*, ..... wow, .....",100,3
39,her,"[(HER...), *her, *her*, @her, H.E.R., HER, HER...",91,3
55,huh,"[((huh)), .... huh ?, ......huh, ....huh, .......",72,3
