# TF-IDF Similarity for Emoticons
Find similar emoticons using TF-IDF, where tokens are the individual characters in the emoticons.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
import random
import json

In [2]:
# Read the emoticon dictionary from disk; its keys (the emoticon strings) make up the corpus.
with open('emoticon_dict.json', 'r', encoding='utf-8') as dict_file:
  emoticon_dict = json.load(dict_file)
# Iterating a dict yields its keys in insertion order, same as .keys().
corpus = [emoticon for emoticon in emoticon_dict]
# Quick sanity check: preview the first few entries and the total size.
print(corpus[:5], '...')
print("CORPUS LENGTH:", len(corpus))

['( ˘͈ ᵕ ˘͈♡)', '(╥﹏╥)', '（＾ω＾）', '(◍•ᴗ•◍)♡ ✧*。', '⸜( ˙˘˙)⸝'] ...
CORPUS LENGTH: 62149


In [54]:
# search by regex and/or labels (specifically 'new_tags')
import re
def search(regex='', labels=None, emoticons=None):
    """Return the emoticons that match a regex and carry all the given labels.

    A filter is ignored when it is not supplied, so calling ``search()`` with
    no arguments returns every emoticon.

    Args:
        regex: Pattern applied with ``re.search`` to each emoticon string.
            An empty pattern (the default) disables the regex filter.
        labels: Iterable of tags that must all be present in an emoticon's
            ``'new_tags'`` entry. A single string is treated as one label
            rather than being iterated character by character. ``None`` or
            an empty iterable disables the label filter.
        emoticons: Mapping of emoticon string -> metadata dict to search.
            Defaults to the notebook-level ``emoticon_dict``.

    Returns:
        A list of the matching emoticon strings, in the mapping's iteration order.

    Raises:
        KeyError: If a label filter is active and an entry that passed the
            regex filter has no ``'new_tags'`` key.
    """
    if emoticons is None:
        emoticons = emoticon_dict
    if isinstance(labels, str):
        labels = [labels]
    required_labels = list(labels) if labels else []
    # Compile once up front instead of handing the raw pattern to re.search per emoticon.
    pattern = re.compile(regex) if regex else None

    matches = []
    for em, metadata in emoticons.items():
        if pattern is not None and not pattern.search(em):
            continue
        if required_labels:
            em_labels = metadata['new_tags']
            # Generator form lets all() stop at the first missing label.
            if not all(label in em_labels for label in required_labels):
                continue
        matches.append(em)
    return matches

# Example: emoticons that contain '◍', then '‿', then another '◍' (in that order) and are tagged 'smiling'.
search(regex='◍.*‿.*◍', labels=['smiling'])

["(◍'‿'◍)", '꒰✩◍´´•‿•´´◍✩꒱', '(◍＾‿＾◍)', '(◍•‿•◍)', '(◍‿◍)', '(◍◕ω◕)人(◕‿◕◍)']

In [4]:
# Fit the TF-IDF NearestNeighbors model
# Each emoticon is a "document" whose tokens are its individual characters ('char_wb' analyzer, 1-grams).
vectorizer = TfidfVectorizer(analyzer = 'char_wb', ngram_range=(1,1))  #1,3 would look at 1-grams, 2-grams, and 3-grams
# Keep the TF-IDF matrix sparse instead of calling .todense():
#  * .todense() returns a numpy.matrix, which scikit-learn >= 1.2 rejects with a TypeError;
#  * a dense copy of roughly 62k emoticons x 3.7k character features wastes memory, since each
#    emoticon only uses a handful of characters.
# Brute-force search with the cosine metric accepts sparse input directly.
X = vectorizer.fit_transform(corpus)
nbrs = NearestNeighbors(n_neighbors=3, metric='cosine', algorithm='brute').fit(X)

# Show how many distinct character features were learned, plus a small preview of them.
feature_names = vectorizer.get_feature_names_out()
print("FEATURES:", len(feature_names), feature_names[:10], "...")



FEATURES: 3723 [' ' '!' '"' '#' '$' '%' '&' "'" '(' ')'] ...


In [5]:
# returns the n closest emoticons to the input emoticon
def get_n_most_similar(input_emoticon, n):
  """Return the ``n`` corpus emoticons nearest to ``input_emoticon``.

  The input is vectorized with the fitted notebook-level ``vectorizer`` and
  looked up in the fitted ``nbrs`` model (cosine distance).

  Args:
    input_emoticon: The emoticon string to find neighbours for.
    n: How many neighbours to return.

  Returns:
    A list of ``(emoticon, distance)`` tuples in the order returned by
    ``nbrs.kneighbors`` (closest first).
  """
  # Pass the sparse matrix from transform() straight through: .todense() would build a
  # numpy.matrix, which scikit-learn >= 1.2 refuses as estimator input.
  input_vector = vectorizer.transform([input_emoticon])
  distances_np, indices_np = nbrs.kneighbors(input_vector, n_neighbors=n)
  # kneighbors returns one row per query; we only query one emoticon, so take row 0.
  distances = distances_np[0].tolist()
  closest_indices = indices_np[0].tolist()  # positions of the neighbours within `corpus`
  return [(corpus[ci], distance) for ci, distance in zip(closest_indices, distances)]

In [8]:
# Try it out with your own inputs!
# Backslashes inside the literals are doubled: a lone '\_' is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+). The string values are unchanged.
example_emoticons = ['(ﾉ◕ヮ◕)ﾉ*:・ﾟ', '¯\\_(ツ)_/¯', '/╲/( ͡͡° ͜ʖ ͡°)/\\╱\\', '(╯°□°)╯︵ ┻━┻', '┬┴┬┴┤ᵒᴥᵒᶅ├┬┴┬┴', '_(:3 」∠)_', '[̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅ ]']
input_emoticon = example_emoticons[0]
num_similar = 20

# One row per neighbour: the emoticon, its cosine distance (3 decimals), and its tags.
for emoticon, distance in get_n_most_similar(input_emoticon, num_similar):
  tag_text = ", ".join(emoticon_dict[emoticon]['new_tags'])
  print( "{: <35} {: <10} {: <50}".format(emoticon, round(distance, 3), tag_text) )



(ﾉ◕ヮ◕)ﾉ*:・ﾟ✧                        0.051      smiling                                           
✧(ﾉ◕ヮ◕)ﾉ *:・ﾟ                       0.057      smiling                                           
(ﾉ◕ヮ◕)ﾉ*:･ﾟ                         0.075      sparkles, smiling                                 
( ﾉ◕ヮ◕ )ﾉ*:･ﾟ                       0.101      sparkles, smiling                                 
(❁ﾉ◕ヮ◕)ﾉ*:･ﾟ                        0.114      flower, sparkles, smiling                         
(ﾉ◕ヮ◕)ﾉ*:･ﾟ✧                        0.123      surprised, sparkles, smiling                      
(ﾉ◕ヮ◕)ﾉ* :･ﾟ✧                       0.127      sparkles, smiling                                 
( ﾉ◕ヮ◕ )ﾉ*:･ﾟ✧                      0.143      sparkles, smiling                                 
(ﾉ◕ヮ◕)ﾉ;*:･ﾟ✧                       0.164      sparkles, smiling                                 
(ﾉ◕ヮ◕)ﾉ*:･ﾟ✧( ﾟヮﾟ)                  0.167      sparkles                                          
(ﾉ◕_●◕)ﾉ*:・ﾟ✧       

## Samples of each tag

In [10]:
all_tags = ['angel', 'anger', 'annoyed', 'archery', 'asleep', 'basketball', 'bats_vampires', 'beach', 'bear', 'bird', 'birthday', 'blush', 'bomb', 'breasts', 'butt', 'butterfly', 'cat', 'cheerleader', 'chess', 'christmas', 'cigarette', 'clown', 'computers', 'crab', 'crying', 'dancing', 'dead', 'devil', 'dog', 'donger', 'drink', 'excited', 'fight', 'fish', 'flex', 'flower', 'food', 'football', 'frog', 'glasses', 'goodbye_message', 'gun', 'hamster', 'heart', 'hello_message', 'hug', 'kiss', 'koala', 'lenny', 'lying_down', 'middle finger', 'money', 'monkey', 'monocle', 'morning_night_evening_message', 'mouse', 'music', 'mustache', 'penis', 'pig', 'ping_pong', 'pointing', 'pokemon', 'proposal', 'rabbit', 'radio', 'rain', 'robot', 'rose', 'running', 'sad', 'salute_wave', 'seal', 'sheep', 'shrug', 'smiling', 'smirk', 'soccer', 'sparkles', 'spider', 'spinning', 'surprised', 'sweat', 'sword', 'syringe', 'table_flip', 'table_upright', 'thanks_message', 'thumbs_up', 'wall', 'wand', 'wink', 'writing', 'yummy', 'zombie']
SAMPLES_PER_TAG = 5  # maximum number of example emoticons printed for each tag
SAMPLE_SEED = 42     # change to draw a different (but still reproducible) set of samples
sample_rng = random.Random(SAMPLE_SEED)  # dedicated RNG so the global random state is left alone

# For every tag, print how many emoticons carry it and a few random examples.
for tag in all_tags:
  tagged_emoticons = [em for em, v in emoticon_dict.items() if tag in v['new_tags']]
  # random.sample raises ValueError when asked for more items than the population holds,
  # so cap the request for tags that have fewer than SAMPLES_PER_TAG emoticons.
  sample_size = min(SAMPLES_PER_TAG, len(tagged_emoticons))
  samples = sample_rng.sample(tagged_emoticons, sample_size)
  print(f'____ {tag} ({len(tagged_emoticons)}) __________________')
  print("\n".join(samples), '\n')

____ angel (674) __________________
O:-)
(✿˶˘ ³˘)ʚ(╹ᵕ╹o)ɞ
ଘ(人ﾟ∀ﾟ)(≧U≦❁)β♡
ଘ(๑˃̵ᴗ˂̵)━☆ﾟ.*･｡ﾟ
( つ•̀ω•́)つ ଘ(੭ˊ꒳ˋ)੭ 

____ anger (2993) __________________
╰(⇀︿⇀)つ-]═───
(๑ò_ó๑)
(ง⩺ᴥ⩹)ง
(╬ ‾᷅皿‾᷄ ╬)
MIN ■■■■■■■■■■ MAX o(｀Θ´)○☆ 

____ annoyed (893) __________________
(*≗*)
(¬ω¬)╭∩╮
(╬ಠิ益ಠิ)
ʕ -㉨- ʔ
(-ᴥ-ʋ)\(•́ ∀ •̀✿) 

____ archery (231) __________________
⤜(☼_☼)⤏
⤜(⪩◡⪨)⤏
⤜(ꔸヮꔸ)⤏
⤜( ͡ಥ ͜つಥ)⤏
⤜( ͡°~ ͡°)⤏ 

____ asleep (439) __________________
（ˇ ⊖ˇ）💤(✿ᵕ◡ᵕ)
(☆uεu ☆)ZZzz
(✿˵ᵕ∀ᵕ˵)💤(｡-‿-｡)
(u˘ﻌ˘).zZ
(-ﻌ-∪).zZ 

____ basketball (63) __________________
Ю ●　＼(^_＼)
(✿˵•́ ω •̀˵) 🗑 🏀＼(oωo＼)
-🏀     \(°‿°\)
🗑️
ヽ(ﾟ∀ﾟ)ﾉ ⚽ 　🏐＼(^_＼)　-🏀     \(°‿°\) 

____ bats_vampires (67) __________________
⎛⎝(⌒ⱅ⌒ )⎠⎞
o,..,o
(ી(΄◞ิ౪◟ิ‵)ʃ) (ી(΄◞ิ౪◟ิ‵)ʃ)
(㇏(♡ ᢍ ♡)ノ)𝔟𝔩𝔬𝔬𝔡…
⎛⎝(•̀ ‿•)⎠⎞ 

____ beach (51) __________________
人人人ヾ( ;×o×)〃 人人人
(´ ͡༎ຶ ͜ʖ ͡༎ຶ `)︵‿︵
人人人ヾ( ;×o×)ツ人人人
(ꈍ⌓ꈍ✿)=
︵‿︵‿︵‿︵‿︵‿︵‿︵‿︵‿︵‿ヽ(゜□゜ )ノ︵‿︵‿︵‿︵‿︵‿ 

____ bear (4179) __________________
ʕु•̫͡•ʔुʔ•̫͡•ཻʕʕु•̫͡•ʔु
(✿≧ᴗ≦)⊂ʕ•ᴥ•⊂ʔ
⊂ʕ♥ᴥ♥ ʔ
(ﾉ´ •з•) ﾉʕ´❛ᴥ❛`ʔ
ʕ✿╹ᴥ╹ʔ 

__

____ writing (271) __________________
＿〆(。。)
◛φ(♥U♥ )
🖋️
(♡ơ♡)✎
＿〆( 。。) 

____ yummy (154) __________________
٩(*ゝڡゝ๑)۶♥
🍜ԅ(´ڡ`ԅ)
ԅ(ˆڡˆԅ)
(๑•ڡ•)
🥞 ☕ԅ(´ڡ`ԅ)ლ(ಠ益ಠლ) 

____ zombie (123) __________________
(づO◡O)づ
(づ⪨▾⪩)づ
(づx◡x)づ
ᕦ(ಠ∀ಠ)【⪨_⪩】
(๑oᵕo)╦╤─⌐(×▵×⌐҂) 

