In [4]:
import pandas as pd
import numpy as np

import sys
sys.path.append("..")
from settings import AMBIGUITY_PATH

In [5]:
ambiguity = pd.read_csv(AMBIGUITY_PATH, encoding='utf-8')
ambiguity.head()

Unnamed: 0,WorkerID,FormId,Duration,emoji_index,emoji,word
0,A19AAMLW7OP5V4,65,330.0,0,#️⃣,ash
1,A1DD23J1WBGQUU,65,315.0,0,#️⃣,hashtag
2,A2C7A6E70NYNUI,65,317.0,0,#️⃣,hashtag
3,A2CK0OXMPOR9LE,65,521.0,0,#️⃣,pound
4,A272X64FOZFYLB,65,329.0,0,#️⃣,pound


In [85]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = list(ambiguity.groupby("emoji").word.apply(list).apply(lambda x: " ".join(x)).values)
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)

num_topics = 1000
lsa = TruncatedSVD(num_topics)
lsa.fit(tfidf)

dictionary = vectorizer.get_feature_names()

lsa_topics = pd.DataFrame(lsa.components_,
                      index=[f"topic{i}" for i in range(num_topics)],
                      columns=dictionary).T
lsa_topics.head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,...,topic990,topic991,topic992,topic993,topic994,topic995,topic996,topic997,topic998,topic999
aardvark,9.379191e-10,5.477594e-08,7.176654e-07,7.014542e-07,5.416686e-08,3.937373e-09,9.103998e-08,1.831172e-06,1.560337e-07,5e-06,...,-0.000276,4.5e-05,0.000185,0.000375,0.000191,-0.000217,0.00016,-0.000325,-0.000553,-0.000444
ab,9.159959e-08,7.255564e-06,0.0003432,-0.0001865158,-3.888424e-06,-5.122018e-06,4.02908e-05,-5.413787e-06,-4.941931e-06,4.2e-05,...,-0.018455,0.005943,0.019112,0.015533,0.008717,-0.030606,-0.001493,0.005272,0.003222,0.004112
abacus,5.362996e-07,7.115101e-06,0.00296892,-0.001632498,-0.0001428282,-4.997754e-05,-0.0002869638,-7.52306e-05,0.0001413431,-4e-05,...,-0.002666,-0.000805,-0.009637,0.007331,-0.003218,-0.01151,-0.003083,-0.003451,0.00752,0.001856
abbreviations,9.439497e-08,2.086882e-06,0.0005315578,-0.0002860461,-2.629994e-05,-9.092181e-06,-4.880661e-05,-9.521623e-06,1.309162e-05,-2e-06,...,-0.001069,-0.000122,-0.000618,0.000144,-0.00034,0.001745,0.000485,-0.001272,-0.001237,0.00029
abc,3.593172e-08,3.576845e-06,7.456573e-05,-4.002368e-05,2.255005e-06,-7.911761e-07,2.810846e-05,-3.115418e-08,1.459945e-06,2.1e-05,...,-0.013352,0.004792,0.013933,0.012242,0.005283,-0.025778,-0.000342,0.001523,0.004744,0.004433


In [86]:
sum(lsa.explained_variance_ratio_)

0.9930222267063094

In [87]:
for i in range(num_topics):
    lsa_topics[f'abs_topic{i}'] = np.abs(lsa_topics[f'topic{i}'])
    display(lsa_topics.sort_values(f'abs_topic{i}', ascending=False).index[:10])

Index(['family', 'mother', 'parent', 'daughter', 'couple', 'people', 'love',
       'father', 'siblings', 'son'],
      dtype='object')

Index(['love', 'heart', 'kiss', 'couple', 'hearts', 'kissing', 'lesbian',
       'lovestruck', 'valentine', 'family'],
      dtype='object')

Index(['chinese', 'sad', 'symbol', 'japanese', 'kanji', 'sign', 'crying',
       'asian', 'character', 'boy'],
      dtype='object')

Index(['sad', 'chinese', 'crying', 'worried', 'boy', 'cry', 'upset', 'symbol',
       'japanese', 'happy'],
      dtype='object')

Index(['moon', 'square', 'circle', 'sad', 'happy', 'smile', 'night', 'eclipse',
       'black', 'ball'],
      dtype='object')

Index(['train', 'bus', 'subway', 'trolley', 'tram', 'helicopter', 'rail',
       'monorail', 'cablecar', 'trolly'],
      dtype='object')

Index(['square', 'black', 'moon', 'blank', 'box', 'block', 'white', 'chinese',
       'gray', 'yellow'],
      dtype='object')

Index(['happy', 'boy', 'smile', 'man', 'sad', 'moon', 'baby', 'laughing',
       'girl', 'woman'],
      dtype='object')

Index(['star', 'flower', 'asterisk', 'sun', 'jewish', 'stars', 'snowflake',
       'diamond', 'explosion', 'snow'],
      dtype='object')

Index(['heart', 'love', 'kiss', 'cat', 'couple', 'kissing', 'boy', 'man',
       'lesbian', 'friends'],
      dtype='object')

Index(['book', 'down', 'up', 'notebook', 'arrow', 'bookmark', 'triangle',
       'chevron', 'reading', 'turn'],
      dtype='object')

Index(['up', 'down', 'arrow', 'book', 'triangle', 'point', 'turn', 'upward',
       'notebook', 'right'],
      dtype='object')

Index(['man', 'boy', 'cat', 'happy', 'smile', 'laughing', 'laugh', 'silly',
       'angry', 'love'],
      dtype='object')

Index(['money', 'cash', 'yen', 'dollar', 'rich', 'currency', 'chinese',
       'exchange', 'pounds', 'cat'],
      dtype='object')

Index(['sun', 'flower', 'circle', 'diamond', 'cloudy', 'star', 'sunny',
       'sunrise', 'moon', 'sunshine'],
      dtype='object')

Index(['flower', 'diamond', 'sun', 'circle', 'star', 'cloudy', 'sunny',
       'sunrise', 'rose', 'tulip'],
      dtype='object')

Index(['cat', 'angry', 'down', 'happy', 'smile', 'man', 'boy', 'kiss', 'up',
       'love'],
      dtype='object')

Index(['down', 'up', 'cat', 'man', 'happy', 'angry', 'arrow', 'book', 'boy',
       'smile'],
      dtype='object')

Index(['wheelchair', 'disabled', 'handicapped', 'handicap', 'chair',
       'disability', 'scooter', 'accessible', 'sitting', 'girl'],
      dtype='object')

Index(['diamond', 'flower', 'star', 'building', 'corn', 'rose', 'hospital',
       'tulip', 'flowers', 'down'],
      dtype='object')

Index(['building', 'hospital', 'school', 'city', 'church', 'temple', 'house',
       'court', 'diamond', 'flower'],
      dtype='object')

Index(['stop', 'circle', 'no', 'sun', 'hand', 'kiss', 'mailbox', 'mail',
       'phone', 'sign'],
      dtype='object')

Index(['kiss', 'angry', 'love', 'heart', 'mad', 'circle', 'couple', 'stop',
       'no', 'lips'],
      dtype='object')

Index(['mailbox', 'mail', 'circle', 'email', 'envelope', 'letter', 'kiss',
       'angry', 'sun', 'love'],
      dtype='object')

Index(['circle', 'no', 'stop', 'sun', 'phone', 'dot', 'cat', 'kiss', 'cloudy',
       'surprised'],
      dtype='object')

Index(['angry', 'cat', 'kiss', 'mouse', 'mad', 'circle', 'laughing', 'devil',
       'love', 'man'],
      dtype='object')

Index(['surprised', 'shocked', 'surprise', 'monkey', 'shock', 'angry',
       'police', 'exclamation', 'man', 'stop'],
      dtype='object')

Index(['police', 'car', 'cop', 'monkey', 'pilot', 'policeman', 'mailman',
       'bus', 'mouse', 'officer'],
      dtype='object')

Index(['mouse', 'monkey', 'laughing', 'girl', 'woman', 'phone', 'laugh', 'man',
       'cat', 'rat'],
      dtype='object')

Index(['phone', 'monkey', 'mouse', 'telephone', 'stop', 'cellphone', 'no',
       'circle', 'hand', 'selfie'],
      dtype='object')

Index(['monkey', 'phone', 'mouse', 'surprised', 'blind', 'shocked', 'gorilla',
       'surprise', 'shock', 'telephone'],
      dtype='object')

Index(['girl', 'woman', 'laughing', 'mouse', 'cat', 'laugh', 'surprised',
       'monkey', 'man', 'grandma'],
      dtype='object')

Index(['medal', 'second', 'winner', 'prize', 'award', 'third', 'place',
       'first', 'bronze', 'silver'],
      dtype='object')

Index(['laughing', 'girl', 'woman', 'laugh', 'smile', 'happy', 'crying', 'cat',
       'funny', 'grandma'],
      dtype='object')

Index(['dog', 'soup', 'wolf', 'fox', 'bowl', 'food', 'puppy', 'bear',
       'guidedog', 'poodle'],
      dtype='object')

Index(['movie', 'camera', 'film', 'music', 'video', 'camcorder', 'action',
       'movies', 'projector', 'cd'],
      dtype='object')

Index(['music', 'movie', 'camera', 'film', 'cd', 'video', 'headphones', 'note',
       'disc', 'camcorder'],
      dtype='object')

Index(['house', 'hospital', 'home', 'church', 'temple', 'houses', 'building',
       'no', 'ship', 'flag'],
      dtype='object')

Index(['no', 'hand', 'stop', 'fist', 'water', 'wave', 'cross', 'wrong',
       'house', 'rain'],
      dtype='object')

Index(['alphabet', 'rain', 'letter', 'water', 'umbrella', 'cloudy', 'letters',
       'cloud', 'mailbox', 'wave'],
      dtype='object')

Index(['alphabet', 'rain', 'letter', 'umbrella', 'cloudy', 'water', 'letters',
       'cloud', 'beach', 'paper'],
      dtype='object')

Index(['graph', 'fist', 'chart', 'stocks', 'fistbump', 'stock', 'signal',
       'punch', 'sign', 'rain'],
      dtype='object')

Index(['turn', 'arrow', 'rewind', 'triangle', 'refresh', 'left', 'reverse',
       'soup', 'repeat', 'alphabet'],
      dtype='object')

Index(['fist', 'graph', 'fistbump', 'soup', 'punch', 'stop', 'chart',
       'hospital', 'stocks', 'no'],
      dtype='object')

Index(['soup', 'drink', 'tree', 'fist', 'food', 'bowl', 'dog', 'silly', 'tea',
       'turn'],
      dtype='object')

Index(['hospital', 'house', 'building', 'church', 'ship', 'fist', 'boat',
       'temple', 'flag', 'city'],
      dtype='object')

Index(['silly', 'crazy', 'goofy', 'soup', 'tongue', 'laugh', 'teasing', 'baby',
       'fist', 'laughing'],
      dtype='object')

Index(['bus', 'car', 'sick', 'police', 'van', 'fish', 'ship', 'taxi', 'soup',
       'boat'],
      dtype='object')

Index(['sick', 'tree', 'angry', 'ship', 'fish', 'car', 'boat', 'hurt',
       'couple', 'bus'],
      dtype='object')

Index(['fish', 'bus', 'plane', 'ship', 'triangle', 'airplane', 'flag', 'boat',
       'couple', 'sick'],
      dtype='object')

Index(['tree', 'soup', 'baby', 'sick', 'triangle', 'bus', 'road', 'christmas',
       'palm', 'fish'],
      dtype='object')

Index(['triangle', 'sick', 'down', 'tree', 'turn', 'angle', 'up', 'silly',
       'fish', 'couple'],
      dtype='object')

Index(['couple', 'ship', 'boat', 'friends', 'fish', 'cruise', 'hospital',
       'bus', 'church', 'flag'],
      dtype='object')

Index(['couple', 'plane', 'airplane', 'ship', 'friends', 'boat', 'fish', 'car',
       'cruise', 'bus'],
      dtype='object')

Index(['car', 'bus', 'fish', 'couple', 'flag', 'friends', 'plane', 'airplane',
       'taxi', 'man'],
      dtype='object')

Index(['calendar', 'plane', 'airplane', 'car', 'bus', 'fish', 'flag', 'date',
       'man', 'calender'],
      dtype='object')

Index(['calendar', 'plane', 'car', 'airplane', 'fish', 'bus', 'date', 'flag',
       'calender', 'man'],
      dtype='object')

Index(['lock', 'purse', 'locked', 'unlock', 'luggage', 'suitcase', 'pen',
       'bag', 'briefcase', 'tools'],
      dtype='object')

Index(['shirt', 'doctor', 'coat', 'labcoat', 'jacket', 'dress', 'jersey',
       'church', 'sick', 'stethoscope'],
      dtype='object')

Index(['earth', 'world', 'globe', 'baby', 'water', 'cloudy', 'global',
       'internet', 'tree', 'clouds'],
      dtype='object')

Index(['baby', 'tree', 'man', 'water', 'car', 'couple', 'bus', 'umbrella',
       'earth', 'friends'],
      dtype='object')

Index(['shoe', 'boot', 'sandal', 'sneaker', 'heels', 'heel', 'slipper',
       'running', 'shoes', 'walk'],
      dtype='object')

Index(['boy', 'man', 'car', 'baby', 'bus', 'grandpa', 'grandma', 'exclamation',
       'child', 'person'],
      dtype='object')

Index(['water', 'wave', 'cloudy', 'baby', 'drink', 'umbrella', 'question',
       'stop', 'cloud', 'soup'],
      dtype='object')

Index(['church', 'building', 'chick', 'hospital', 'fish', 'city', 'cross',
       'chicken', 'pray', 'synagogue'],
      dtype='object')

Index(['chick', 'chicken', 'church', 'bird', 'building', 'turkey', 'drumstick',
       'hospital', 'meat', 'city'],
      dtype='object')

Index(['umbrella', 'cloudy', 'beach', 'cloud', 'clouds', 'mountain', 'sun',
       'island', 'tree', 'church'],
      dtype='object')

Index(['cake', 'paper', 'document', 'alphabet', 'letter', 'file', 'letters',
       'folder', 'files', 'note'],
      dtype='object')

Index(['speaker', 'sound', 'volume', 'wave', 'question', 'quiet', 'water',
       'silent', 'megaphone', 'mute'],
      dtype='object')

Index(['cake', 'paper', 'document', 'alphabet', 'pie', 'birthday', 'letter',
       'pen', 'hair', 'cupcake'],
      dtype='object')

KeyboardInterrupt: 

In [64]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(corpus)

num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics)
lda.fit(bow)

dictionary = vectorizer.get_feature_names()
lda_topics = pd.DataFrame(lda.components_,
                      index=[f"topic{i}" for i in range(num_topics)],
                      columns=dictionary).T
lda_topics

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
aardvark,0.100000,0.100000,0.100000,1.100000,0.1,0.100000,0.100000,0.100000,0.100000,0.100000
ab,0.100000,0.100000,0.100000,0.100000,0.1,0.100000,0.100000,0.100045,10.099955,0.100000
abacus,0.100000,0.100000,0.100000,0.100000,13.1,0.100000,0.100000,0.100000,0.100000,0.100000
abbreviations,1.099990,0.100000,0.100000,0.100000,0.1,0.100000,0.100010,0.100000,0.100000,0.100000
abc,0.100000,0.100000,0.100000,0.100000,0.1,0.100000,0.100000,0.100162,0.100000,3.099838
...,...,...,...,...,...,...,...,...,...,...
zodiac,0.100000,0.100000,0.100010,0.100000,0.1,0.100001,27.096149,0.100008,0.100016,2.103817
zodiccircle,1.099997,0.100000,0.100000,0.100000,0.1,0.100000,0.100000,0.100000,0.100000,0.100003
zombie,0.100000,0.100000,0.100000,0.100000,0.1,0.100005,23.099994,0.100000,0.100001,0.100000
zone,0.100000,0.100000,0.100000,0.100000,0.1,0.100000,0.100000,1.100000,0.100000,0.100000


In [65]:
for i in range(num_topics):
    lda_topics[f'abs_topic{i}'] = np.abs(lda_topics[f'topic{i}'])
    display(lda_topics.sort_values(f'abs_topic{i}', ascending=False).index[:10])

Index(['flower', 'wheelchair', 'music', 'umbrella', 'baby', 'fist', 'calendar',
       'eye', 'whale', 'laughing'],
      dtype='object')

Index(['sad', 'cat', 'kiss', 'building', 'mailbox', 'cow', 'chicken', 'crying',
       'judge', 'santa'],
      dtype='object')

Index(['moon', 'square', 'circle', 'hospital', 'ball', 'house', 'horse',
       'mail', 'question', 'golf'],
      dtype='object')

Index(['sun', 'monkey', 'car', 'medal', 'bus', 'shirt', 'ship', 'boat', 'lock',
       'apple'],
      dtype='object')

Index(['book', 'down', 'tree', 'soup', 'drink', 'purse', 'chick', 'arrow',
       'exclamation', 'rewind'],
      dtype='object')

Index(['family', 'money', 'diamond', 'up', 'angry', 'church', 'mountain',
       'triangle', 'tiger', 'chair'],
      dtype='object')

Index(['chinese', 'phone', 'girl', 'symbol', 'shoe', 'bell', 'blind', 'woman',
       'candle', 'sign'],
      dtype='object')

Index(['stop', 'no', 'alphabet', 'camera', 'pig', 'pen', 'hand', 'couple',
       'dragon', 'silly'],
      dtype='object')

Index(['love', 'heart', 'happy', 'man', 'smile', 'boy', 'mouse', 'police',
       'angel', 'scooter'],
      dtype='object')

Index(['train', 'star', 'dog', 'cake', 'cloudy', 'peace', 'key', 'camel',
       'dinosaur', 'sunrise'],
      dtype='object')