## Purpose 

The purpose of the notebook is to use annotatoins (and possibly description) of each emoji to intelligently cluster emojis
* use webscraping csv file to get dataframe with emoji, annotation

** Import Packages **

In [453]:
import nltk
import numpy as np
import gensim
from gensim.models import Word2Vec
from nltk.data import find
import pandas as pd
from collections import defaultdict

Function to convert the text file into a list of 1) titles 2) descriptions 3) annotations

In [454]:
def convert_scraped_txt(txt):
    with open("emoji_webscraped.txt") as f_in:
        titles = []
        descriptions = []
        annotations = []
        for line in f_in:
            line = line.strip()
            temp = line.split(", ")
            titles.append(temp[0])
            descriptions.append(temp[1])
            annotations.append(temp[2:len(temp)])
        return titles, descriptions, annotations

titles, descriptions, annotations = convert_scraped_txt("emoji_webscraped.txt")

In [455]:
print(len(titles))
print(len(descriptions))
print(len(annotations))

1282
1282
1282


** Create dataframe from three arrays **

In [456]:
d = {'titles' : (titles),
     'annotations' : (annotations),
     'descriptions': (descriptions)}
df = pd.DataFrame(d)

In [457]:
df.head()

Unnamed: 0,annotations,descriptions,titles
0,"[face, grin, person]",grinning face,U+1F600
1,"[eye, face, grin, person, smile]",grinning face with smiling eyes,U+1F601
2,"[face, joy, person, tear]",face with tears of joy,U+1F602
3,"[face, mouth, open, person, smile]",smiling face with open mouth,U+1F603
4,"[eye, face, mouth, open, person, smile]",smiling face with open mouth and smiling eyes,U+1F604


**Subset Dataframe to only annotations which either contain face or person**

In [458]:
def subset_annotations(_df):
    list_titles = [list(item) for item in list(_df)]
    index_face_person = [index for index,value in enumerate(list_titles) if 'face' in value] # or 'person' in value]
    # print(len(index_face_person))
    df_face_person = df.iloc[index_face_person]
    # print(df_face_person.shape)
    # df_face_person.head()
    return df_face_person

In [459]:
df_face_person = subset_annotations(df.annotations)

In [460]:
def word_list(annotations, _common=100):
    total_list = [word for item in list(annotations) for word in item]
    top_list = nltk.FreqDist(total_list).most_common(_common)
    return total_list, top_list

In [461]:
total_list, top_list = word_list(df_face_person.annotations, 50)

In [462]:
def top_justwords(top_list):
    top_words = [item[0] for item in top_list]
    return top_words

In [463]:
top_justwords = top_justwords(top_list)
top_justwords

['face',
 'person',
 'nature',
 'animal',
 'smile',
 'eye',
 'fairy tale',
 'cat',
 'mouth',
 'monster',
 'fantasy',
 'open',
 'space',
 'weather',
 'place',
 'creature',
 'kiss',
 'tear',
 'pet',
 'cold',
 'monkey',
 'moon',
 'no',
 'evil',
 'sad',
 'sweat',
 'not',
 'bright',
 'grin',
 'frown',
 'tongue',
 'cry',
 'heart',
 'body',
 'gesture',
 'prohibited',
 'forbidden',
 'angel',
 'surprised',
 'death',
 'weary',
 'quarter',
 'wink',
 'mad',
 'pig',
 'pouting',
 'speak',
 'fear',
 'disappointed',
 'alien']

In [464]:
df_face_person.head()

Unnamed: 0,annotations,descriptions,titles
0,"[face, grin, person]",grinning face,U+1F600
1,"[eye, face, grin, person, smile]",grinning face with smiling eyes,U+1F601
2,"[face, joy, person, tear]",face with tears of joy,U+1F602
3,"[face, mouth, open, person, smile]",smiling face with open mouth,U+1F603
4,"[eye, face, mouth, open, person, smile]",smiling face with open mouth and smiling eyes,U+1F604


In [465]:
df_features = pd.DataFrame(columns=top_justwords)

In [466]:
top_justwords_array = [(top_justwords) for i in range(df_face_person.shape[0])]
# df_face_person['top_words'] = 1
df_face_person['top_words'] = pd.Series(top_justwords_array, index=df_face_person.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [467]:
df_face_person.shape

(121, 4)

In [468]:
# something = [list(pd.Series(item).isin(df_face_person.annotations)) for item in df_face_person.top_words]

df_face_person['top_binary'] = [list(pd.Series(item).isin(list(df_face_person.annotations)[index])) for index, item in enumerate(df_face_person.top_words)]
df_face_person = df_face_person.reset_index(drop=True)
df_face_person.shape




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


(121, 5)

In [469]:
data_features = list(df_face_person.top_binary)

df_features = pd.DataFrame(data_features, columns=top_justwords)
df_features = df_features.astype(int)
df_features.shape

(121, 50)

In [470]:
df_combined = pd.concat([df_face_person, df_features], axis=1, join_axes=[df_face_person.index])

In [471]:
df_combined.shape

(121, 55)

In [472]:
# df_face_person['top_binary_num'] = None

# for index in range((df_face_person.shape)[0]):
#     temp = []
#     for boolean in df_face_person['top_binary'][index]:
#         if boolean==True:
#             temp.append(1)
#         else:
#             temp.append(0)
#     df_face_person['top_binary_num'][index] = temp

# # for index, _list in enumerate(df_face_person['top_binary']):
# #     temp = []
# #     for boolean in _list:
# #         if boolean==True:
# #             temp.append(1)
# #         else:
# #             temp.append(0)
# #     df_face_person['top_binary_num'][index] = temp


** Notes from John:**
count vectorizer on annoations descriptions
clustering with binary data, possibly asocaition rules
tfidf
feature vector
k_means on either full vector (or on lower dimensional space)

In [531]:
from sklearn.cluster import KMeans

def Clustering(df_combined, n_clusters=5):
    _X = np.array(df_combined.ix[:,5:(df_combined.shape)[1]])
    k_means = KMeans(init='k-means++', n_clusters= n_clusters, n_init=20)
    k_means.fit(_X)
    k_means_labels = k_means.labels_
    k_means_cluster_centers = k_means.cluster_centers_
    k_means_labels_unique = np.unique(k_means_labels)
    ft = (k_means_labels, k_means_cluster_centers, k_means_labels_unique)
    labels = np.array(k_means_labels_unique)
    location = np.array(k_means_cluster_centers)
    labels_location = list(zip(labels, location))
    # person_df['cluster_label'] = pd.DataFrame(k_means_labels)
    print ("labels:\n %s, \n cluster centers:\n %s,\n  unique labels:\n %s" % ft)
    print(labels_location)
    return k_means_labels

k_means_labels = Clustering(df_combined, n_clusters=5)

labels:
 [1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 4 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 4 4 4 4 1 1
 4 4 4 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 4 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 2 2 2 2 2 3 1], 
 cluster centers:
 [[  1.00000000e+00   1.00000000e+00   1.11022302e-16   5.55111512e-17
    8.00000000e-01   4.00000000e-01   1.38777878e-17   2.77555756e-17
    4.00000000e-01   1.38777878e-17   1.38777878e-17   4.66666667e-01
   -2.08166817e-17   6.66666667e-02  -1.38777878e-17   1.38777878e-17
    6.66666667e-02   1.38777878e-17   0.00000000e+00   1.33333333e-01
    0.00000000e+00   0.00000000e+00  -6.93889390e-18  -6.93889390e-18
   -6.93889390e-18   1.33333333e-01  -6.93889390e-18   6.66666667e-02
    6.66666667e-02   6.66666667e-02  -6.93889390e-18  -6.93889390e-18
    6.66666667e-02  -6.93889390e-18  -6.93889390e-18  -6.93889390e-18
   -6.93889390e-18   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.

In [532]:
print(df_combined.shape)
print(len(k_means_labels))

(121, 56)
121


In [533]:
df_combined['k_means'] = list(k_means_labels)

In [534]:
unique_labels = list(labels)
dict_grouping = dict.fromkeys(unique_labels)
dict_grouping

{0: None, 1: None, 2: None, 3: None, 4: None}

In [535]:
subset = df_combined[['k_means','descriptions']]
subset = subset.values.tolist()
# subset

In [536]:
dict_grouping = defaultdict(list)
for key, date in subset:
    dict_grouping[key].append(date)

In [537]:
for key, value in dict_grouping.items():
    print(key)
    print(value)
    print("\n")


0
['grinning face with smiling eyes', 'smiling face with open mouth', 'smiling face with open mouth and smiling eyes', 'smiling face with open mouth and cold sweat', 'smiling face with open mouth and tightly-closed eyes', 'smiling face with smiling eyes', 'face savouring delicious food', 'smiling face with sunglasses', 'smiling face with heart-shaped eyes', 'kissing face with smiling eyes', 'white smiling face', 'slightly smiling face', 'face with open mouth', 'frowning face with open mouth', 'face with open mouth and cold sweat']


1
['grinning face', 'face with tears of joy', 'winking face', 'face throwing a kiss', 'kissing face', 'kissing face with closed eyes', 'hugging face', 'thinking face', 'neutral face', 'expressionless face', 'face without mouth', 'face with rolling eyes', 'smirking face', 'persevering face', 'disappointed but relieved face', 'zipper-mouth face', 'hushed face', 'sleepy face', 'tired face', 'sleeping face', 'relieved face', 'nerd face', 'face with stuck-out to

# LDA

In [539]:
from gensim import corpora, models, similarities 

In [540]:
df_combined.head()

Unnamed: 0,annotations,descriptions,titles,top_words,top_binary,face,person,nature,animal,smile,...,quarter,wink,mad,pig,pouting,speak,fear,disappointed,alien,k_means
0,"[face, grin, person]",grinning face,U+1F600,"[face, person, nature, animal, smile, eye, fai...","[True, True, False, False, False, False, False...",1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,"[eye, face, grin, person, smile]",grinning face with smiling eyes,U+1F601,"[face, person, nature, animal, smile, eye, fai...","[True, True, False, False, True, True, False, ...",1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"[face, joy, person, tear]",face with tears of joy,U+1F602,"[face, person, nature, animal, smile, eye, fai...","[True, True, False, False, False, False, False...",1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,"[face, mouth, open, person, smile]",smiling face with open mouth,U+1F603,"[face, person, nature, animal, smile, eye, fai...","[True, True, False, False, True, False, False,...",1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,"[eye, face, mouth, open, person, smile]",smiling face with open mouth and smiling eyes,U+1F604,"[face, person, nature, animal, smile, eye, fai...","[True, True, False, False, True, True, False, ...",1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [541]:
texts= df_combined.annotations

#create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]

In [543]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x10c301898>

In [550]:
%time lda = models.LdaModel(corpus, num_topics=10, \
                            id2word=dictionary, \
                            update_every=5, \
                            chunksize=10000, \
                            passes=100)

CPU times: user 12.7 s, sys: 20.3 ms, total: 12.7 s
Wall time: 12.7 s


In [551]:
lda.show_topics()

[(0,
  '0.160*person + 0.054*tear + 0.054*body + 0.054*sad + 0.054*cry + 0.037*monster + 0.037*tongue + 0.037*wink + 0.037*death + 0.037*skull'),
 (1,
  '0.220*person + 0.084*mouth + 0.074*smile + 0.067*open + 0.034*eye + 0.026*frown + 0.026*sweat + 0.026*cold + 0.026*heart + 0.017*mad'),
 (2,
  '0.085*grin + 0.065*person + 0.030*surprised + 0.030*moyai + 0.030*statue + 0.030*travel + 0.030*stunned + 0.030*hushed + 0.030*place + 0.030*horse'),
 (3,
  '0.142*person + 0.127*eye + 0.091*kiss + 0.070*smile + 0.020*horrible + 0.020*taste + 0.020*injury + 0.020*bandage + 0.020*hurt + 0.020*blush'),
 (4,
  '0.226*nature + 0.183*animal + 0.074*cat + 0.045*weather + 0.037*place + 0.037*space + 0.030*moon + 0.030*pet + 0.015*quarter + 0.015*pig'),
 (5,
  '0.167*person + 0.098*fairy tale + 0.088*fantasy + 0.059*monster + 0.049*creature + 0.030*smile + 0.021*nature + 0.020*alien + 0.020*extraterrestrial + 0.020*ufo'),
 (6,
  '0.130*person + 0.034*tired + 0.034*joy + 0.034*tear + 0.021*nature + 0.0

In [554]:
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix
# topics_matrix = np.array(topics_matrix)

# topic_words = topics_matrix[:,:,1]
# for i in topic_words:
#     print([str(word) for word in i])
#     print()

[(0,
  [('person', 0.15961170571365257),
   ('tear', 0.054049528870208134),
   ('body', 0.054048960919355127),
   ('sad', 0.054048889347755584),
   ('cry', 0.054048889347755501),
   ('monster', 0.036613954044276337),
   ('tongue', 0.036613914152601776),
   ('wink', 0.036613820149575593),
   ('death', 0.03661380121855598),
   ('skull', 0.036613801218555647),
   ('eye', 0.036612959518771401),
   ('sob', 0.019178667725240899),
   ('joke', 0.019178667725222757),
   ('rolling', 0.019178667690230852),
   ('eyes', 0.019178667690229384),
   ('fairy tale', 0.019178116009641769),
   ('crossbones', 0.0017580003381538812),
   ('cat', 0.0017436440999485186),
   ('animal', 0.0017435659004636809),
   ('nature', 0.0017435561814640567)]),
 (1,
  [('person', 0.21984458527397097),
   ('mouth', 0.08379984779038116),
   ('smile', 0.07404616692994731),
   ('open', 0.06720580127213728),
   ('eye', 0.034035309510224009),
   ('frown', 0.02572075675343655),
   ('sweat', 0.025720709808085301),
   ('cold', 0.0257

# Word2Vec

In [603]:
from gensim.models.doc2vec import TaggedDocument, LabeledSentence, Doc2Vec

In [604]:
def label_sentence(_df):
    sentences = []
    for uid, line in enumerate(_df):
        sentences.append(LabeledSentence(words=df_combined.annotations[uid], \
                              tags=[df_combined.descriptions[uid]]))
    return sentences

label_sentence(df_combined)

[LabeledSentence(words=['face', 'grin', 'person'], tags=['grinning face']),
 LabeledSentence(words=['eye', 'face', 'grin', 'person', 'smile'], tags=['grinning face with smiling eyes']),
 LabeledSentence(words=['face', 'joy', 'person', 'tear'], tags=['face with tears of joy']),
 LabeledSentence(words=['face', 'mouth', 'open', 'person', 'smile'], tags=['smiling face with open mouth']),
 LabeledSentence(words=['eye', 'face', 'mouth', 'open', 'person', 'smile'], tags=['smiling face with open mouth and smiling eyes']),
 LabeledSentence(words=['cold', 'face', 'open', 'person', 'smile', 'sweat'], tags=['smiling face with open mouth and cold sweat']),
 LabeledSentence(words=['face', 'laugh', 'mouth', 'open', 'person', 'satisfied', 'smile'], tags=['smiling face with open mouth and tightly-closed eyes']),
 LabeledSentence(words=['face', 'person', 'wink'], tags=['winking face']),
 LabeledSentence(words=['blush', 'eye', 'face', 'person', 'smile'], tags=['smiling face with smiling eyes']),
 Labeled

In [605]:
sentences = list(LabeledSentence(df_combined.annotations[uid], (df_combined.descriptions[uid])) for uid, value in enumerate(df_combined))
model = Doc2Vec(size=200, min_count=1, workers=16)
model.build_vocab(sentences)


# model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)

# model.build_vocab(sentences.to_array)

In [607]:
for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha

In [609]:
# sims = model.docvecs.most_similar(['grinning face'])
# model.most_similar('grinning face')
model.most_similar("face with thermometer")

KeyError: "word 'face with thermometer' not in vocabulary"