In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB

from Emojidata import EMOJIdata

In [67]:
# Load both splits through the project's EMOJIdata loader. The training split
# is built first because the test split is constructed with the training
# vocabulary object (presumably so that both feature matrices share the same
# column indexing -- confirm against EMOJIdata).
train_path, test_path = "./data/us_train.text", "./data/us_test.text"
train = EMOJIdata(train_path)
shared_vocab = train.vocab
test = EMOJIdata(test_path, vocab=shared_vocab)

In [3]:
# Build the lookup {label id: (emoji glyph, emoji name)} from the mapping file.
# Each non-empty line holds whitespace-separated fields: id, glyph, name.
emojiMap = {}
# The context manager closes the handle deterministically (the bare open() in
# a for-loop leaked it). The file contains emoji glyphs, so the encoding is
# pinned to UTF-8 instead of relying on the platform default.
with open("./data/us_mapping.txt", encoding="utf-8") as mapping_file:
    for line in mapping_file:
        fields = line.split()
        if not fields:
            # Tolerate blank or trailing empty lines instead of raising IndexError.
            continue
        emojiMap[int(fields[0])] = (fields[1], fields[2])
emojiMap

{0: ('‚ù§', '_red_heart_'),
 1: ('üòç', '_smiling_face_with_hearteyes_'),
 2: ('üòÇ', '_face_with_tears_of_joy_'),
 3: ('üíï', '_two_hearts_'),
 4: ('üî•', '_fire_'),
 5: ('üòä', '_smiling_face_with_smiling_eyes_'),
 6: ('üòé', '_smiling_face_with_sunglasses_'),
 7: ('‚ú®', '_sparkles_'),
 8: ('üíô', '_blue_heart_'),
 9: ('üòò', '_face_blowing_a_kiss_'),
 10: ('üì∑', '_camera_'),
 11: ('üá∫üá∏', '_United_States_'),
 12: ('‚òÄ', '_sun_'),
 13: ('üíú', '_purple_heart_'),
 14: ('üòâ', '_winking_face_'),
 15: ('üíØ', '_hundred_points_'),
 16: ('üòÅ', '_beaming_face_with_smiling_eyes_'),
 17: ('üéÑ', '_Christmas_tree_'),
 18: ('üì∏', '_camera_with_flash_'),
 19: ('üòú', '_winking_face_with_tongue_')}

In [4]:
# Baseline: always predict the label stored in train.mostFreqLbl and measure
# how often that single label matches the gold test labels.
print("Most frequent tag is: ", emojiMap[train.mostFreqLbl])
# Fraction of test examples whose gold label equals the baseline label.
print("Baseline by most frequent tag: ", np.count_nonzero(test.Y == train.mostFreqLbl)/test.Y.shape[0])

Most requent tag is:  ('‚ù§', '_red_heart_')
Baseline by most frequent tag:  0.21596


In [177]:
%%time
# Sweep the Naive Bayes smoothing parameter. For each alpha, train one binary
# detector per emoji id (0..19) and label every test example with the emoji
# whose detector reports the highest positive-class probability.
for alpha in (0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0):
    clfs = []
    for gold in range(20):
        # One-vs-rest target: keep the current emoji id, map every other label to -1.
        Y = np.where(train.Y == gold, train.Y, -1)
        clfs.append(
            MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None).fit(train.X, Y)
        )
    # Row i holds detector i's probability for its own emoji. scikit-learn
    # sorts classes_, so column 1 is the emoji id (it is always greater than -1).
    probs = np.vstack([clf.predict_proba(test.X)[:, 1] for clf in clfs])
    predTags = probs.argmax(axis=0)
    # Accuracy = fraction of test examples whose arg-max emoji equals the gold label.
    score = np.mean(predTags == test.Y)
    print("Alpha = %.2f\tscore = %.5f" %(alpha, score))

Alpha = 0.01	score = 0.37264
Alpha = 0.05	score = 0.38798
Alpha = 0.10	score = 0.39256
Alpha = 0.50	score = 0.36046
Alpha = 1.00	score = 0.32578
Alpha = 5.00	score = 0.25368
Alpha = 10.00	score = 0.23856
CPU times: user 1min 11s, sys: 2.75 s, total: 1min 14s
Wall time: 18.8 s


In [179]:
%%time
# Refit the one-vs-rest Naive Bayes ensemble at the smoothing value that scored
# highest in the alpha sweep, then report accuracy on the test split.
# NOTE(review): alpha was chosen by looking at test-split scores, so the
# accuracy printed here is optimistic; a held-out dev split would be needed
# for an unbiased estimate.
alpha = 0.1 #with best performance
clfs = []
for gold in range(20):
    # Binary target for this detector: the current emoji id vs. -1 for all others.
    Y = np.copy(train.Y)
    Y[Y != gold] = -1
    clf = MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None)
    clf.fit(train.X, Y)
    clfs.append(clf)

# probs[i, j] = detector i's probability that test example j carries emoji i.
# scikit-learn sorts classes_, so column 1 is the emoji id (always > -1).
probs = np.zeros((len(clfs), test.Y.shape[0]))
for i, clf in enumerate(clfs):
    probs[i] = clf.predict_proba(test.X)[:, 1]

# Predict the emoji whose detector is most confident; score is plain accuracy.
# (The original computed this same score twice; once is enough.)
predTags = np.argmax(probs, axis=0)
score = np.sum(np.equal(predTags, test.Y)) / test.Y.shape[0]

print("Emoji Prediction by Naive Bayes with alpha=", alpha)
print("score: %.4f" % score)
print()
# For every emoji, rank vocabulary entries by the log-ratio
#     log P(word | emoji) - log P(word | rest)
# taken from that emoji's binary detector, and print both ends of the ranking.
for i in range(20):
    print("For tag: ", emojiMap[i])
    # feature_log_prob_ already stores log-probabilities (row 0: the -1 "rest"
    # class, row 1: the emoji class), so the log of the ratio is a plain
    # difference -- no exp()/log() round trip and no risk of underflow.
    coef = clfs[i].feature_log_prob_[1] - clfs[i].feature_log_prob_[0]
    coefSorted = np.argsort(coef)
    print("10 most important features: ")
    # Walk the ascending sort backwards: the ten largest ratios, biggest first.
    for coefid in coefSorted[-1:-11:-1]:
        # The separator space keeps words of 15+ characters from running into the number.
        print("{:<15} {:.4f}".format(train.vocab.GetWord(coefid), coef[coefid]))
    # The ten smallest ratios, i.e. the words that most strongly point away
    # from this emoji (the printed label is kept unchanged).
    print("10 least important features: ")
    for coefid in coefSorted[:10]:
        print("{:<15} {:.4f}".format(train.vocab.GetWord(coefid), coef[coefid]))
    print()

Emoji Prediction by Naive Bayes with alpha= 0.1
score: 0.3926

For tag:  ('‚ù§', '_red_heart_')
10 most important features: 
Ô∏èny            9.0559
Ô∏ènyc           8.3861
Ô∏èu             7.6785
Ô∏èÔ∏è‚Ä¶            7.2890
Ô∏èla            7.0608
Ô∏èto            6.5982
edmodo         6.2814
Ô∏èny‚Ä¶           6.2173
#kfodiaries    6.2173
Ô∏èmiss          6.2173
10 least important features: 
:‚Ä¶             -5.8236
#govote        -5.7479
asf            -5.5696
).             -5.4859
#godblessamerica-5.4080
#electionday   -5.3944
#teamusa       -5.3806
#trump2016     -5.3806
#vote2016      -5.2785
sun!           -5.1297

For tag:  ('üòç', '_smiling_face_with_hearteyes_')
10 most important features: 
nlk            6.5157
#goodfoodfridays6.3839
tribez         6.3839
#aboutlastfriday6.3839
#b√∂hemia‚Ä¶      6.3839
#inkchurch     6.2321
#laurasboutique6.2321
allegiant      6.2321
#theamericandreamteam6.2321
#rawlsrealty   6.2321
10 least important features: 
Ô∏è              -8.3920
Ô∏

10 most important features: 
#nationalchristmastree7.8111
yuletide       7.6793
flockcityla    7.5275
objets         7.5275
#chirstmas     7.5275
#flockcity     7.5275
sugargrove     7.3485
decking        7.3485
#lifechangers  7.3485
#deckthehalls  7.3485
10 least important features: 
Ô∏è‚Ä¶             -6.8980
Ô∏è              -6.4197
Ô∏èÔ∏è             -6.2158
Ô∏è.             -5.6313
prom           -5.6218
#i             -5.5061
fuck           -5.4257
bitch          -5.2105
makeup         -5.1027
father's       -5.0805

For tag:  ('üì∏', '_camera_with_flash_')
10 most important features: 
#grigsby       7.7186
#mikeyshotya   7.5868
#widthisguy‚Ä¶   7.5868
#michaelgrigsby7.5868
#grigsgang     7.5868
#watarewegonnado7.5868
soymaico       7.5868
#thedstoneponyshow7.5868
#soreadytobang 7.4350
smartscott     7.4350
10 least important features: 
Ô∏è              -10.6231
Ô∏è‚Ä¶             -6.9905
Ô∏èÔ∏è             -6.3082
fl)            -5.8191
Ô∏è.             -5.7238
#i             -