In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB

from Emojidata import EMOJIdata

In [2]:
# Load the training split, then build the test split with the vocabulary taken
# from the training set (vocab=train.vocab).
# NOTE(review): presumably sharing the vocab keeps feature indices identical
# across both splits — confirm in Emojidata.EMOJIdata.
train = EMOJIdata("./data/us_train.text")
test = EMOJIdata("./data/us_test.text", vocab=train.vocab)

In [3]:
# Build the lookup {label id -> (emoji glyph, emoji name)} from the mapping file.
# Each non-empty line is whitespace-separated: <id> <glyph> <name>.
emojiMap = {}
# Explicit UTF-8 so the emoji decode the same way on every platform; the
# `with` block guarantees the file handle is closed.
with open("./data/us_mapping.txt", encoding="utf-8") as mappingFile:
    for rawLine in mappingFile:
        fields = rawLine.split()
        if not fields:
            continue  # tolerate blank / trailing lines instead of raising IndexError
        emojiMap[int(fields[0])] = (fields[1], fields[2])
emojiMap

{0: ('❤', '_red_heart_'),
 1: ('😍', '_smiling_face_with_hearteyes_'),
 2: ('😂', '_face_with_tears_of_joy_'),
 3: ('💕', '_two_hearts_'),
 4: ('🔥', '_fire_'),
 5: ('😊', '_smiling_face_with_smiling_eyes_'),
 6: ('😎', '_smiling_face_with_sunglasses_'),
 7: ('✨', '_sparkles_'),
 8: ('💙', '_blue_heart_'),
 9: ('😘', '_face_blowing_a_kiss_'),
 10: ('📷', '_camera_'),
 11: ('🇺🇸', '_United_States_'),
 12: ('☀', '_sun_'),
 13: ('💜', '_purple_heart_'),
 14: ('😉', '_winking_face_'),
 15: ('💯', '_hundred_points_'),
 16: ('😁', '_beaming_face_with_smiling_eyes_'),
 17: ('🎄', '_Christmas_tree_'),
 18: ('📸', '_camera_with_flash_'),
 19: ('😜', '_winking_face_with_tongue_')}

In [4]:
# Baseline: always predict the label that is most frequent in the training set,
# and report the fraction of test labels equal to it.
print("Most frequent tag is: ", emojiMap[train.mostFreqLbl])
print("Baseline by most frequent tag: ", np.count_nonzero(test.Y == train.mostFreqLbl)/test.Y.shape[0])

Most requent tag is:  ('❤', '_red_heart_')
Baseline by most frequent tag:  0.21596


In [5]:
# Sweep the additive-smoothing strength of Multinomial Naive Bayes and print
# the accuracy obtained for each value.
# NOTE(review): the score is computed on the test split, so alpha is being
# selected on test data; a held-out validation split would avoid that.
for alpha in (0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0):
    # fit() returns the estimator itself, so construction and training chain.
    clf = MultinomialNB(class_prior=None, fit_prior=True, alpha=alpha).fit(train.X, train.Y)
    print("Alpha = %.2f\tscore = %.5f" % (alpha, clf.score(test.X, test.Y)))

Alpha = 0.01	score = 0.36806
Alpha = 0.05	score = 0.38478
Alpha = 0.10	score = 0.38888
Alpha = 0.50	score = 0.34890
Alpha = 1.00	score = 0.30862
Alpha = 5.00	score = 0.24102
Alpha = 10.00	score = 0.23036


In [6]:
# Rank the vocabulary per emoji class by the fitted Naive Bayes model's
# smoothed log P(token | class), and print the 10 highest / 10 lowest tokens.
alpha = 0.1 # best test accuracy in the alpha sweep above
print("Emoji Prediction by Naive Bayes with alpha=", alpha)
clf = MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None)
clf.fit(train.X, train.Y)
# Row i of feature_log_prob_ belongs to clf.classes_[i]; map through classes_
# rather than assuming the row index equals the label id. feature_log_prob_ is
# used throughout because MultinomialNB.coef_ is deprecated (removed in newer
# scikit-learn) and merely mirrored these values.
for i, label in enumerate(clf.classes_):
    print("For tag: ", emojiMap[int(label)])
    logProbs = clf.feature_log_prob_[i]
    coefSorted = np.argsort(logProbs)  # ascending: lowest log-probability first
    print("10 most important features: ")
    for coefid in coefSorted[-1:-11:-1]:
        print("{:<15}{:>.4f}".format(train.vocab.GetWord(coefid), logProbs[coefid]))
    # Many tokens tie at the lowest value (see the identical scores in the
    # recorded output), so which 10 appear here is arbitrary among the ties.
    print("10 least important features: ")
    for coefid in coefSorted[:10]:
        print("{:<15}{:>.4f}".format(train.vocab.GetWord(coefid), logProbs[coefid]))
    print()

Emoji Prediction by Naive Bayes with alpha= 0.1
For tag:  ('❤', '_red_heart_')
10 most important features: 
️              -2.5937
@              -3.0097
the            -3.8616
my             -3.9121
@user          -4.1895
i              -4.2684
to             -4.3203
with           -4.5024
love           -4.5441
and            -4.5742
10 least important features: 
v'             -16.4123
"friday"       -16.4123
hubspot        -16.4123
cereal,        -16.4123
#meateaters    -16.4123
#setitandforgetit-16.4123
#slabbedup     -16.4123
cedarburg,     -16.4123
#newyorkyankees-16.4123
#bronxbombers  -16.4123

For tag:  ('😍', '_smiling_face_with_hearteyes_')
10 most important features: 
@              -3.0727
the            -3.8102
@user          -4.0322
my             -4.1167
i              -4.3815
to             -4.4139
in             -4.4493
this           -4.5458
a              -4.5464
and            -4.5810
10 least important features: 
v'             -15.6411
#theandrewsadventures…-15.6

10 most important features: 
@              -3.2688
the            -3.9302
@user          -4.1126
to             -4.4652
my             -4.6583
i              -4.6697
a              -4.7865
you            -4.8924
and            -4.9285
in             -4.9905
10 least important features: 
v'             -14.4861
chicagobulls#bullswin-14.4861
️#lacmalights  -14.4861
lacma.         -14.4861
#ralphie       -14.4861
ho-ey          -14.4861
•••proud       -14.4861
5.55           -14.4861
...#wedding    -14.4861
#citruspressco -14.4861

For tag:  ('😁', '_beaming_face_with_smiling_eyes_')
10 most important features: 
@              -3.4711
the            -3.9932
@user          -4.1934
to             -4.3604
a              -4.4674
in             -4.4787
i              -4.5501
my             -4.6045
and            -4.8273
for            -4.9031
10 least important features: 
v'             -14.5189
intentionz     -14.5189
sayed          -14.5189
#coolmomsclubsf-14.5189
rivet,         -14.5189
#ec

In [72]:
# Rebinds `train` / `test` to datasets built from the "*_stripped.text" files,
# replacing the objects loaded from the raw files earlier; the cells below read
# these new objects. The test split again reuses the training vocabulary.
# NOTE(review): how the stripped files were produced is not shown here —
# confirm the preprocessing before comparing scores with the raw-text run.
train = EMOJIdata("./data/us_train_stripped.text")
test = EMOJIdata("./data/us_test_stripped.text", vocab=train.vocab)

In [73]:
# Repeat the smoothing sweep on the currently loaded (stripped) data: one
# Multinomial Naive Bayes fit per alpha, accuracy printed for each.
# NOTE(review): accuracy is measured on the test split, so this picks alpha on
# test data; a validation split would give an unbiased choice.
for alpha in (0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0):
    clf = MultinomialNB(fit_prior=True, class_prior=None, alpha=alpha)
    clf.fit(X=train.X, y=train.Y)
    print("Alpha = %.2f\tscore = %.5f" % (alpha, clf.score(test.X, test.Y)))

Alpha = 0.01	score = 0.28272
Alpha = 0.05	score = 0.29720
Alpha = 0.10	score = 0.30518
Alpha = 0.50	score = 0.30780
Alpha = 1.00	score = 0.29102
Alpha = 5.00	score = 0.25048
Alpha = 10.00	score = 0.24016


In [66]:
# Rank the vocabulary per emoji class by the fitted Naive Bayes model's
# smoothed log P(token | class), and print the 10 highest / 10 lowest tokens.
# NOTE(review): alpha = 0.1 was the best value on the raw-text run; in the
# recorded sweep on the stripped data 0.5 scores slightly higher (0.30780 vs
# 0.30518). The value is kept so the recorded output below stays valid.
alpha = 0.1
print("Emoji Prediction by Naive Bayes with alpha=", alpha)
clf = MultinomialNB(alpha=alpha, fit_prior=True, class_prior=None)
clf.fit(train.X, train.Y)
# Row i of feature_log_prob_ belongs to clf.classes_[i]; map through classes_
# rather than assuming the row index equals the label id. feature_log_prob_
# replaces MultinomialNB.coef_, which is deprecated (removed in newer
# scikit-learn) and merely mirrored these values.
for i, label in enumerate(clf.classes_):
    print("For tag: ", emojiMap[int(label)])
    logProbs = clf.feature_log_prob_[i]
    coefSorted = np.argsort(logProbs)  # ascending: lowest log-probability first
    print("10 most important features: ")
    for coefid in coefSorted[-1:-11:-1]:
        print("{:<15}{:>.4f}".format(train.vocab.GetWord(coefid), logProbs[coefid]))
    # Many tokens tie at the lowest value (see the identical scores in the
    # recorded output), so which 10 appear here is arbitrary among the ties.
    print("10 least important features: ")
    for coefid in coefSorted[:10]:
        print("{:<15}{:>.4f}".format(train.vocab.GetWord(coefid), logProbs[coefid]))
    print()

Emoji Prediction by Naive Bayes with alpha= 0.1
For tag:  ('❤', '_red_heart_')
10 most important features: 
user           -3.6587
love           -3.8191
the            -4.5697
happy          -4.7102
new            -4.8344
amp            -4.9301
day            -5.0240
best           -5.2249
birthday       -5.3406
family         -5.3644
10 least important features: 
kuchar         -15.8815
smilingforever -15.8815
smilingforthecamera-15.8815
gypsystone     -15.8815
judokarate     -15.8815
nnkvoting      -15.8815
democracyinaction-15.8815
zekepapi       -15.8815
iamcreative    -15.8815
xnilax         -15.8815

For tag:  ('😍', '_smiling_face_with_hearteyes_')
10 most important features: 
user           -3.6000
love           -4.3480
the            -4.5140
new            -4.7643
amp            -5.0492
beautiful      -5.2571
day            -5.4069
today          -5.4869
this           -5.5008
night          -5.5430
10 least important features: 
kuchar         -15.2089
zeroheyes      -15.2089

10 most important features: 
user           -2.6700
the            -4.7526
new            -4.9737
photo          -5.5093
amp            -5.5307
york           -5.6389
day            -5.6747
park           -5.7406
night          -5.7553
california     -5.7627
10 least important features: 
kuchar         -14.0620
mattmcqueeny   -14.0620
hiiigheye      -14.0620
ortons         -14.0620
niiiiice       -14.0620
doublehead     -14.0620
twod           -14.0620
crossfitproper -14.0620
uzbekstyle     -14.0620
emazinglights  -14.0620

For tag:  ('😜', '_winking_face_with_tongue_')
10 most important features: 
user           -3.9216
the            -4.9484
new            -5.2446
amp            -5.2974
day            -5.3258
night          -5.6602
good           -5.6625
time           -5.7086
happy          -5.7211
like           -5.7414
10 least important features: 
kuchar         -14.0104
montagemountain-14.0104
goodpop        -14.0104
mattapan       -14.0104
vintagequest   -14.0104
divalasvegas   

In [11]:
from sklearn.svm import LinearSVC

In [12]:
# Linear SVM with a fixed seed for repeatable fits and a tighter stopping
# tolerance; all other settings stay at the library defaults.
clf = LinearSVC(tol=1e-5, random_state=0)

In [13]:
# Train on the currently loaded training split; as the last expression of the
# cell, the fitted estimator's repr is what gets displayed.
clf.fit(X=train.X, y=train.Y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=0)

In [20]:
# Rank the vocabulary per emoji class by the linear SVM weight of each token
# (one weight row per class, one-vs-rest) and print the 10 largest / smallest.
# Assumes the multi-class case, where coef_ has one row per entry of classes_.
for row, weights in enumerate(clf.coef_):
    # Row `row` of coef_ belongs to clf.classes_[row]; look the emoji up by the
    # actual class label instead of assuming the row index equals the label id.
    label = clf.classes_[row]
    print("For tag: ", emojiMap[int(label)])
    order = np.argsort(weights)  # ascending: most negative weight first
    print("10 most important features: ")
    for featureId in order[-1:-11:-1]:
        print("{:<15}{:>.4f}".format(train.vocab.GetWord(featureId), weights[featureId]))
    print("10 least important features: ")
    for featureId in order[:10]:
        print("{:<15}{:>.4f}".format(train.vocab.GetWord(featureId), weights[featureId]))
    print()

For tag:  ('❤', '_red_heart_')
10 most important features: 
#matchingneonpinkpants3.8588
carvings       3.1957
tulocay        3.1368
#usteam        2.7261
venus,         2.6703
deary...       2.5863
elevationuptown2.4460
#watersunsets  2.4335
feeler         2.4329
alabama?"      2.4270
10 least important features: 
filter!!       -10.5073
golsa          -10.5073
chocolatey-date-9.6249
rasors.        -8.1100
toasty         -7.8548
️#ekbinteriors…-7.7005
colors!!!!     -7.7005
steinhardt,    -6.3195
(boardwalk),   -6.3085
️...#chicago   -5.9260

For tag:  ('😍', '_smiling_face_with_hearteyes_')
10 most important features: 
hawi           2.2745
laila's        2.2490
l_ratz8615     2.2212
#forher        2.1581
renee's        2.1235
hallsontheriver2.0501
#traveltips    1.9334
tholos         1.8982
coe…           1.8940
#jhonniblaze   1.8861
10 least important features: 
️              -2.3413
️…             -1.8515
️!             -1.6816
espresso!      -1.5511
#lifeamplifiedworldtour…-1.521

10 most important features: 
#thelemoyneowencollege…1.9741
mattramey10    1.8246
obsidian       1.8005
#jimmyhayes    1.7080
#dioneclan     1.6832
champagnejarvi 1.6764
party...aka...sparkle1.6726
correctly,     1.6602
#lyricswithmeaning1.6564
#steve…        1.6480
10 least important features: 
️              -1.2687
:…             -1.2037
southcliff     -1.1572
bni            -1.1060
sweeterville   -1.0986
dallas/fort    -1.0888
quincy         -1.0877
(bwi))         -1.0841
zeus'          -1.0820
treatment      -1.0786

For tag:  ('🎄', '_Christmas_tree_')
10 most important features: 
#family2nd     5.3115
#god1st        2.8872
swills         2.0968
butterflies?   1.9941
#chrismastree  1.8854
#xmas2015      1.8721
arrived...us   1.8559
crusade        1.8471
gallipolis     1.8215
alexis!…       1.8215
10 least important features: 
dylanventura   -2.7457
️              -2.5313
#party…        -2.5123
owasso         -2.1519
l.l.           -2.1475
️…             -2.0405
ellms          -1.97