<a href="https://colab.research.google.com/github/chcorophyll/my_deeplearning_cookbook/blob/master/my_Emoji_Suggestions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!git clone https://github.com/chcorophyll/deep_learning_cookbook.git

fatal: destination path 'deep_learning_cookbook' already exists and is not an empty directory.


In [2]:
!ls

deep_learning_cookbook	sample_data


In [0]:
import os

path_org = os.getcwd()
data_path = os.path.join(path_org, "deep_learning_cookbook")
os.chdir(data_path)

**Trying out a simple learner**

In [4]:
import pandas as pd
from keras.utils.data_utils import get_file
import nb_utils

emotion_csv = get_file("tetx_emotion.csv", 
                       "https://www.crowdflower.com/wp-content/uploads/2016/07/text_emotion.csv")
emotion_df = pd.read_csv(emotion_csv)

Using TensorFlow backend.


Downloading data from https://www.crowdflower.com/wp-content/uploads/2016/07/text_emotion.csv


In [5]:
emotion_df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [6]:
emotion_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 4 columns):
tweet_id     40000 non-null int64
sentiment    40000 non-null object
author       40000 non-null object
content      40000 non-null object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [7]:
emotion_df["sentiment"].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [0]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

VOCAB_SIZE = 50000

tfidf_vec = TfidfVectorizer(max_features=VOCAB_SIZE)
label_encoder = LabelEncoder()

X = tfidf_vec.fit_transform(emotion_df["content"])
y = label_encoder.fit_transform(emotion_df["sentiment"])

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

In [9]:
print(X.shape)
X[0:10]

(40000, 48212)


<10x48212 sparse matrix of type '<class 'numpy.float64'>'
	with 103 stored elements in Compressed Sparse Row format>

In [10]:
y[0:10]

array([ 2, 10, 10,  3,  8, 12, 10, 12, 10, 10])

In [11]:
bayes = MultinomialNB()
bayes.fit(X_train, y_train)
predictions = bayes.predict(X_test)
precision_score(predictions, y_test, average="micro")

0.2802272727272727

In [12]:
classifiers = {"sgd": SGDClassifier(loss="hinge"),
               "svm": SVC(), 
               "random_forest": RandomForestClassifier()}

for lbl, clf in classifiers.items():
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(lbl, precision_score(predictions, y_test, average="micro"))

sgd 0.32681818181818184




svm 0.21863636363636363




random_forest 0.2721969696969697


**Checking what our model learned**

In [13]:
len(tfidf_vec.vocabulary_)

48212

In [0]:
from scipy.sparse import eye

d = eye(len(tfidf_vec.vocabulary_))
word_pred = bayes.predict_proba(d)
inverse_vocab = {idx: word for word, idx in tfidf_vec.vocabulary_.items()}

In [15]:
word_pred.shape

(48212, 13)

In [0]:
from collections import Counter, defaultdict

by_cls = defaultdict(Counter)
for word_idx, pred in enumerate(word_pred):
    for class_idx, score in enumerate(pred):
        cls = label_encoder.classes_[class_idx]
        by_cls[cls][inverse_vocab[word_idx]] = score

In [17]:
for k in by_cls:
    words = [x[0] for x in by_cls[k].most_common(5)]
    print(k, ':', ' '.join(words))

anger : confuzzled fridaaaayyyyy aaaaaaaaaaa transtelecom filthy
boredom : squeaking ouuut cleanin sooooooo candyland3
empty : _cheshire_cat_ bethsybsb conversating kimbermuffin less_than_3
enthusiasm : lena_distractia foolproofdiva attending krisswouldhowse tatt
fun : xbox bamboozle sanctuary oldies toodaayy
happiness : excited woohoo excellent yay wars
hate : hate hates suck fucking zomberellamcfox
love : love mothers mommies moms loved
neutral : www painting souljaboytellem link frenchieb
relief : finally relax mastered relief inspiration
sadness : sad sadly cry cried miss
surprise : surprise wow surprised wtf surprisingly
worry : worried poor throat hurts sick


**Training a deep model**

In [18]:
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
import numpy as np

chars = list(sorted(set(chain(*emotion_df["content"]))))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
max_sequence_len = max(len(x) for x in emotion_df["content"])

char_vectors = []
for txt in emotion_df["content"]:
    vec = np.zeros((max_sequence_len, len(char_to_idx)))
    vec[np.arange(len(txt)), [char_to_idx[ch] for ch in txt]] = 1
    char_vectors.append(vec)
    
char_vectors = np.asarray(char_vectors)
char_vectors = pad_sequences(char_vectors)
labels = label_encoder.transform(emotion_df["sentiment"])

def split(lst):
    training_count = int(0.9*len(char_vectors))
    return lst[:training_count], lst[training_count:]

training_char_vectors, test_char_vectors = split(char_vectors)
training_labels, test_labels = split(labels)

char_vectors.shape

(40000, 167, 100)

In [19]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from keras.models import Model
from keras.layers import Concatenate
from keras import regularizers

def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name="input")
    conv_1x = Conv1D(128, 6, activation="relu", padding="valid")(char_input)
    max_pool_1x = MaxPooling1D(6)(conv_1x)
    conv_2x = Conv1D(256, 6, activation="relu", padding="valid")(max_pool_1x)
    max_pool_2x = MaxPooling1D(6)(conv_2x)
    
    flatten = Flatten()(max_pool_2x)
    dense = Dense(128, activation="relu", kernel_regularizer=regularizers.l2(0.01))(flatten)
    preds = Dense(num_labels, activation="softmax")(dense)
    
    model = Model(char_input, preds)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"])
    return model

char_cnn_model = create_char_cnn_model(len(char_to_idx), char_vectors.shape[1], len(label_encoder.classes_))
char_cnn_model.summary()

W0628 23:32:35.485127 139674020661120 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0628 23:32:35.527176 139674020661120 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0628 23:32:35.529573 139674020661120 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0628 23:32:35.563861 139674020661120 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0628 23:32:35.637488 139674020661120 deprecation_wrapp

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 167, 100)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 162, 128)          76928     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 27, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 22, 256)           196864    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 3, 256)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               98432     
__________

In [20]:
char_cnn_model.fit(training_char_vectors, training_labels, epochs=20, batch_size=1024)
char_cnn_model.evaluate(test_char_vectors, test_labels)

W0628 23:32:35.784398 139674020661120 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0628 23:32:35.878913 139674020661120 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[2.053533175468445, 0.31175]

In [21]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from keras.models import Model
from keras.layers import Concatenate
from keras import regularizers

def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name="input")
    
    layers = []
    
    for window in (5, 6, 7):
        conv_1x = Conv1D(128, window, activation="relu", padding="valid")(char_input)
        max_pool_1x = MaxPooling1D(window)(conv_1x)
        dropout_1x = Dropout(0.3)(max_pool_1x)
        conv_2x = Conv1D(128, window, activation="relu", padding="valid")(dropout_1x)
        max_pool_2x = MaxPooling1D(window)(conv_2x)
        dropout_2x = Dropout(0.3)(max_pool_2x)
        layers.append(dropout_2x)
        
    if len(layers) > 1:
        merged = Concatenate(axis=1)(layers)
    else:
        merged = layers[0]
        
    dropout = Dropout(0.3)(merged)
    
    flatten = Flatten()(dropout)
    dense = Dense(128, activation="relu")(flatten)
    preds = Dense(num_labels, activation="softmax")(dense)
    model = Model(char_input, preds)
    model.compile(loss="sparse_categorical_crossentropy", optimizer='rmsprop',  metrics=['acc'])
    return model

char_cnn_model = create_char_cnn_model(len(char_to_idx), char_vectors.shape[1], len(label_encoder.classes_))
char_cnn_model.summary()

W0628 23:33:47.505040 139674020661120 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 167, 100)     0                                            
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 163, 128)     64128       input[0][0]                      
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 162, 128)     76928       input[0][0]                      
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 161, 128)     89728       input[0][0]                      
__________________________________________________________________________________________________
max_poolin

In [22]:
char_cnn_model.fit(training_char_vectors, training_labels, epochs=20, batch_size=1024)
char_cnn_model.evaluate(test_char_vectors, test_labels)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[1.9519147100448608, 0.3565]

**Featurizing and preparing our data**

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot

VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=VOCAB_SIZE) # a dict key = word value= count  maxz-index = num_w
tokenizer.fit_on_texts(emotion_df["content"]) # 

In [24]:
# This may take a while to load

w2v, idf = nb_utils.load_w2v(tokenizer)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
tokens = tokenizer.texts_to_sequences(emotion_df["content"]) # return matrix shape [len(emotion_df["content"]), len(emotion_df["content"][i])]
tokens = pad_sequences(tokens)
training_count = int(0.9*len(tokens))
training_tokens, training_labels = tokens[:training_count], labels[:training_count]
test_tokens, test_labels = tokens[training_count:], labels[training_count:]

In [26]:
from keras import layers, models
import keras.backend as K

def make_embedding(name, vocab_size, embedding_size, weights=None, mask_zero=True):
    if weights is not None:
        return layers.Embedding(mask_zero=mask_zero,
                                input_dim=vocab_size, 
                                output_dim=weights.shape[1],
                                weights=[weights],
                                trainable=False,
                                name="%s/embedding" % name)
    else:
        return layers.Embedding(mask_zero=mask_zero,
                                input_dim=vocab_size, 
                                output_dim=embedding_size,
                                name="%s/embedding" % name)
    
def create_unigram_model(vocab_size, embedding_size=None, embedding_weights=None, idf_weights=None):
    assert not (embedding_size is None and embedding_weights is None)
    message = layers.Input(shape=(None,), dtype="int32", name="message")
    
    embedding = make_embedding("message_vec", vocab_size, embedding_size, embedding_weights)
    idf = make_embedding("message_idf", vocab_size, embedding_size, idf_weights)
    mask = layers.Masking(mask_value=0)
    
    def _combine_and_sum(args):
        embedding, idf = args
        return K.sum(embedding*K.abs(idf), axis=1) # 实际张量是（None，None, 300)*（None，None, 1)
    
    sum_layer = layers.Lambda(_combine_and_sum, name="combine_and_sum")
    sum_msg = sum_layer([mask(embedding(message)), idf(message)])
    fcl = layers.Dense(units=128, activation="relu")(sum_msg)
    categories = layers.Dense(units=len(label_encoder.classes_), activation="softmax")(fcl)
    
    model = models.Model(inputs=[message], outputs=categories,)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.summary()
    return model

unigram_model = create_unigram_model(vocab_size=VOCAB_SIZE,
                                     embedding_weights=w2v,
                                     idf_weights=idf)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
message (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 300)    15000000    message[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 300)    0           message_vec/embedding[0][0]      
__________________________________________________________________________________________________
message_idf/embedding (Embeddin (None, None, 1)      50000       message[0][0]                    
__________________________________________________________________________________________________
combine_an

In [27]:
unigram_model.fit(training_tokens, training_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f080cc6b710>

In [28]:
unigram_model.evaluate(test_tokens, test_labels, verbose=2)

[2.382763785362244, 0.32375]

**Learning Embeddings**

In [29]:
learned_embeddings_model = create_unigram_model(vocab_size=VOCAB_SIZE, embedding_size=25)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
message (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 25)     1250000     message[0][0]                    
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None, 25)     0           message_vec/embedding[0][0]      
__________________________________________________________________________________________________
message_idf/embedding (Embeddin (None, None, 25)     1250000     message[0][0]                    
__________________________________________________________________________________________________
combine_an

In [30]:
learned_embeddings_model.fit(training_tokens, training_labels, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f07b00a7eb8>

In [31]:
# Note the test set accuracy is lower than that on the training set.

learned_embeddings_model.evaluate(test_tokens, test_labels, verbose=2)

[1.9881738328933716, 0.362]

**More Complex Models**

As with our previous task, we can try using more powerful models to classify our text. In this case, the limited training data and text size limit their effectiveness.

In [0]:
def create_cnn_model(vocab_size, embedding_size=None, embedding_weights=None):
    message = layers.Input(shape=(None, ), dtype="int32", name="title")
    # The convolution layer in keras does not support masking, so we just allow
    # the embedding layer to learn an explicit value.
    embedding = make_embedding("message_vec", vocab_size, 
                               embedding_size, embedding_weights, 
                               mask_zero=False)
    def _combine_sum(v):
        return K.sum(v, axis=1)
    
    cnn_1 = layers.Convolution1D(128, 3)
    cnn_2 = layers.Convolution1D(128, 3)
    cnn_3 = layers.Convolution1D(128, 3)
    
    global_pool = layers.GlobalMaxPooling1D()
    local_pool = layers.MaxPooling1D(strides=1, pool_size=3)
    
    cnn_encoding = global_pool(cnn_3(local_pool(cnn_2(local_pool(cnn_1(embedding(message)))))))
                                 

    fcl = layers.Dense(units=128, activation="elu")(cnn_encoding)
    categories = layers.Dense(units=len(label_encoder.classes_), activation="softmax")(fcl)
    model = models.Model(
        inputs=[message],
        outputs=[categories],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model
                               
                               

In [33]:
cnn_model = create_cnn_model(VOCAB_SIZE, embedding_weights=w2v)
cnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 300)    15000000    title[0][0]                      
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, None, 128)    115328      message_vec/embedding[0][0]      
__________________________________________________________________________________________________
max_pooling1d_9 (MaxPooling1D)  (None, None, 128)    0           conv1d_9[0][0]                   
                                                                 conv1d_10[0][0]                  
__________

In [34]:
cnn_model.fit(training_tokens, training_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f07b0b124e0>

In [35]:
cnn_model.evaluate(test_tokens, test_labels)



[2.6002593450546265, 0.31425]

In [0]:
def create_lstm_model(vocab_size, embedding_size=None, embedding_weights=None):
    message = layers.Input(shape=(None,), dtype='int32', name='title')
    embedding = make_embedding('message_vec', vocab_size, embedding_size, embedding_weights)(message)
#     mask = layers.Masking(mask_value=0)(embedding)
    lstm_1 = layers.LSTM(units=128, return_sequences=False)(embedding)
#     lstm_2 = layers.LSTM(units=128, return_sequences=False)(lstm_1)
    category = layers.Dense(units=len(label_encoder.classes_), activation='softmax')(lstm_1)
    
    model = models.Model(
        inputs=[message],
        outputs=[category],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [37]:
lstm_model = create_lstm_model(VOCAB_SIZE, embedding_weights=w2v)
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
title (InputLayer)           (None, None)              0         
_________________________________________________________________
message_vec/embedding (Embed (None, None, 300)         15000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_11 (Dense)             (None, 13)                1677      
Total params: 15,221,325
Trainable params: 221,325
Non-trainable params: 15,000,000
_________________________________________________________________


In [38]:
lstm_model.fit(training_tokens, training_labels, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0661dd5278>

In [39]:
lstm_model.evaluate(test_tokens, test_labels)



[1.9042014083862304, 0.38075]

**Comparing our models**

In [0]:
predictions = {"lstm": lstm_model.predict(test_tokens[:100]),
               "char_cnn": char_cnn_model.predict(test_char_vectors[:100]), 
               "cnn": cnn_model.predict(test_tokens[:100]), 
               "unigram": unigram_model.predict(test_tokens[:100])}
    

In [41]:
# Make a dataframe just for test data

pd.options.display.max_colwidth = 128
test_df = emotion_df[training_count:training_count+100].reset_index()
eval_df = pd.DataFrame({
    'content': test_df['content'],
    'true': test_df['sentiment'],
    'lstm': [label_encoder.classes_[np.argmax(x)] for x in predictions['lstm']],
    'cnn': [label_encoder.classes_[np.argmax(x)] for x in predictions['cnn']],
    'char_cnn': [label_encoder.classes_[np.argmax(x)] for x in predictions['char_cnn']],    
    'unigram': [label_encoder.classes_[np.argmax(x)] for x in predictions['unigram']],
})
eval_df = eval_df[['content', 'true', 'lstm', 'cnn', 'char_cnn', 'unigram']]
eval_df.head(10)

Unnamed: 0,content,true,lstm,cnn,char_cnn,unigram
0,HAPPY MOTHER'S DAY to all of the wonderful women out there. Have a great and relaxful day.,happiness,love,love,love,love
1,"browsing thru adopting agencies, i'm gonna get some exotic kids",enthusiasm,neutral,neutral,happiness,happiness
2,"I am tired of my phone. Walkman works like a charm, but l need better video and wap really. Thanks for yesterday and for buy...",love,worry,happiness,happiness,happiness
3,Happy Mother's Day to all the Mommiessss,love,love,love,love,love
4,@mattgarner haha what's up Matt ?,happiness,neutral,happiness,neutral,happiness
5,What's up!!? @guillermop,neutral,neutral,worry,neutral,neutral
6,@KandyBee we shuld do a dance like that its seriously the best thing haha. see yu tomoro.,fun,happiness,happiness,happiness,happiness
7,@TravelTweetie I will go to sleep now. Might be awakened early w/breakfast tray from my 'spark' &amp; my 'joper' w/their Dad...,happiness,neutral,worry,happiness,happiness
8,@nak1a &quot;If there's a camel up a hill&quot; and &quot;I'll give you plankton&quot; ....HILARIOUS!!,happiness,happiness,love,happiness,neutral
9,@Bern_morley LOL I love your kids,love,love,love,love,happiness


**Qualitative Evaluation**

In [42]:
eval_df[eval_df['lstm'] != eval_df['true']].head(10)

Unnamed: 0,content,true,lstm,cnn,char_cnn,unigram
0,HAPPY MOTHER'S DAY to all of the wonderful women out there. Have a great and relaxful day.,happiness,love,love,love,love
1,"browsing thru adopting agencies, i'm gonna get some exotic kids",enthusiasm,neutral,neutral,happiness,happiness
2,"I am tired of my phone. Walkman works like a charm, but l need better video and wap really. Thanks for yesterday and for buy...",love,worry,happiness,happiness,happiness
4,@mattgarner haha what's up Matt ?,happiness,neutral,happiness,neutral,happiness
6,@KandyBee we shuld do a dance like that its seriously the best thing haha. see yu tomoro.,fun,happiness,happiness,happiness,happiness
7,@TravelTweetie I will go to sleep now. Might be awakened early w/breakfast tray from my 'spark' &amp; my 'joper' w/their Dad...,happiness,neutral,worry,happiness,happiness
10,@davecandoit dude that honest to god happens to me all the time.. minus the trail mix.,sadness,neutral,worry,worry,worry
12,Happy Mother's Day to the tweetin' mamas Nite tweeple!,worry,love,love,love,happiness
13,On my way home...then SLEEP! Seeing Amber Pacific tomorow with the besties,happiness,neutral,happiness,happiness,happiness
14,@xoMusicLoverxo I'm using it in a story. I actually already wrote it but have to write the chapters before it.,relief,neutral,surprise,love,neutral


**Analyzing Tweets**

In [10]:
!pip install twitter



In [11]:
!pip install emoji



In [0]:
import random
import twitter
import emoji
import itertools
import pandas as pd
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
import keras.callbacks
import json

import os
import nb_utils
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Embedding, GlobalMaxPooling1D
from keras.models import Model
from keras.layers import Concatenate, Average

from gensim.models import Word2Vec

In [0]:
# Fill these in!

CONSUMER_KEY = 'xbMuxcJpRTiVGt2C2EYnA'
CONSUMER_SECRET = '2DbQTsvIptkPTdaUcos8DDvQH9fzO0hNjJpUT2uVzQ'
ACCESS_TOKEN = '7319442-EDm4CPxL7W4KkZcGWRMJNVHp88W5OH9vgblu898fg'
ACCESS_SECRET = '5ZxJSbqXhG7uhgXzTFWf9XhkfsxxinlPRXyDTzbA9w'

In [23]:
auth=twitter.OAuth(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    token=ACCESS_TOKEN,
    token_secret=ACCESS_SECRET,
)

status_stream = twitter.TwitterStream(auth=auth).statuses

[x['text'] for x in itertools.islice(status_stream.sample(), 0, 5) if x.get('text')]

['RT @SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile \n\n https://t.co/FZ2YWcbUwM',
 '@tanamongeau JDHDUDIBD',
 '@ActuallyDice UwU I would also like cuddles from you too']

In [48]:
tatus_stream = twitter.TwitterStream(auth=auth).statuses

def english_has_emoji(tweet):
    if tweet.get('lang') != 'en':
        return False
    return any(ch for ch in tweet.get('text', '') if ch in emoji.UNICODE_EMOJI)

%time tweets = list(itertools.islice(filter(english_has_emoji, status_stream.sample()), 0, 100))

CPU times: user 1.51 s, sys: 307 ms, total: 1.82 s
Wall time: 55.9 s


In [49]:
stripped = []
for tweet in tweets:
    text = tweet['text']
    emojis = {ch for ch in text if ch in emoji.UNICODE_EMOJI}
    if len(emojis) == 1:
        emoiji = emojis.pop()
        text = ''.join(ch for ch in text if ch != emoiji)
        stripped.append((text, emoiji))
len(stripped)

63

**Using the CNN**

In [50]:
all_tweets = pd.read_csv("data/emojis.csv")
all_tweets.head()

Unnamed: 0,text,emoji
0,@ATLHawks: Chance The Rapper or Kent Bazemore? #Chance3 #ColoringBook #Twins,🤔
1,"@nice_aju: Yup we love you, you're so precious #WeLoveYouHoseok",💙
2,Fav Sing Me to Sleep by Alan Walker,💛
3,@AshBenzo: Wife From The Real-Life 'Fault In Our Stars' Couple Dies 5 Days After Her Husband,💔
4,Why am I up so late,😔


In [51]:
all_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806203 entries, 0 to 806202
Data columns (total 2 columns):
text     806203 non-null object
emoji    806203 non-null object
dtypes: object(2)
memory usage: 12.3+ MB


In [52]:
all_tweets["emoji"].value_counts()

😂    124823
❤     43218
😍     40566
😭     35714
😊     20076
🙄     17963
😩     16232
🔥     15453
🤔     15419
💕     12026
💯     11783
😘     11065
💀      9928
✨      9886
🙃      9405
👀      7842
😒      7019
☺      6871
😢      6846
😳      6716
💙      6616
😎      6349
😉      6272
😅      6133
😁      6010
😌      5759
😏      5623
💖      5331
😔      5244
😴      4999
      ...  
🏬         1
🏤         1
🤡         1
🚸         1
🏣         1
㊙         1
🔏         1
🦈         1
🏦         1
📂         1
🔀         1
🕣         1
📇         1
🤠         1
🕡         1
↩         1
📳         1
🏗         1
🈹         1
👘         1
🎑         1
📭         1
🔣         1
🛐         1
🕍         1
🕜         1
🈚         1
🗂         1
🥇         1
🎚         1
Name: emoji, Length: 989, dtype: int64

In [53]:
tweets = all_tweets.groupby("emoji").filter(lambda c: len(c) > 1000)
tweets.head()

Unnamed: 0,text,emoji
0,@ATLHawks: Chance The Rapper or Kent Bazemore? #Chance3 #ColoringBook #Twins,🤔
1,"@nice_aju: Yup we love you, you're so precious #WeLoveYouHoseok",💙
2,Fav Sing Me to Sleep by Alan Walker,💛
3,@AshBenzo: Wife From The Real-Life 'Fault In Our Stars' Couple Dies 5 Days After Her Husband,💔
4,Why am I up so late,😔


In [54]:
tweets['emoji'].value_counts()

😂    124823
❤     43218
😍     40566
😭     35714
😊     20076
🙄     17963
😩     16232
🔥     15453
🤔     15419
💕     12026
💯     11783
😘     11065
💀      9928
✨      9886
🙃      9405
👀      7842
😒      7019
☺      6871
😢      6846
😳      6716
💙      6616
😎      6349
😉      6272
😅      6133
😁      6010
😌      5759
😏      5623
💖      5331
😔      5244
😴      4999
      ...  
✌      1523
📸      1496
🎤      1487
🌚      1452
👅      1431
🏈      1373
🌟      1355
⏩      1332
❗      1325
🔴      1304
☕      1296
👊      1273
👇      1259
❣      1254
🎧      1246
🎈      1210
⏭      1198
💫      1181
↪      1157
🤑      1152
⚽      1141
😹      1112
😶      1108
💦      1075
😣      1074
😥      1072
🙁      1066
🤕      1065
😰      1013
☀      1013
Name: emoji, Length: 121, dtype: int64

In [55]:
max(tweets["text"], key=lambda t: len(t))

"Don't worry, my love, I don't get you wrong Don't get me wrong. We're connected with our hearts, souls, minds, bodies. If any doubt, we ask"

In [0]:
chars = list(sorted(set(chain(*tweets['text']))))
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
max_sequence_len = max(len(x) for x in tweets['text'])

emojis = list(sorted(set(tweets['emoji'])))
emoji_to_idx = {em: idx for idx, em in enumerate(emojis)}
emojis[:10]

train_tweets, test_tweets = train_test_split(tweets, test_size=0.1)

In [57]:
print(len(chars), len(emojis))

96 121


In [58]:
def data_generator(tweets, batch_size):
    while True:
        if batch_size is None:
            batch = tweets
            batch_size = batch.shape[0]
        else:
            batch = tweets.sample(batch_size)
        X = np.zeros((batch_size, max_sequence_len, len(chars)))
        y = np.zeros((batch_size,))
        for row_idx, (_, row) in enumerate(batch.iterrows()):
            y[row_idx] = emoji_to_idx[row["emoji"]]
            for ch_idx, ch in enumerate(row["text"]):
                X[row_idx, ch_idx, char_to_idx[ch]] = 1
        yield X, y
        
next(data_generator(tweets, 10)) [0]      

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [59]:
next(data_generator(tweets, 10))[0].shape

(10, 139, 96)

In [0]:
def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars),name="char_cnn_input")
    conv_1x = Conv1D(128, 6, activation="relu", padding="valid")(char_input)
    max_pool_1x = MaxPooling1D(4)(conv_1x)
    conv_2x = Conv1D(256, 6, activation="relu", padding="valid")(max_pool_1x)
    max_pool_2x = MaxPooling1D(4)(conv_2x)
    
    flatten = Flatten()(max_pool_2x)
    dense = Dense(128, activation="relu")(flatten)
    preds = Dense(num_labels, activation="softmax", name="char_cnn_preductins")(dense)
    
    model = Model(char_input, preds)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"])
    
    return model

In [61]:
char_cnn_model = create_char_cnn_model(len(char_to_idx), max_sequence_len, len(emojis))
char_cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_cnn_input (InputLayer)  (None, 139, 96)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 134, 128)          73856     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 33, 128)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 28, 256)           196864    
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 7, 256)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 1792)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 128)               229504    
__________

In [62]:
early = keras.callbacks.EarlyStopping(monitor="loss", min_delta=0.03, 
                                      patience=2, verbose=0, mode="auto")

BATCH_SIZE = 512
char_cnn_model.fit_generator(data_generator(train_tweets, batch_size=BATCH_SIZE),
                             epochs=20, 
                             steps_per_epoch=len(train_tweets)/BATCH_SIZE, 
                             verbose=2, callbacks=[early])


Epoch 1/20
 - 147s - loss: 3.5995 - acc: 0.2206
Epoch 2/20
 - 146s - loss: 3.2039 - acc: 0.2879
Epoch 3/20
 - 144s - loss: 3.0022 - acc: 0.3277
Epoch 4/20
 - 144s - loss: 2.8538 - acc: 0.3581
Epoch 5/20
 - 144s - loss: 2.7429 - acc: 0.3802
Epoch 6/20
 - 144s - loss: 2.6571 - acc: 0.3965
Epoch 7/20
 - 145s - loss: 2.5914 - acc: 0.4093
Epoch 8/20
 - 146s - loss: 2.5290 - acc: 0.4213
Epoch 9/20
 - 146s - loss: 2.4795 - acc: 0.4296
Epoch 10/20
 - 143s - loss: 2.4300 - acc: 0.4379
Epoch 11/20
 - 144s - loss: 2.3969 - acc: 0.4435
Epoch 12/20
 - 143s - loss: 2.3575 - acc: 0.4513
Epoch 13/20
 - 144s - loss: 2.3283 - acc: 0.4569
Epoch 14/20
 - 145s - loss: 2.2962 - acc: 0.4622
Epoch 15/20
 - 147s - loss: 2.2752 - acc: 0.4647
Epoch 16/20
 - 146s - loss: 2.2526 - acc: 0.4692
Epoch 17/20
 - 145s - loss: 2.2359 - acc: 0.4711
Epoch 18/20
 - 143s - loss: 2.2148 - acc: 0.4747
Epoch 19/20
 - 143s - loss: 2.1995 - acc: 0.4771
Epoch 20/20
 - 143s - loss: 2.1807 - acc: 0.4803


<keras.callbacks.History at 0x7f07b0ad3198>

In [63]:
char_cnn_model.evaluate_generator(data_generator(test_tweets, batch_size=BATCH_SIZE), 
                                  steps=len(test_tweets) / BATCH_SIZE)

[3.3476517373237056, 0.35964390851449274]

In [0]:
with open("zoo/07/emoji_chars.json", "w") as fout:
    json.dump({"emojis": "".join(emojis),
               "char_to_idx": char_to_idx, 
               "max_sequence_len": max_sequence_len,},
              fout)
    
char_cnn_model.save("zoo/07/char_cnn_model.h5")
char_cnn_model.save_weights("zoo/07/char_cnn_model_weights.h5")

In [65]:
pd.options.display.max_colwidth = 128
inspect_tweets = test_tweets.sample(100)
predicted = char_cnn_model.predict_generator(data_generator(inspect_tweets, batch_size=None), steps=1)
show = pd.DataFrame({"text": inspect_tweets["text"], 
                     "true": inspect_tweets["emoji"], 
                     "pred": [emojis[np.argmax(x)] for x in predicted],})
show = show[["text", "true", "pred"]]
show.head()

Unnamed: 0,text,true,pred
229070,"@v_sizzle LOLLL ""oh you're engaged? Congrats!"" No, it's a promise ring....",🙃,👊
772980,"@josephcaptures hey, do you need some ride money? I can send you some",💓,😂
601197,@IISuperwomanII Happy Birthday Girl!,🎉,🎉
263646,@Exoeshowtime: When you dare to ignore Min Yoongi's call to dance to your own ringtone #yoonmin,😂,😂
453388,@_theboulron: yooooooo i'm fucking crying,😂,😂


In [0]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from keras.models import Model
from keras.layers import Concatenate

def create_char_cnn_model2(num_chars, max_sequence_len, num_labels, drop_out=0.25):
    char_input = Input(shape=(max_sequence_len, num_chars), name='char_cnn_input')
    
    layers = []
    for window in (4, 5, 6):
        conv_1x = Conv1D(128, window, activation='relu', padding='valid')(char_input)
        max_pool_1x = MaxPooling1D(4)(conv_1x)
        dropout_1x = Dropout(drop_out)(max_pool_1x)
        conv_2x = Conv1D(256, window, activation='relu', padding='valid')(dropout_1x)
        max_pool_2x = MaxPooling1D(4)(conv_2x)
        dropout_2x = Dropout(drop_out)(max_pool_2x)
        layers.append(dropout_2x)
        
    merged = Concatenate(axis=1)(layers)

    dropout = Dropout(drop_out)(merged)
    flatten = Flatten()(dropout)
    dense = Dense(128, activation='relu')(flatten)
    preds = Dense(num_labels, activation='softmax', name='char_cnn_predictions')(dense)
    
    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model


In [67]:
char_cnn_model2 = create_char_cnn_model2(len(char_to_idx), max_sequence_len, len(emojis))
char_cnn_model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_cnn_input (InputLayer)     (None, 139, 96)      0                                            
__________________________________________________________________________________________________
conv1d_14 (Conv1D)              (None, 136, 128)     49280       char_cnn_input[0][0]             
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 135, 128)     61568       char_cnn_input[0][0]             
__________________________________________________________________________________________________
conv1d_18 (Conv1D)              (None, 134, 128)     73856       char_cnn_input[0][0]             
__________________________________________________________________________________________________
max_poolin

In [68]:
early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=2,
                              verbose=0, mode='auto')

BATCH_SIZE = 2048
char_cnn_model2.fit_generator(
    data_generator(train_tweets, batch_size=BATCH_SIZE),
    epochs=30,
    steps_per_epoch=len(train_tweets) / BATCH_SIZE,
    verbose=2,
    callbacks=[early]
)

Epoch 1/30
 - 135s - loss: 3.8482 - acc: 0.1862
Epoch 2/30
 - 133s - loss: 3.4903 - acc: 0.2330
Epoch 3/30
 - 133s - loss: 3.3271 - acc: 0.2632
Epoch 4/30
 - 133s - loss: 3.2139 - acc: 0.2842
Epoch 5/30
 - 133s - loss: 3.1342 - acc: 0.2995
Epoch 6/30
 - 133s - loss: 3.0657 - acc: 0.3119
Epoch 7/30
 - 133s - loss: 3.0128 - acc: 0.3224
Epoch 8/30
 - 133s - loss: 2.9683 - acc: 0.3304
Epoch 9/30
 - 132s - loss: 2.9299 - acc: 0.3379
Epoch 10/30
 - 133s - loss: 2.8970 - acc: 0.3438
Epoch 11/30
 - 133s - loss: 2.8659 - acc: 0.3496
Epoch 12/30
 - 133s - loss: 2.8415 - acc: 0.3542
Epoch 13/30
 - 133s - loss: 2.8050 - acc: 0.3617
Epoch 14/30
 - 132s - loss: 2.7888 - acc: 0.3644
Epoch 15/30
 - 132s - loss: 2.7661 - acc: 0.3687
Epoch 16/30
 - 132s - loss: 2.7506 - acc: 0.3714
Epoch 17/30
 - 132s - loss: 2.7353 - acc: 0.3743
Epoch 18/30
 - 132s - loss: 2.7190 - acc: 0.3771
Epoch 19/30
 - 131s - loss: 2.7066 - acc: 0.3791


<keras.callbacks.History at 0x7f065d374518>

In [69]:
char_cnn_model2.evaluate_generator(
    data_generator(test_tweets, batch_size=BATCH_SIZE),
    steps=len(test_tweets) / BATCH_SIZE
)

[2.8042608329227994, 0.37989676339285716]

**Featurizing and preparing our data**

In [0]:
VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(tweets["text"])

In [0]:
training_tokens = tokenizer.texts_to_sequences(train_tweets["text"])
test_tokens = tokenizer.texts_to_sequences(test_tweets["text"])
max_num_tokens = max(len(x) for x in chain(training_tokens, test_tokens))
training_tokens = pad_sequences(training_tokens, maxlen=max_num_tokens)
test_tokens = pad_sequences(test_tokens, maxlen=max_num_tokens)

In [0]:
training_labels = np.asarray([emoji_to_idx[em] for em in train_tweets["emoji"]])
test_labels = np.asarray([emoji_to_idx[em] for em in test_tweets['emoji']])

In [0]:
# def load_weights(tokenizer):
#     w2v_model = Word2Vec.load("data/twitter_w2v.model") # model 没有给出
#     w2v = np.zeros((tokenizer.num_wrods, w2v_model.syn0.shape[1]))
#     for k, v in tokenizer.word_index.items():
#         if v >= tokenzier.num_words:
#             continue
#         if k in w2v_model:
#             w2v[v] = w2v_model[k]
#     return w2v
# w2v = load_weights(tokenizer)
# w2v.shape

In [0]:
import os
import re
import numpy as np
import gensim

CACHE_DIR = os.path.expanduser('~/.cache/dl-cookbook')

def download(url):
    filename = os.path.join(CACHE_DIR, re.sub('[^a-zA-Z0-9.]+', '_', url))
    if os.path.exists(filename):
        return filename
    else:
        os.system('mkdir -p "%s"' % CACHE_DIR)
        assert os.system('wget -O "%s" "%s"' % (filename, url)) == 0
        return filename
    
    
def load_w2v(tokenizer=None):
    word2vec_gz = download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')
    word2vec_vectors = word2vec_gz.replace('.gz', '')
    if not os.path.exists(word2vec_vectors):
        assert os.system('gunzip -d --keep "%s"' % word2vec_gz) == 0
        
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_vectors, binary=True)
    
#     total_count = sum(tokenizer.word_counts.values())
#     idf_dict = { k: np.log(total_count/v) for (k,v) in tokenizer.word_counts.items() }
    
    w2v = np.zeros((tokenizer.num_words, w2v_model.syn0.shape[1]))
#     idf = np.zeros((tokenizer.num_words, 1))

    for k, v in tokenizer.word_index.items():
        if v >= tokenizer.num_words:
            continue

        if k in w2v_model:
            w2v[v] = w2v_model[k]
#             idf[v] = idf_dict[k]

    del w2v_model
    return w2v

In [75]:
# This may take a while to load
w2v = load_w2v(tokenizer)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [76]:
w2v.shape

(50000, 300)

**World Level CNN**

In [0]:
from keras import layers, models
import keras.backend as K


def make_embedding(name, vocab_size, embedding_size, weights=None, mask_zero=True):
    if weights is not None:
        return layers.Embedding(mask_zero=mask_zero, input_dim=vocab_size, 
                                output_dim=weights.shape[1], 
                                weights=[weights], trainable=False, 
                                name='%s/embedding' % name)
    else:
        return layers.Embedding(mask_zero=mask_zero, input_dim=vocab_size, 
                                output_dim=embedding_size,
                                name='%s/embedding' % name)
    
def create_cnn_model(vocab_size, embedding_size=None, embedding_weights=None):
    assert not (embedding_size is None and embedding_weights is None)
    message = Input(shape=(max_num_tokens, ), dtype='int32', name='cnn_input')
    # The convolution layer in keras does not support masking, so we just allow
    # the embedding layer to learn an explicit value.
    embedding = make_embedding("cnn_embedding", 
                               vocab_size, embedding_size, 
                               embedding_weights, mask_zero=False)(message)
    global_pools = []
    for window in 2, 3:
        conv_1x = Conv1D(128, window, activation='relu', padding='valid')(embedding)
        max_pool_1x = MaxPooling1D(2)(conv_1x)
        conv_2x = Conv1D(256, window, activation='relu', padding='valid')(max_pool_1x)
        max_pool_2x = MaxPooling1D(2)(conv_2x)
        conv_3x = Conv1D(256, window, activation='relu', padding='valid')(max_pool_2x)
        global_pools.append(GlobalMaxPooling1D()(conv_3x))
        
    merged = Concatenate(axis=1)(global_pools)
    fc1 = Dense(units=128, activation='elu')(merged)
    preds = Dense(units=len(emojis), activation='softmax', name='cnn_predictions')(fc1)
    model = Model(
        inputs=[message],
        outputs=[preds],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model


In [78]:
cnn_model = create_cnn_model(VOCAB_SIZE, embedding_weights=w2v)
cnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
cnn_input (InputLayer)          (None, 54)           0                                            
__________________________________________________________________________________________________
cnn_embedding/embedding (Embedd (None, 54, 300)      15000000    cnn_input[0][0]                  
__________________________________________________________________________________________________
conv1d_20 (Conv1D)              (None, 53, 128)      76928       cnn_embedding/embedding[0][0]    
__________________________________________________________________________________________________
conv1d_23 (Conv1D)              (None, 52, 128)      115328      cnn_embedding/embedding[0][0]    
__________________________________________________________________________________________________
max_poolin

In [79]:
cnn_model.fit(training_tokens, training_labels, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0652ead2e8>

Le**arning Embeddings**

In [80]:
learned_embeddings_cnn_model = create_cnn_model(VOCAB_SIZE, embedding_size=100)
learned_embeddings_cnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
cnn_input (InputLayer)          (None, 54)           0                                            
__________________________________________________________________________________________________
cnn_embedding/embedding (Embedd (None, 54, 100)      5000000     cnn_input[0][0]                  
__________________________________________________________________________________________________
conv1d_26 (Conv1D)              (None, 53, 128)      25728       cnn_embedding/embedding[0][0]    
__________________________________________________________________________________________________
conv1d_29 (Conv1D)              (None, 52, 128)      38528       cnn_embedding/embedding[0][0]    
__________________________________________________________________________________________________
max_poolin

In [81]:
learned_embeddings_cnn_model.fit(training_tokens, training_labels, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0652ead470>

In [0]:
learned_embeddings_cnn_model.save('zoo/07/twitter_learned_embeddings_cnn_model.h5')

In [0]:
from keras.layers import Masking

def create_lstm_model(vocab_size,  embedding_size=None, embedding_weights=None):
    assert not (embedding_size is None and embedding_weights is None)
    message = Input(shape=(max_num_tokens, ), dtype='int32', name='lstm_input')
    embedding = Embedding(mask_zero=True, input_dim=vocab_size, 
                          output_dim=embedding_weights.shape[1], 
                          weights=[embedding_weights],
                          trainable=True,
                          name='lstm_embedding')(message)
    mask = Masking(mask_value=0)(embedding)
    lstm_1 = LSTM(units=128, return_sequences=False)(mask)
    preds = Dense(units=len(emojis), activation='softmax', name='lstm_predictions')(lstm_1)
    model = Model(
        inputs=[message],
        outputs=[preds],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model
    

In [84]:
lstm_model = create_lstm_model(VOCAB_SIZE, embedding_weights=w2v)
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_input (InputLayer)      (None, 54)                0         
_________________________________________________________________
lstm_embedding (Embedding)   (None, 54, 300)           15000000  
_________________________________________________________________
masking_3 (Masking)          (None, 54, 300)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               219648    
_________________________________________________________________
lstm_predictions (Dense)     (None, 121)               15609     
Total params: 15,235,257
Trainable params: 15,235,257
Non-trainable params: 0
_________________________________________________________________


In [85]:
early = keras.callbacks.EarlyStopping(monitor='loss',
                              min_delta=0.03,
                              patience=2,
                              verbose=0, mode='auto')

lstm_model.fit(training_tokens, training_labels, epochs=12, batch_size=1024, callbacks=[early])

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f0656572dd8>

In [86]:
lstm_model.evaluate(test_tokens, test_labels)



[2.9637703841747007, 0.3706077536104933]

**Comparing our models**

In [1]:
test_char_vectors, _ = next(data_generator(test_tweets, None))

NameError: ignored

In [0]:
predictions = {
    label: [emojis[np.argmax(x)] for x in pred]
    for label, pred in (
        ('lstm', lstm_model.predict(test_tokens[:100])),
        ('char_cnn', char_cnn_model.predict(test_char_vectors[:100])),
        ('cnn', cnn_model.predict(test_tokens[:100])),
    )
}


In [0]:
# Make a dataframe just for test data
pd.options.display.max_colwidth = 128
test_df = test_tweets[:100].reset_index()
eval_df = pd.DataFrame({
    'content': test_df['text'],
    'true': test_df['emoji'],
    **predictions
})
eval_df[['content', 'true', 'char_cnn', 'cnn', 'lstm']].head(25)

**Qualitative Evaluation**

In [0]:
eval_df[eval_df['lstm'] != eval_df['true']].head(10)

**Ensemble model**

In [0]:
def combined_data_generator(tweets, tokens, batch_size):
    tweets = tweets.reset_index()
    while True:
        batch_idx = random.sample(range(len(tweets)), batch_size)
        tweet_batch = tweets.iloc[batch_idx]
        token_batch = tokens[batch_idx]
        char_vec = np.zeros((batch_size, max_sequence_len, len(chars)))
        token_vec = np.zeros((batch_size, max_num_tokens))
        y = np.zeros((batch_size,))
        for row_idx, (token_row, (_, tweet_row)) in enumerate(zip(token_batch, tweet_batch.iterrows())):
            y[row_idx] = emoji_to_idx[tweet_row['emoji']]
            for ch_idx, ch in enumerate(tweet_row['text']):
                char_vec[row_idx, ch_idx, char_to_idx[ch]] = 1
            token_vec[row_idx, :] = token_row
        yield {'char_cnn_input': char_vec, 'cnn_input': token_vec, 'lstm_input': token_vec}, y

d, y = next(combined_data_generator(train_tweets, training_tokens, 5))
d['lstm_input'].shape

In [0]:
def prediction_layer(model):
    layers = [layer for layer in model.layers if layer.name.endswith('_predictions')]
    return layers[0].output

def create_ensemble(*models):
    inputs = [model.input for model in models]
    predictions = [prediction_layer(model) for model in models]
    merged = Average()(predictions)
    model = Model(
        inputs=inputs,
        outputs=[merged],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model


ensemble = create_ensemble(char_cnn_model2, cnn_model, lstm_model)
ensemble.summary()

In [0]:
BATCH_SIZE = 512
ensemble.fit_generator(
    combined_data_generator(train_tweets, training_tokens, BATCH_SIZE),
    epochs=20,
    steps_per_epoch=len(train_tweets) / BATCH_SIZE,
    verbose=2,
    callbacks=[early]
)

In [0]:
ensemble.evaluate_generator(
    combined_data_generator(test_tweets, test_tokens, BATCH_SIZE),
    steps=len(test_tweets) / BATCH_SIZE
)

**Tweet Embeddings**

In [2]:
import random
import twitter
import emoji
import gensim
import unicodedata
import html
from keras.preprocessing.text import text_to_word_sequence
import re

Using TensorFlow backend.


In [0]:
# Fill these in!

CONSUMER_KEY = 'xbMuxcJpRTiVGt2C2EYnA'
CONSUMER_SECRET = '2DbQTsvIptkPTdaUcos8DDvQH9fzO0hNjJpUT2uVzQ'
ACCESS_TOKEN = '7319442-EDm4CPxL7W4KkZcGWRMJNVHp88W5OH9vgblu898fg'
ACCESS_SECRET = '5ZxJSbqXhG7uhgXzTFWf9XhkfsxxinlPRXyDTzbA9w'

In [8]:
auth=twitter.OAuth(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    token=ACCESS_TOKEN,
    token_secret=ACCESS_SECRET,
)

status_stream = twitter.TwitterStream(auth=auth).statuses
next(status_stream.sample()).keys()

dict_keys(['delete'])

In [24]:
RE_URL = re.compile(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
RE_WHITESPACE = re.compile(r'\s+')

text = "RT @SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile \n\n https://t.co/FZ2YWcbUwM"
text = html.unescape(text)
print(text)

RT @SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile 

 https://t.co/FZ2YWcbUwM


In [25]:
text = RE_WHITESPACE.sub(' ', text)
print(text)

RT @SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile https://t.co/FZ2YWcbUwM


In [26]:
text = RE_URL.sub(' ', text)
print(text)

RT @SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile  


In [27]:
text = strip_accents(text)
print(text)

RT @SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile  


In [28]:
text = ''.join(ch for ch in text if ord(ch) < 128)
print(text)

RT @SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile  


In [38]:
text = text[3:].strip()
text

'@SLOWTOWNVHS: tried to unfollow the most annoying bitch on twitter but all it said was edit profile'

In [39]:
text = text_to_word_sequence(text)
text

['slowtownvhs',
 'tried',
 'to',
 'unfollow',
 'the',
 'most',
 'annoying',
 'bitch',
 'on',
 'twitter',
 'but',
 'all',
 'it',
 'said',
 'was',
 'edit',
 'profile']

In [7]:
RE_URL = re.compile(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
RE_WHITESPACE = re.compile(r'\s+')

def strip_accents(s):
    return "".join(c for c in unicodedata.normalize("NFD", s)
                   if unicodedata.category(c) != "Mn")

class TokensYielder(object):
    def __init__(self, tweet_count, stream):
        self.tweet_count = tweet_count
        self.stream = stream
        
    def __iter__(self):
        count = self.tweet_count
        for tweet in self.stream:
            if tweet.get("lang") != "en":
                continue
            text = text["text"]
            text = html.unescape(text)
            text = RE_WHITESPACE.sub(' ', text)
            text = RE_URL.sub(' ', text)
            text = strip_accents(text)
            text = ''.join(ch for ch in text if ord(ch) < 128)
            if text.startwith("RT "):
                text = text[3:]
                
            text = text.strip()
            if text:
                yield text_to_word_sequence(text)
                count -= 1
                if count <= 0:
                    break
for t in TokensYielder(3, twitter.TwitterStream(auth=auth).statuses.sample()):
    print(t)                    

UnboundLocalError: ignored

In [0]:
tweets = list(TokensYielder(70000, twitter.TwitterStream(auth=auth).statuses.sample()))

In [0]:
model = gensim.models.Word2Vec(tweets, 
                               workers=5,
                               min_count=2,
                              )
model.save('zoo/07/twitter_stream_w2v.model')

In [0]:
model.wv.most_similar(positive=['love'], topn=5)