In [1]:
from __future__ import print_function, division
import example_helper
import json
import csv
import numpy as np
from deepmoji.sentence_tokenizer import SentenceTokenizer
from deepmoji.model_def import deepmoji_feature_encoding
from deepmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH

Using TensorFlow backend.


In [2]:
import pandas as pd
# Ask pandas not to truncate long cell text when frames are displayed.
# NOTE(review): newer pandas releases deprecate -1 here in favour of None —
# confirm against the installed pandas version before changing it.
pd.set_option('display.max_colwidth', -1)

import re
import os
import wordsegment as ws
import preprocessor as p

In [15]:
# Read the tab-separated training file (path is relative to the notebook's
# working directory).
data = pd.read_csv("../../datasets/train/SemEval2018-T3-train-taskB_emoji.txt", sep="\t")
# Keep only rows whose 'Tweet text' is at most 140 characters long. The
# explicit .copy() detaches the filtered frame from the one it was sliced
# from, so the column assignments made in later cells operate on an
# independent frame instead of triggering pandas' SettingWithCopyWarning.
data = data[data['Tweet text'].map(len)<=140].copy()
# Load wordsegment's data; ws.segment is called later inside get_hashtags.
ws.load()

In [16]:
# Number of rows left after the 140-character filter.
len(data)

3813

In [17]:
def get_hashtags(tweet):
    """Extract the hashtags of a tweet and split each one into words.

    The tweet is lower-cased and handed to ``p.parse``; every hashtag it
    reports has its first character dropped (presumably the leading '#' —
    verify against the parser) and is segmented with ``ws.segment``.

    Args:
        tweet: raw tweet text.

    Returns:
        A 3-tuple ``(joined, count, frequencies)``:
        joined      -- all segmented hashtags joined by single spaces,
        count       -- how many hashtags were found,
        frequencies -- dict mapping each segmented hashtag to the number of
                       times it occurs in this tweet.
    """
    found = p.parse(tweet.lower()).hashtags

    segmented = []
    frequencies = {}
    # The parser's hashtags attribute may be None; treat that as "no hashtags".
    for tag in (found if found is not None else []):
        words = ws.segment(tag.match[1:].lower())
        phrase = " ".join(words)
        frequencies[phrase] = frequencies.get(phrase, 0) + 1
        segmented.append(phrase)

    return " ".join(segmented), len(segmented), frequencies

def get_text(tweet):
    """Return a cleaned, lower-cased version of a tweet.

    The tweet is first passed through ``p.clean``; afterwards every character
    that is neither a word character nor whitespace is deleted (not replaced
    by a space, so text on both sides of a removed symbol is joined).

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned text in lower case.
    """
    without_symbols = re.sub(r'[^\w\s]', '', p.clean(tweet))
    return without_symbols.lower()


def get_emotion(tweet):
    """Collect the ``:name_with_underscore:`` codes that appear in a tweet.

    Only codes matching ``:\\w+_\\w+:`` are recognised, i.e. the name between
    the colons must contain at least one underscore.

    Args:
        tweet: text to scan.

    Returns:
        A 2-tuple ``(emotions, emotion_keys)``:
        emotions     -- the matched names, colons stripped, in order of
                        appearance (duplicates kept),
        emotion_keys -- dict mapping each name to its number of occurrences.
        Both are empty when nothing matches.
    """
    # re.findall always returns a list (empty when nothing matches), so the
    # result list is bound unconditionally rather than behind a None check.
    emotions = [code[1:-1] for code in re.findall(r":\w+_\w+:", tweet)]

    emotion_keys = {}
    for emotion in emotions:
        emotion_keys[emotion] = emotion_keys.get(emotion, 0) + 1
    return emotions, emotion_keys

In [19]:
# get_hashtags returns a 3-tuple per tweet; zip(*...) transposes the per-row
# tuples into three column-wise sequences, one for each new column.
data['hashtags'], data['length'], data['hashtag_dict'] = zip(*data['Tweet text'].map(get_hashtags)) 
# Cleaned, lower-cased tweet text used as model input further down.
data['tweet'] = data['Tweet text'].map(get_text)
# Disabled. Note that get_text deletes every ':' character, so the
# ':name_name:' pattern get_emotion searches for cannot occur in data['tweet'].
# data['emotion'], data['emotion_dict'] = zip(*data['tweet'].map(get_emotion))
data.head()

Unnamed: 0,Tweet Index,Label,Tweet text,hashtags,length,hashtag_dict,tweet
0,1,1,Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion http://t.co/fej2v3OUBR,imagine no religion,2,"{'imagine': 1, 'no religion': 1}",sweet united nations video just in time for christmas
1,2,1,@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing ;),,0,{},we are rumored to have talked to ervs agent and the angels asked about ed escobar thats hardly nothing
2,3,1,Hey there! Nice to see you Minnesota/ND Winter Weather,,0,{},hey there nice to see you minnesotand winter weather
3,4,0,3 episodes left I'm dying over here,,0,{},3 episodes left im dying over here
4,5,2,I can't breathe! was chosen as the most notable quote of the year in an annual list released by a Yale University librarian,,0,{},i cant breathe was chosen as the most notable quote of the year in an annual list released by a yale university librarian


In [20]:
# Cleaned tweets as a plain Python list, in the same row order as `data`.
TEST_SENTENCES = data['tweet'].tolist()

In [21]:
# Preview the first five cleaned tweets.
TEST_SENTENCES[:5]

['sweet united nations video just in time for christmas',
 'we are rumored to have talked to ervs agent and the angels asked about ed escobar thats hardly nothing',
 'hey there nice to see you minnesotand winter weather',
 '3 episodes left im dying over here',
 'i cant breathe was chosen as the most notable quote of the year in an annual list released by a yale university librarian']

In [22]:
# Largest number of pieces any cleaned tweet yields when split on single
# spaces; reused below as the tokenizer and model sequence length.
# NOTE(review): split(' ') counts empty strings produced by consecutive
# spaces, and the tokenizer may split text differently — confirm this bound
# is the intended one.
maxlen = max(len(x.split(' ')) for x in TEST_SENTENCES)
# NOTE(review): batch_size is not referenced by any cell visible here.
batch_size = 32

In [23]:
# Load the vocabulary JSON found at VOCAB_PATH and tokenize the cleaned tweets.
print('Tokenizing using dictionary from {}'.format(VOCAB_PATH))
with open(VOCAB_PATH, 'r') as f:
    vocabulary = json.load(f)
st = SentenceTokenizer(vocabulary, maxlen)
# Only the first of the three returned values is kept.
# NOTE(review): maxlen is passed as the second positional argument; in the
# upstream DeepMoji tokenizer that slot is `reset_stats`, not a length —
# confirm against the signature of the tokenizer actually installed.
# NOTE(review): the recorded output of this cell shows "3813 3798" followed by
# "Assertion"; if sentences were dropped, rows of `tokenized` may no longer
# line up one-to-one with rows of `data` — verify before joining with labels.
tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES,maxlen)

Tokenizing using dictionary from D:\Research\tweet-irony-detection\DeepMoji/model/vocabulary.json
3813 3798
Assertion


In [24]:
# Build the DeepMoji feature-encoding model for sequences of length maxlen
# using the weights file at PRETRAINED_PATH, then print its layer summary.
print('Loading model from {}.'.format(PRETRAINED_PATH))
model = deepmoji_feature_encoding(maxlen, PRETRAINED_PATH)
model.summary()

Loading model from D:\Research\tweet-irony-detection\DeepMoji/model/deepmoji_weights.hdf5.
Loading weights for embedding
Loading weights for bi_lstm_0
Loading weights for bi_lstm_1
Loading weights for attlayer
Ignoring weights for softmax
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 32)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 32, 256)      12800000    input_2[0][0]                    
__________________________________________________________________________________________________
activation_2 (Activation)       (None, 32, 256)      0           embedding[0][0]                  
____________________________________________________________________

In [25]:
# Run the model over every tokenized tweet to obtain its feature encoding.
# No batch size is passed, so predict() falls back to its own default.
print('Encoding texts..')
encoding = model.predict(tokenized)

Encoding texts..


In [26]:
# Persist the encodings in the current working directory; np.save appends
# the '.npy' extension because the given name does not already have one.
np.save('taskB_train_deepmoji',encoding)