In [17]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, Input, Dense, CuDNNLSTM, LSTM, CuDNNGRU, GRU, Bidirectional, TimeDistributed, Dropout
from keras import backend as K
from keras import optimizers
from keras.models import Model
import nltk
import re
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import roc_auc_score
from nltk import tokenize
import seaborn as sns

from attention_with_context import AttentionWithContext
from sklearn.utils import shuffle

In [2]:
max_features = 200000
max_senten_len = 40
max_senten_num = 6
embed_size = 100
VALIDATION_SPLIT = 0.2

In [27]:
TEXT_DATA_DIR = 'data'
import os

texts = []  # list of text samples
labels = []  # list of labels
files = ['train.csv', 'valid.csv']
for file_name in files:
    file = pd.read_csv(os.path.join(TEXT_DATA_DIR, file_name))
    for line in file['text']:
        texts.append(line)
    for label in file['stars']:
        labels.append(label)
    cates = file.groupby('stars')

    print("total categories:", cates.ngroups)
    print(cates.size())


total categories: 5
stars
1.0    14890
2.0     8109
3.0    11205
4.0    21838
5.0    43958
dtype: int64
total categories: 5
stars
1    1491
2     777
3    1114
4    2212
5    4406
dtype: int64


In [28]:
unknown_list = ['\x7f','\x80','\x94','\xa0','¡','¢','£','©','\xad','®','¯','°','±','´','·','¹','½','¿','à','á','â','ã','ä','å','æ','ç','è','é','ê','ë','í','î','ñ','ó','ö','ø','ú','û','ü','ć','č','ē','ğ','ō','ş','ū','ə','ɛ','ɪ','ˈ','ˌ','ː','̀','́','ά','έ','ί','α','γ','δ','ε','η','ι','κ','λ','μ','ν','ξ','ο','ρ','ς','σ','τ','υ','ω','ό','ύ','ώ','а','б','г','е','к','л','м','о','с','т','у','ь','я','ِ','আ','ই','গ','ট','ভ','র','ল','স','া','ি','ো','্','ಠ','ᴥ','ᵒ','ᶅ','ᶘ','\u2009','\u200a','\u200b','\u200d','\u200e','\u200f','‐','–','—','―','‘','’','“','”','•','…','\u2028','\u202a','\u202c','\u202f','′','″','‹','›','€','™','≠','⊙','②','─','◕','☀','☃','☔','☕','☘','♂','♥','♫','♬','⚡','⚾','⛱','⛽','✂','✈','✊','✌','✔','✨','❄','❤','➡','ツ','象','️','︿','\ufeff','🇦','🇨','🇫','🇬','🇭','🇵','🇷','🇸','🇺','🇿','🌈','🌊','🌍','🌎','🌑','🌒','🌓','🌔','🌕','🌖','🌗','🌘','🌝','🌞','🌤','🌦','🌮','🌳','🌴','🌷','🌸','🌹','🌿','🍁','🍂','🍃','🍅','🍆','🍋','🍌','🍍','🍎','🍑','🍒','🍓','🍔','🍕','🍖','🍗','🍜','🍝','🍞','🍩','🍪','🍭','🍰','🍴','🍷','🍸','🍹','🍻','🍽','🍾','🎀','🎁','🎂','🎃','🎄','🎅','🎆','🎈','🎉','🎊','🎓','🎤','🎮','🎵','🎶','🎾','🏀','🏃','🏅','🏆','🏈','🏊','🏨','🏫','🏰','🏼','🏽','🏾','🐊','🐋','🐍','🐎','🐔','🐘','🐛','🐝','🐭','🐮','🐱','🐲','🐶','🐷','🐸','🐾','👀','👅','👊','👋','👌','👍','👏','👑','👓','👖','👠','👩','👪','👬','👭','👯','👰','👴','👵','👶','👹','👻','👼','👽','💀','💁','💃','💄','💅','💇','💉','💊','💋','💍','💎','💐','💔','💕','💖','💗','💘','💙','💚','💛','💜','💞','💡','💤','💥','💦','💨','💩','💪','💯','💰','💸','💻','💼','📚','📝','📱','📷','📸','📺','🔋','🔍','🔑','🔥','🔪','🔮','🕵','🕺','🗣','😀','😁','😂','😃','😇','😈','😉','😊','😍','😎','😐','😑','😒','😔','😕','😘','😜','😞','😠','😡','😢','😩','😬','😭','😮','😰','😱','😳','😴','😵','😶','😷','😻','🙀','🙂','🙃','🙄','🙅','🙆','🙈','🙊','🙌','🙏','🚀','🚂','🚇','🚗','🚨','🚬','🚴','🛀','🤑','🤓','🤔','🤗','🤘','🤢','🤦','🤼','🥂','🥗','🦄']
unknown_dict = {'\x7f':"",'\x80':"euro",'\x94':"\"",'\xa0':" ",'¡':"i",'¢':"cent",'£':"pound",'©':"copyright",'\xad':" ",'®':"registered",'¯':"",'°':"",'±':"",'´':"'",'·':"",'¹':"1",'½':"",'¿':"",'à':"",'á':"",'â':"",'ã':"",'ä':"",'å':"",'æ':"",'ç':"",'è':"",'é':"",'ê':"",'ë':"",'í':"",'î':"",'ñ':"",'ó':"",'ö':"",'ø':"",'ú':"",'û':"",'ü':"",'ć':"",'č':"",'ē':"",'ğ':"",'ō':"",'ş':"",'ū':"",'ə':"",'ɛ':"",'ɪ':"",'ˈ':"",'ˌ':"",'ː':"",'̀':"",'́':"",'ά':"",'έ':"",'ί':"",'α':"",'γ':"",'δ':"",'ε':"",'η':"",'ι':"",'κ':"",'λ':"",'μ':"",'ν':"",'ξ':"",'ο':"o",'ρ':"",'ς':"",'σ':"",'τ':"",'υ':"",'ω':"w",'ό':"o",'ύ':"u",'ώ':"w",'а':"a",'б':"",'г':"r",'е':"e",'к':"k",'л':"",'м':"m",'о':"o",'с':"c",'т':"t",'у':"y",'ь':"b",'я':"r",'ِ':"",'আ':"",'ই':"",'গ':"",'ট':"",'ভ':"",'র':"",'ল':"",'স':"",'া':"",'ি':"",'ো':"",'্':"",'ಠ':"",'ᴥ':"",'ᵒ':"",'ᶅ':"",'ᶘ':"",'\u2009':" ",'\u200a':" ",'\u200b':" ",'\u200d':" ",'\u200e':" ",'\u200f':" ",'‐':"-",'–':"-",'—':"-",'―':"-",'‘':"\'",'’':"\'",'“':"\"",'”':"\"",'•':" ",'…':" ",'\u2028':" ",'\u202a':" ",'\u202c':" ",'\u202f':" ",'′':"''",'″':"\"",'‹':"<",'›':">",'€':"",'™':"",'≠':"",'⊙':"",'②':"",'─':"-",'◕':"",'☀':"",'☃':"",'☔':"",'☕':"",'☘':"",'♂':"",'♥':"",'♫':"",'♬':"",'⚡':"",'⚾':"",'⛱':"",'⛽':"",'✂':"",'✈':"",'✊':"",'✌':"",'✔':"",'✨':"",'❄':"",'❤':"good",'➡':"",'ツ':"good",'象':"house",'️':"",'︿':"or",'\ufeff':" ",'🇦':"a",'🇨':"c",'🇫':"f",'🇬':"g",'🇭':"h",'🇵':"p",'🇷':"r",'🇸':"s",'🇺':"u",'🇿':"z",'🌈':"",'🌊':"",'🌍':"",'🌎':"",'🌑':"",'🌒':"",'🌓':"",'🌔':"",'🌕':"",'🌖':"",'🌗':"",'🌘':"",'🌝':"",'🌞':"",'🌤':"",'🌦':"",'🌮':"",'🌳':"",'🌴':"",'🌷':"",'🌸':"",'🌹':"",'🌿':"",'🍁':"",'🍂':"",'🍃':"",'🍅':"",'🍆':"",'🍋':"",'🍌':"",'🍍':"",'🍎':"",'🍑':"",'🍒':"",'🍓':"",'🍔':"",'🍕':"",'🍖':"",'🍗':"",'🍜':"",'🍝':"",'🍞':"",'🍩':"",'🍪':"",'🍭':"",'🍰':"",'🍴':"",'🍷':"",'🍸':"",'🍹':"",'🍻':"",'🍽':"",'🍾':"",'🎀':"",'🎁':"",'🎂':"",'🎃':"",'🎄':"",'🎅':"",'🎆':"",'🎈':"",'🎉':"",'🎊':"",'🎓':"",'🎤':"",'🎮':"",'🎵':"",'🎶':"",'🎾':"",'🏀':"",'🏃':"",'🏅':"",'🏆':"",'🏈':"",'🏊':"",'🏨':"",'🏫':"",'🏰':"",'🏼':"",'🏽':"",'🏾':"",'🐊':"",'🐋':"",'🐍':"",'🐎':"",'🐔':"",'🐘':"",'🐛':"",'🐝':"",'🐭':"",'🐮':"",'🐱':"",'🐲':"",'🐶':"",'🐷':"",'🐸':"",'🐾':"",'👀':"",'👅':"",'👊':"",'👋':"",'👌':"",'👍':"",'👏':"",'👑':"",'👓':"",'👖':"",'👠':"",'👩':"",'👪':"",'👬':"",'👭':"",'👯':"",'👰':"",'👴':"",'👵':"",'👶':"",'👹':"",'👻':"",'👼':"",'👽':"",'💀':"",'💁':"",'💃':"",'💄':"",'💅':"",'💇':"",'💉':"",'💊':"",'💋':"",'💍':"",'💎':"",'💐':"",'💔':"",'💕':"",'💖':"",'💗':"",'💘':"",'💙':"",'💚':"",'💛':"",'💜':"",'💞':"",'💡':"",'💤':"",'💥':"",'💦':"",'💨':"",'💩':"",'💪':"",'💯':"",'💰':"",'💸':"",'💻':"",'💼':"",'📚':"",'📝':"",'📱':"",'📷':"",'📸':"",'📺':"",'🔋':"",'🔍':"",'🔑':"",'🔥':"",'🔪':"",'🔮':"",'🕵':"",'🕺':"good",'🗣':"noisy",'😀':"good",'😁':"good",'😂':"good",'😃':"good",'😇':"good",'😈':"good",'😉':"good",'😊':"good",'😍':"good",'😎':"good",'😐':"bad",'😑':"bad",'😒':"bad",'😔':"bad",'😕':"bad",'😘':"good",'😜':"good",'😞':"bad",'😠':"bad",'😡':"bad",'😢':"no",'😩':"no",'😬':"no",'😭':"lol",'😮':"good",'😰':"sick",'😱':"maybe",'😳':"good",'😴':"sleepy",'😵':"maybe",'😶':"maybe",'😷':"sick",'😻':"good",'🙀':"wow",'🙂':"good",'🙃':"ok",'🙄':"so so",'🙅':"no",'🙆':"yes",'🙈':"good",'🙊':"maybe",'🙌':"soso",'🙏':"please",'🚀':"rocket",'🚂':"train",'🚇':"train",'🚗':"car",'🚨':"signal",'🚬':"smoke",'🚴':"cycle",'🛀':"clean",'🤑':"good",'🤓':"good",'🤔':"doubt",'🤗':"happy",'🤘':"good",'🤢':"bad",'🤦':"confused",'🤼':"happy",'🥂':"beer",'🥗':"salad",'🦄':"good"}

In [35]:
# new_list = list()
# count = 0
# for line in texts:
#     if count % 1000 == 0:
#         print(count, end=" ")
#     new_line = ""
#     for char in line:
#         new_char = char
#         if char in unknown_list:
#             new_char = unknown_dict[char]
#         line = line + new_char
#     new_list.append(new_line)
#     count = count + 1
# print("DONE")

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
DONE


In [40]:
texts = new_list
# df = shuffle(pd.read_json('data/News_Category_Dataset.json', lines=True)).reset_index()
# df.category = df.category.map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)
# df['text'] = df['headline'] + '. ' + df['short_description']
# df = df[['text', 'category']]

categories = np.array(labels)
text = np.array(texts)


import re


def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

paras = []
labels = []
texts = []

sent_lens = []
sent_nums = []
for idx in range(text.shape[0]):
    text = clean_str(df.text[idx])
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    sent_nums.append(len(sentences))
    for sent in sentences:
        sent_lens.append(len(text_to_word_sequence(sent)))
    paras.append(sentences)

tokenizer = Tokenizer(num_words=max_features, oov_token=True)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), max_senten_num, max_senten_len), dtype='int32')
for i, sentences in enumerate(paras):
    for j, sent in enumerate(sentences):
        if j < max_senten_num:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                try:
                    if k < max_senten_len and tokenizer.word_index[word] < max_features:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
                except:
                    print(word)
                    pass

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

labels = pd.get_dummies(categories)

print('Shape of data tensor:', data.shape)
print('Shape of labels tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels.iloc[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
print('Number of positive and negative reviews in traing and validation set')
print(y_train.columns.tolist())
print(y_train.sum(axis=0).tolist())
print(y_val.sum(axis=0).tolist())

REG_PARAM = 1e-13
l2_reg = regularizers.l2(REG_PARAM)

gratisography
— david
the hamilton creator
and its
but if
bob mcdonnell
Total 70043 unique tokens.
Shape of data tensor: (110000, 6, 40)
Shape of labels tensor: (110000, 5)
Number of positive and negative reviews in traing and validation set
[1.0, 2.0, 3.0, 4.0, 5.0]
[13025, 7163, 9916, 19164, 38732]
[3356, 1723, 2403, 4886, 9632]


In [41]:
print(x_train.shape)

(88000, 6, 40)


In [45]:
GLOVE_DIR = "data/glove.6B.100d.txt"
embeddings_index = {}
f = open(GLOVE_DIR, encoding='UTF-8')
for line in f:
    try:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print(word)
        pass
f.close()
print('Total %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
absent_words = 0
absent_set = set()
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        for j in str(word):
            absent_set.add(j)
        absent_words += 1
print('Total absent words are', absent_words, 'which is', "%0.2f" % (absent_words * 100 / len(word_index)),
      '% of total words')


Total 400000 word vectors.
Total absent words are 16061 which is 22.93 % of total words


In [43]:
absent_set

{'0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'T',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\x7f',
 '\x80',
 '\x94',
 '\xa0',
 '¡',
 '¢',
 '£',
 '©',
 '\xad',
 '®',
 '¯',
 '°',
 '±',
 '´',
 '·',
 '¹',
 '½',
 '¿',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'í',
 'î',
 'ñ',
 'ó',
 'ö',
 'ø',
 'ú',
 'û',
 'ü',
 'ć',
 'č',
 'ē',
 'ğ',
 'ō',
 'ş',
 'ū',
 'ə',
 'ɛ',
 'ɪ',
 'ˈ',
 'ˌ',
 'ː',
 '̀',
 '́',
 'ά',
 'έ',
 'ί',
 'α',
 'γ',
 'δ',
 'ε',
 'η',
 'ι',
 'κ',
 'λ',
 'μ',
 'ν',
 'ξ',
 'ο',
 'ρ',
 'ς',
 'σ',
 'τ',
 'υ',
 'ω',
 'ό',
 'ύ',
 'ώ',
 'а',
 'б',
 'г',
 'е',
 'к',
 'л',
 'м',
 'о',
 'с',
 'т',
 'у',
 'ь',
 'я',
 'ِ',
 'আ',
 'ই',
 'গ',
 'ট',
 'ভ',
 'র',
 'ল',
 'স',
 'া',
 'ি',
 'ো',
 '্',
 'ಠ',
 'ᴥ',
 'ᵒ',
 'ᶅ',
 'ᶘ',
 '\u2009',
 '\u200a',
 '\u200b',
 '\u200d',
 '\u200e',
 '\u200f',
 '‐',
 '–',
 '—',
 '―',
 '‘',
 '’',
 '“',
 

In [44]:
embedding_layer = Embedding(len(word_index) + 1, embed_size, weights=[embedding_matrix], input_length=max_senten_len,
                            trainable=False)

word_input = Input(shape=(max_senten_len,), dtype='float32')
word_sequences = embedding_layer(word_input)
word_lstm = Bidirectional(CuDNNLSTM(150, return_sequences=True, kernel_regularizer=l2_reg))(word_sequences)
word_dense = TimeDistributed(Dense(200, kernel_regularizer=l2_reg))(word_lstm)
word_att = AttentionWithContext()(word_dense)
wordEncoder = Model(word_input, word_att)

sent_input = Input(shape=(max_senten_num, max_senten_len), dtype='float32')
sent_encoder = TimeDistributed(wordEncoder)(sent_input)
sent_lstm = Bidirectional(CuDNNLSTM(150, return_sequences=True, kernel_regularizer=l2_reg))(sent_encoder)
sent_dense = TimeDistributed(Dense(200, kernel_regularizer=l2_reg))(sent_lstm)
sent_att = Dropout(0.5)(AttentionWithContext()(sent_dense))
preds = Dense(5, activation='softmax')(sent_att)
model = Model(sent_input, preds)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

checkpoint = ModelCheckpoint('best_model.h5', verbose=0, monitor='val_loss', save_best_only=True, mode='auto')

history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=50, batch_size=512, callbacks=[checkpoint])

Train on 88000 samples, validate on 22000 samples
Epoch 1/50

KeyboardInterrupt: 