In [21]:
import numpy as np

from keras.models import Sequential
from keras.layers import Conv1D
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from sklearn.manifold import TSNE
from gensim.models import KeyedVectors
import pandas as pd
from keras.layers import LSTM

# Width of each word vector used for the embedding matrix and Embedding layers;
# presumably chosen to match the "100d" GloVe file read by read_embeddings.
EMBEDDING_DIMENSION = 100
# Vocabulary cap passed to tokenize_dataset.
VOCABULARY_SIZE = 150443

def read_corpus_dataset(dataset_path):
    """Load the tweet corpus from a CSV file.

    Rows that pandas cannot parse (for example, rows with too many fields)
    are skipped instead of aborting the whole load.

    Args:
        dataset_path: Location of a CSV file (anything ``pandas.read_csv``
            accepts) that contains ``tweet`` and ``sentiment`` columns.

    Returns:
        A ``(tweets, labels)`` tuple holding the ``tweet`` and ``sentiment``
        columns as pandas Series.

    Raises:
        KeyError: If either expected column is missing from the file.
    """
    try:
        # ``error_bad_lines`` was removed in pandas 2.0; ``on_bad_lines`` is
        # the replacement available since pandas 1.3.
        data = pd.read_csv(dataset_path, on_bad_lines='skip')
    except TypeError:
        # Older pandas rejects the unknown keyword; use the legacy flag there.
        data = pd.read_csv(dataset_path, error_bad_lines=False)
    labels = data['sentiment']
    tweets = data['tweet']

    return tweets, labels

In [2]:
def tokenize_dataset(tweets, vocabulary_size):
    """Turn raw tweets into integer sequences plus a word -> index mapping.

    The same vocabulary cap is applied to both return values, so every index
    that appears in ``sequences`` also has an entry in ``word_indices``.

    Args:
        tweets: Iterable of tweet strings to fit the tokenizer on and encode.
        vocabulary_size: Largest word index to keep. ``None`` or a
            non-positive value disables the cap and keeps every word.

    Returns:
        A ``(sequences, word_indices)`` tuple: the per-tweet lists of word
        indices produced by the tokenizer, and a dict mapping each kept word
        to its index.
    """
    limit = vocabulary_size if vocabulary_size and vocabulary_size > 0 else None
    # Tokenizer keeps only indices strictly below ``num_words``, hence the +1
    # so that indices 1..vocabulary_size survive in the sequences.
    tokenizer = Tokenizer(num_words=None if limit is None else limit + 1)
    tokenizer.fit_on_texts(tweets)
    sequences = tokenizer.texts_to_sequences(tweets)
    # dictionary word:index, filtered by value so it does not rely on the
    # iteration order of ``word_index``.
    word_indices = {
        word: index
        for word, index in tokenizer.word_index.items()
        if limit is None or index <= limit
    }
    print('===> Number of words in dataset: {}'.format(len(word_indices)))

    return sequences, word_indices

def read_embeddings(glove_path="glove.twitter.27B.100d.txt"):
    """Read whitespace-separated word vectors from a text file.

    Each non-blank line is expected to hold a word followed by its
    coefficients.

    Args:
        glove_path: Path of the embeddings text file. Defaults to the file
            name this notebook has always used.

    Returns:
        Dict mapping each word to a ``float32`` numpy array of its coefficients.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a coefficient cannot be parsed as a float.
    """
    embeddings = {}
    # Decode explicitly as UTF-8 rather than the platform default so the
    # vocabulary is read the same way on every operating system.
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # A blank line has no word; skip it instead of raising IndexError.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    print('===> Using {} embedding vectors\n'.format(len(embeddings)))
    return embeddings


def load_word2vec_model(w2v_path):
    """Load binary word2vec-format vectors from disk.

    Args:
        w2v_path: Path of the binary word2vec file to load.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    print('===> Loading Word2Vec model...')
    # Use the caller-supplied path; the previous body read an undefined
    # global (WORD2VEC_MODEL) and ignored this argument.
    return KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [22]:

# mapping pretrained coefficients to dataset
def create_embedding_matrix(word_indices, embeddings, embedding_dimension=None):
    """Build the embedding weight matrix for the dataset vocabulary.

    Row ``i`` holds the pretrained vector of the word whose index is ``i``.
    Words without a pretrained vector keep an all-zero row, as does every row
    that no word maps to (including row 0).

    Args:
        word_indices: Dict mapping each word to its integer index.
        embeddings: Dict mapping words to their pretrained vectors.
        embedding_dimension: Length of each vector. ``None`` (the default)
            uses the module-level ``EMBEDDING_DIMENSION``.

    Returns:
        A ``(WORDS_NUM, embedding_matrix)`` tuple: the number of rows and the
        ``(WORDS_NUM, embedding_dimension)`` numpy array itself.
    """
    if embedding_dimension is None:
        embedding_dimension = EMBEDDING_DIMENSION
    # Size by the largest index as well as the word count, so a mapping with
    # gaps in its indices cannot write past the end of the matrix.
    highest_index = max(word_indices.values(), default=0)
    WORDS_NUM = max(len(word_indices), highest_index) + 1
    embedding_matrix = np.zeros((WORDS_NUM, embedding_dimension))
    for word, i in word_indices.items():
        word_vector = embeddings.get(word)
        if word_vector is not None:
            embedding_matrix[i] = word_vector

    return WORDS_NUM, embedding_matrix


def get_model(WORDS_NUM, embedding_matrix, MAX_SEQUENCE_LENGTH):
    """Assemble the (uncompiled) convolutional classifier.

    The stack is: an Embedding layer initialised from ``embedding_matrix``
    and frozen, three Conv1D layers (256, 128 and 64 filters, kernel size 3,
    'same' padding), global max pooling, dropout, a 180-unit sigmoid Dense
    layer, dropout again, and a single sigmoid output unit.

    Args:
        WORDS_NUM: Number of rows of the embedding matrix.
        embedding_matrix: Initial weights for the Embedding layer.
        MAX_SEQUENCE_LENGTH: Length of each padded input sequence.

    Returns:
        The assembled ``Sequential`` model; the caller compiles it.
    """
    model = Sequential()
    layer_stack = (
        Embedding(WORDS_NUM,
                  EMBEDDING_DIMENSION,
                  weights=[embedding_matrix],
                  input_length=MAX_SEQUENCE_LENGTH,
                  trainable=False),
        Conv1D(256, 3, padding='same', activation='relu'),
        Conv1D(128, 3, padding='same', activation='relu'),
        # Unlike the two above, this convolution is given no activation argument.
        Conv1D(64, 3, padding='same'),
        GlobalMaxPooling1D(),
        Dropout(0.4),
        Dense(180, activation='sigmoid'),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    )
    for layer in layer_stack:
        model.add(layer)

    return model

def get_model_LSTM(WORDS_NUM, embedding_matrix, MAX_SEQUENCE_LENGTH):
    """Assemble the (uncompiled) recurrent classifier.

    The stack is: an Embedding layer initialised from ``embedding_matrix``
    and frozen, a 128-unit LSTM, dropout of 0.5, and a single sigmoid output
    unit.

    Args:
        WORDS_NUM: Number of rows of the embedding matrix.
        embedding_matrix: Initial weights for the Embedding layer.
        MAX_SEQUENCE_LENGTH: Length of each padded input sequence.

    Returns:
        The assembled ``Sequential`` model; the caller compiles it.
    """
    model = Sequential()
    layer_stack = (
        Embedding(WORDS_NUM,
                  EMBEDDING_DIMENSION,
                  weights=[embedding_matrix],
                  input_length=MAX_SEQUENCE_LENGTH,
                  trainable=False),
        LSTM(128),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    )
    for layer in layer_stack:
        model.add(layer)

    return model

In [4]:
# Load the pretrained word vectors into a word -> vector dict. The recorded
# output below reports about 1.19M vectors.
print('===> Reading GloVe words embeddings\n')
embeddings = read_embeddings()

===> Reading GloVe words embeddings

===> Using 1193514 embedding vectors



In [5]:
# Read the tweet/sentiment columns from the CSV, then encode each tweet as a
# list of word indices; word_indices is the word -> index map used later to
# build the embedding matrix.
tweets, labels = read_corpus_dataset("new_data.csv")
sequences, word_indices = tokenize_dataset(tweets, VOCABULARY_SIZE)

===> Number of words in dataset: 150443


In [6]:
# (train_summary, train_review_text, train_labels), (test_summary, test_review_text, test_labels) = load_amazon_smaller()
# (X_train, y_train), (X_test, y_test) = (train_summary, train_labels), (test_summary, test_labels)

In [7]:
# Use the length of the longest encoded tweet as the common sequence length.
MAX_SEQUENCE_LENGTH = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [8]:
print('===> Data shape: {}'.format(padded_sequences.shape))
print('===> Labels shape: {}'.format(labels.shape))
# Map label value 4 to 1 so the targets suit the single sigmoid output trained
# with binary cross-entropy (presumably the corpus encodes the positive class
# as 4 - confirm against the dataset). Rebinding to a new Series avoids a
# masked in-place write on a column sliced from the source DataFrame and keeps
# the cell idempotent.
labels = labels.replace(4, 1)

===> Data shape: (200000, 46)
===> Labels shape: (200000,)


In [9]:
# Hold out 20% of the tweets. The fixed seed makes the split itself
# reproducible across kernel restarts (training remains stochastic).
X_train, X_test, y_train, y_test = train_test_split(padded_sequences,
                                                    labels,
                                                    test_size=0.2,
                                                    random_state=42)

In [10]:
# Build the Embedding-layer weight matrix from the pretrained vectors;
# WORDS_NUM is its row count and is passed to the model builders.
WORDS_NUM, embedding_matrix = create_embedding_matrix(word_indices,
                                                          embeddings)

In [11]:
# Build the convolutional classifier and compile it for binary classification
# (binary cross-entropy loss, Adam optimizer, accuracy metric), then print the
# layer summary.
model = get_model(WORDS_NUM, embedding_matrix, MAX_SEQUENCE_LENGTH)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 46, 100)           15044400  
_________________________________________________________________
conv1d (Conv1D)              (None, 46, 256)           77056     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 46, 128)           98432     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 46, 64)            24640     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 180)               1

In [17]:
# Train for 5 epochs with batches of 128.
# NOTE(review): the held-out test split is also passed as validation_data, so
# the per-epoch validation numbers are computed on the same data as the final
# classification report - consider a separate validation split.
model.fit(X_train,
              y_train,
              batch_size=128,
              epochs=5,
              validation_data=(X_test, y_test),
              verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f65fcf55f98>

In [18]:
# Turn the sigmoid scores into hard 0/1 predictions: anything below 0.5
# becomes 0 and everything else becomes 1, keeping the scores' dtype.
scores = model.predict(X_test)
pred = (~(scores < 0.5)).astype(scores.dtype)

In [19]:
from sklearn.metrics import classification_report

In [20]:
# Per-class precision, recall and F1 of the thresholded predictions against
# the held-out labels.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.78      0.79      0.78     19973
           1       0.79      0.78      0.78     20027

    accuracy                           0.78     40000
   macro avg       0.78      0.78      0.78     40000
weighted avg       0.78      0.78      0.78     40000



In [23]:
# Build and compile the LSTM variant with the same loss, optimizer and metric
# as the convolutional model. This rebinds `model`, replacing the trained CNN.
# NOTE(review): the NameError recorded under this cell mentions `vocab_size`,
# which neither this cell nor the visible get_model_LSTM definition uses; the
# output looks stale - re-run from a fresh kernel to confirm.
model = get_model_LSTM(WORDS_NUM, embedding_matrix, MAX_SEQUENCE_LENGTH)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

NameError: name 'vocab_size' is not defined