In [17]:
from pathlib import Path
%pip install -U tensorflow-addons
%pip show numpy

Note: you may need to restart the kernel to use updated packages.
Name: numpy
Version: 1.19.2
Summary: NumPy is the fundamental package for array computing with Python.
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: None
License: BSD
Location: c:\users\daniel_janos_robert\anaconda3\envs\bow\lib\site-packages
Requires: 
Required-by: tensorflow, tensorboard, seaborn, scipy, scikit-learn, pandas, opt-einsum, mkl-random, mkl-fft, matplotlib, Keras, Keras-Preprocessing, Keras-Applications, h5py
Note: you may need to restart the kernel to use updated packages.


In [18]:
import pandas as pd
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import tensorflow_addons as tfa

LABELS = ['positive', 'neutral', 'negative']

SAVE_MODEL = True
DUMP_DIRECTORY = 'created_models'

TRAIN_RANDOM_STATE = 42
DEV_RANDOM_STATE = 42

DROP=0.2
LAYER_1_DENSITY = 1024
LAYER_2_DENSITY = 512
LAYER_3_DENSITY = 256
ACTIVATION = 'sigmoid'

EMBEDDING_DIMENSIONS = 100
LAST_DIMENSIONS = 3 #based on labels
LAST_ACTIVATION = 'softmax'
MAX_LENGTH = 1000

NUM_EPOCHS = 50
BATCH_SIZE = 600
COMPILE_LOSS = 'categorical_crossentropy'
COMPILE_METRICS = ['accuracy']

GLOVE_DIMENSIONS = 100


In [19]:
import os
from datetime import datetime

import joblib


def dump_file(o, object_name, name):
    if not os.path.exists(DUMP_DIRECTORY):
        os.mkdir(DUMP_DIRECTORY)
    time = datetime.now().strftime("%d%b%Y%H%M%S")
    created_model_path = DUMP_DIRECTORY + '/' + name + '_' + time + '.dump'
    joblib.dump(o, created_model_path)

In [20]:
from nltk.corpus import stopwords
from sklearn.utils import shuffle

HEADER = ['id1', 'id2', 'sentiment', 'tweet_text']
HEADER_TO_DELETE = ['id1', 'id2']

def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    whitelist = ["n't", "not", "no"]
    words = input_text.split()
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1]
    return " ".join(clean_words)

def remove_mentions(input_text):
    URL_RE = 'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}([-a-zA-Z0-9()@:%_+.~#?&/=]*)'
    input_text = re.sub(URL_RE, '', input_text)
    input_text = re.sub(r'@\w+', '', input_text)
    return re.sub(r'#\w+', '', input_text)

def clean(dataset, col='tweet_text', not_equals_text='Not Available'):
    return dataset[dataset[col] != not_equals_text]

def merge_neutrals(dataset):
    neutral_sentiments = ['objective', 'objective-OR-neutral']
    dataset['sentiment'] = dataset['sentiment'].apply(lambda x: 'neutral' if x in neutral_sentiments else x)
    return dataset

def evaluate(predict, labels):
    print('Classification report:')
    print(classification_report(labels, predict))
    print('Accuracy:')
    print(accuracy_score(labels, predict))

    print('Confusion matrix:')
    df_cm = pd.DataFrame(confusion_matrix(labels, predict),
                         index=[i for i in ['positive', 'neutral', 'negative']],
                         columns=[i for i in ['positive', 'neutral', 'negative']])
    plt.figure(figsize=(10,7))
    hm = sn.heatmap(df_cm, annot=True, fmt='g', cmap="Blues")
    hm.set(ylabel='True label', xlabel='Predicted label')
    plt.show()


train_dataset = pd.read_csv('db/train.tsv', sep='\t', header=None, names=HEADER)
train_dataset = shuffle(train_dataset, random_state=TRAIN_RANDOM_STATE)
train_dataset = clean(train_dataset)
train_dataset = merge_neutrals(train_dataset)
train_dataset.drop(HEADER_TO_DELETE, axis=1, inplace=True)
train_dataset['tweet_text'] = train_dataset['tweet_text'].apply(remove_stopwords).apply(remove_mentions)
train_dataset.info()

dev_dataset = pd.read_csv('db/dev-full.tsv', sep='\t', header=None, names=HEADER)
dev_dataset = shuffle(dev_dataset, random_state=DEV_RANDOM_STATE)
dev_dataset = clean(dev_dataset)
dev_dataset = merge_neutrals(dev_dataset)
dev_dataset.drop(HEADER_TO_DELETE, axis=1, inplace=True)
dev_dataset['tweet_text'] = dev_dataset['tweet_text'].apply(remove_stopwords).apply(remove_mentions)
dev_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6275 entries, 5821 to 7270
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentiment   6275 non-null   object
 1   tweet_text  6275 non-null   object
dtypes: object(2)
memory usage: 147.1+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1051 entries, 65 to 1126
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sentiment   1051 non-null   object
 1   tweet_text  1051 non-null   object
dtypes: object(2)
memory usage: 24.6+ KB


In [21]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

def evaluate(predict, labels, history):
    print('Classification report:')
    print(classification_report(labels, predict))
    print('Accuracy:')
    print(accuracy_score(labels, predict))

    print('Confusion matrix:')
    df_cm = pd.DataFrame(confusion_matrix(labels, predict),
                         index=[i for i in ['positive', 'neutral', 'negative']],
                         columns=[i for i in ['positive', 'neutral', 'negative']])
    plt.figure(figsize=(10,7))
    hm = sn.heatmap(df_cm, annot=True, fmt='g', cmap="Blues")
    hm.set(ylabel='True label', xlabel='Predicted label')
    plt.show()

    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train acc', 'val acc'], loc='upper left')
    plt.show()

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train loss', 'val loss'], loc='upper left')
    plt.show()

x_train = train_dataset['tweet_text'].values
y_train = train_dataset['sentiment'].values

x_dev = dev_dataset['tweet_text'].values
y_dev = dev_dataset['sentiment'].values

train_dataset.head(5)

Unnamed: 0,sentiment,tweet_text
5821,neutral,I'm bout listen nicki minaj night
2833,positive,see C. Edwards anything racing hard Thurs Due...
2543,positive,fux wit yo 3rd choice Gifted Hands (the Ben C...
1737,positive,"First time listing Red: ""Does diva think Avril..."
6582,positive,Congrats Lloyd Robertson Gordon Sinclair Awar...


In [22]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# tk = Tokenizer(num_words=NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{"}~\t\n',
#               lower=True, split=" ")

tk = Tokenizer()
all_tweets = np.append(x_train, x_dev)
tk.fit_on_texts(all_tweets)

vocab_size = len(tk.word_index) + 1

x_train_seq = tk.texts_to_sequences(x_train)
x_dev_seq = tk.texts_to_sequences(x_dev)

x_train_seq_trunc = pad_sequences(x_train_seq, maxlen=MAX_LENGTH, padding='post')
x_dev_seq_trunc = pad_sequences(x_dev_seq, maxlen=MAX_LENGTH, padding='post')

In [23]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_dev_le = le.fit_transform(y_dev)
y_train_categorical = to_categorical(y_train_le)
y_dev_categorical = to_categorical(y_dev_le)

In [24]:
from keras import models
from keras import layers

def create_model(embedding_layer):
    sequence_input = models.Input(shape=(MAX_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = layers.Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = layers.MaxPooling1D(5)(x)
    x = layers.Conv1D(128, 5, activation='relu')(x)
    x = layers.MaxPooling1D(5)(x)
    x = layers.Conv1D(128, 5, activation='relu')(x)
    x = layers.MaxPooling1D(35)(x)
    x = layers.Flatten()(x)
    x = layers.Dense(128, activation='relu')(x)
    preds = layers.Dense(LAST_DIMENSIONS, activation='softmax')(x)
    return sequence_input, preds

embedding_layer = layers.Embedding(vocab_size, GLOVE_DIMENSIONS, input_length=max_length)
sequence_input, preds = create_model(embedding_layer)

emb_model = models.Model(sequence_input, preds)
emb_model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1000)]            0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1000, 100)         1510900   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 35, 128)          

In [25]:
tqdm_callback = tfa.callbacks.TQDMProgressBar()
from tensorflow.python.keras.callbacks import EarlyStopping

def deep_model(model, x_train, y_train, x_dev, y_dev):
    model.compile(loss=COMPILE_LOSS, metrics=COMPILE_METRICS, optimizer='adam')

    history = model.fit(x_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                        validation_data=(x_dev, y_dev), verbose=0, callbacks=[EarlyStopping(patience=2), tqdm_callback])
    # TODO: shuffle -> random state, with seeds
    # if SAVE_MODEL:
    #   dump_file(model, 'neural net', 'deep_model')

    result = model.evaluate(x_dev, y_dev)
    predict = model.predict_classes(x_dev)

    return history, result, predict


In [26]:
emb_history, emb_result, emb_predict = deep_model(emb_model, x_train_seq_trunc, y_train_categorical,
                         x_dev_seq_trunc, y_dev_categorical)

Training:   0%|           0/50 ETA: ?s,  ?epochs/s

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


0/11           ETA: ?s - 

0/11           ETA: ?s - 

0/11           ETA: ?s - 

0/11           ETA: ?s - 

0/11           ETA: ?s - 

0/11           ETA: ?s - 

AttributeError: 'Functional' object has no attribute 'predict_classes'

In [None]:
print(emb_result)
evaluate(emb_predict, y_dev_le, emb_history)

In [None]:
glove_file = 'glove.twitter.27B.' + str(GLOVE_DIMENSIONS) + 'd.txt'
emb_dict = {}
glove = open(Path('./db') / glove_file, encoding="utf8")
for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector
glove.close()

In [None]:

emb_matrix = np.zeros((vocab_size, GLOVE_DIMENSIONS))

for w, i in tk.word_index.items():
    if i < vocab_size:
        vect = emb_dict.get(w)
        if vect is not None:
            emb_matrix[i] = vect
    else:
        break

In [None]:
embedding_layer = layers.Embedding(vocab_size, GLOVE_DIMENSIONS, input_length=max_length,
                                   weights=[emb_matrix], trainable=False)
sequence_input, preds = create_model(embedding_layer)

glove_model = models.Model(sequence_input, preds)
glove_model.summary()

In [None]:
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False

glove_history, glove_result, glove_predict = deep_model(glove_model, x_train_seq_trunc, y_train_categorical,
                                                        x_dev_seq_trunc, y_dev_categorical)

In [None]:
print(glove_result)
evaluate(glove_predict, y_dev_le, glove_history)