In [1]:
import tweepy
import twint
import nltk
from nltk.corpus import stopwords
#from nltk.stem import PorterStemmer
from nltk.stem import Cistem
#from nltk.stem.snowball import GermanStemmer
from nltk.tokenize import TweetTokenizer
#import re
import regex as re
import emoji
import datetime
import glob
import io
import os
import random
import string
import unicodedata
import time
import sys
import pandas as pd
import numpy as np
from scipy import stats
import sklearn
from sklearn.preprocessing import QuantileTransformer
from sklearn.mixture import GaussianMixture as GMM
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from pprint import pprint

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

import tqdm

# Workaround for Jupyter's "This event loop is already running" exception.
# The notebook kernel already runs an asyncio event loop; nest_asyncio.apply()
# patches asyncio so that code which starts its own loop can run inside it.
# NOTE(review): presumably needed for the `twint` import above - nothing in the
# cells visible here starts an event loop itself; confirm before removing.
import nest_asyncio
nest_asyncio.apply()

# Application
### Or: sequential execution of the trained models on a given string.

In [2]:
# --- Data set ----------------------------------------------------------------
# Twitter handle of the analysed account; it is also the directory prefix used
# below when loading the pickled embedding and the model checkpoints.
handle = "Erdayastronaut"

# --- Hyperparameters ---------------------------------------------------------
# NOTE(review): seed, batch_size, buffer_size, window_size and num_ns are not
# read by any cell visible here; presumably they mirror the training notebooks.
seed = 42
batch_size = 32  # presumably 16 and 4 were tried earlier
buffer_size = 10_000
window_size = 4
num_ns = 4  # Number of negative samples
sequence_length = 20  # output_sequence_length of the TextVectorization layer

# Placeholder: must be equal to the dimension of the trained embedding
# (e.g. 128); it is overwritten once the embedding has been loaded.
embedding_dim = 0

# --- Trained models ----------------------------------------------------------
# TODO: Save entire models so that we don't have to copy&paste between notebooks
# nor redefine and reinstantiate every time...
#
# In this example on the Erdayastronaut dataset, we used the
# CNN-Multichannel-Classifier and both basic regressors. The placeholders are
# replaced by the real models in the cells further down.
classifier = regressor0 = regressor1 = None

In [8]:
# The input. The untouched text is kept in `tweet_raw` for the final report,
# while `tweet` is overwritten by the processing steps that follow.
tweet_raw = "Hello, everybody! I love Elon Musk and SpaceX."
tweet = tweet_raw

# Preprocess the tweet
def preprocess(t, lang='en'):
    """
    Clean a raw tweet and return its stemmed, stop-word-free tokens as one string.

    Based on the preprocessing functions in the Miner notebook.

    Processing order: drop '#' characters, mentions, URLs, emojis, punctuation
    (including '_') and every character whose Unicode category starts with 'C';
    tokenize in lower case; discard stop words; stem the remaining tokens with
    Cistem.

    For lang='de' the file "stop_words_german.txt" must exist in the working
    directory; its lines are added to NLTK's German stop words.

    :param t: Raw tweet text.
    :param lang: Language of the stop-word list, 'en' (default) or 'de'.
    :return: The remaining stems joined by single spaces, or None (after
             printing a message) if lang is not supported.
    """
    # Remove Mentions
    mentions = re.compile(r'@\w*')

    # Remove URLs
    # NOTE(review): '.*' is greedy, so everything that follows a URL on the same
    # line is dropped too, not only the URL. Kept unchanged to stay in step with
    # the Miner notebook's preprocessing - confirm whether this is intended.
    urls = re.compile(r'https?:\/\/.*[\r\n]*')

    # Remove emojis
    # emoji < 2.0 exposes the per-language table UNICODE_EMOJI; it was removed
    # in emoji 2.0, where EMOJI_DATA is keyed by the emoji characters instead.
    legacy_table = getattr(emoji, 'UNICODE_EMOJI', None)
    if legacy_table is not None:
        es = legacy_table['en'].keys()
    else:
        es = emoji.EMOJI_DATA.keys()
    emojis = re.compile('|'.join(re.escape(e) for e in es))

    # Remove punctuations
    punctuations = re.compile(r'[^\w\s]|_')

    t = t.replace("#", "")
    t = mentions.sub(r'', t)
    t = urls.sub(r'', t)
    t = emojis.sub(r'', t)
    t = punctuations.sub(r'', t)

    # Remove control sequences
    # Adapted from https://stackoverflow.com/a/19016117
    t = "".join(ch for ch in t if unicodedata.category(ch)[0] != "C")

    # Tokenize
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    if lang == 'de':
        stopwords_ = stopwords.words('german')
        # The context manager closes the file even if reading it fails.
        with open("stop_words_german.txt", "r") as stopwords_file:
            stopwords_ += stopwords_file.read().split("\n")
        # Words that the lists above treat as stop words but that are kept here.
        no_stopwords = {'mann', 'mensch', 'menschen', 'recht', 'rechte', 'rechten', 'rechter', 'rechtes'}
        additional_stopwords = ['mal', 'halt', 'sagen', 'schon', 'lassen', 'danke', 'bitte', 'einfach', 'eigentlich', 'schon', 'sich']
        stopwords_ += additional_stopwords
        # A set gives constant-time membership tests in the token filter below.
        stopwords_ = {x for x in stopwords_ if x not in no_stopwords}
    elif lang == 'en':
        stopwords_ = set(stopwords.words('english'))
    else:
        print("Language not supported yet, please add it to this function...")
        return None

    # NOTE(review): Cistem is NLTK's German stemmer but is applied to English
    # as well (e.g. "love" -> "lov"). Left unchanged because the embedding
    # vocabulary appears to be built from exactly this output - confirm before
    # swapping the stemmer.
    stemmer = Cistem()

    tokens = tokenizer.tokenize(t)
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stopwords_)

# Clean the example tweet with the English pipeline and show the result.
tweet = preprocess(tweet, lang='en')
print(tweet)

hello everybody lov elo musk spacex


In [9]:
# Convert the string to its integer representation
# NOTE: unpickling can execute arbitrary code - only load files that were
# written by our own preprocessing notebooks.
emb_df = pd.read_pickle('{}/preproc/gensim_w2v_dict.pkl'.format(handle))

# The TextVectorization layer prepends two entries ('' and [UNK]) to the given
# vocabulary, so the weight matrix is shifted by two by prepending all-zero rows.
# This could lead to problems if we later try to infer on new strings containing
# words which are out of vocabulary... (?)
word_vectors = emb_df["vec"]
# .iloc is positional, so this does not depend on the pickled frame's index labels.
embedding_dim = len(word_vectors.iloc[0])

data = [np.zeros(embedding_dim) for _ in range(2)]
embedding_matrix = pd.concat([pd.Series(data), word_vectors], ignore_index=True)

vectorize_layer = TextVectorization(output_sequence_length=sequence_length,
                                    vocabulary=list(emb_df["word"]))

vocab_size = len(vectorize_layer.get_vocabulary())

# The Embedding layers below are created with vocab_size rows and initialised
# from embedding_matrix, so both must line up exactly.
assert vocab_size == len(embedding_matrix), (
    "vocabulary size ({}) and embedding rows ({}) differ".format(vocab_size, len(embedding_matrix))
)

def vectorize_text(text, label):
    """
    Turn a (features, label) pair into (token ids, label).

    NOTE(review): not called by any cell visible here; presumably kept for
    mapping over a dataset as in the training notebooks - confirm.

    :param text: Mapping whose 'tweet' entry holds the text to vectorize.
    :param label: Returned unchanged.
    :return: Tuple of vectorize_layer's output for text['tweet'] and label.
    """
    token_ids = vectorize_layer(text['tweet'])
    return token_ids, label

# Wrap the cleaned tweet in a batch of one and map it to its token ids.
# `tweet` is overwritten here, so re-run the preprocessing cell before
# executing this cell a second time.
tweet = vectorize_layer(tf.constant([tweet]))

print(tweet)

tf.Tensor(
[[ 998 1985   59  620  675   17    0    0    0    0    0    0    0    0
     0    0    0    0    0    0]], shape=(1, 20), dtype=int64)


In [6]:
# Run the classifier and keep the result
def make_multichannel_cnn(sl=20, output_bias=None):
    """
    Build the multichannel CNN classifier for vectorized tweets.

    Architecture (after https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/):
    1. One shared, trainable embedding ("embedding1").
    2. Three parallel channels on top of it, each Conv1D -> Dropout ->
       MaxPooling1D -> bidirectional LSTM -> Flatten; the channels differ only
       in the convolution kernel size (3, 6 and 8).
    3. The concatenated channels are interpreted by Dense(256) -> Dense(10) ->
       Dense(1, sigmoid).

    Reads the module-level names vocab_size, embedding_dim and embedding_matrix.

    :param sl: Input sequence length
    :param output_bias: Optional constant used to initialise the bias of the
                        output layer
                        (see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data).
    :return: The Keras Model named "Classificator"; it is not compiled here.
    """
    bias_init = None if output_bias is None else tf.keras.initializers.Constant(output_bias)

    tokens = tf.keras.Input(shape=(sl,), dtype=tf.int32, name="tweet")

    # Weights should be initialized after defining the model
    # due to protobuf's limit of 2GB:
    # ValueError: Message tensorflow.SavedModel exceeds maximum protobuf size of 2GB: 6768286642
    embedded = layers.Embedding(vocab_size, embedding_dim,
                                embeddings_initializer=Constant(list(embedding_matrix)),
                                trainable=True,
                                name="embedding1",
                                mask_zero=True)(tokens)

    def channel(kernel_size):
        # One feature-extraction branch over the shared embedding. The layers
        # are created in the same order as in the hand-written version.
        conv = layers.Conv1D(filters=16, kernel_size=kernel_size, activation='relu')(embedded)
        dropped = layers.Dropout(0.5)(conv)
        pooled = layers.MaxPooling1D(pool_size=2)(dropped)
        recurrent = layers.Bidirectional(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.0))(pooled)
        return layers.Flatten()(recurrent)

    # Merge
    merged = layers.concatenate([channel(kernel_size) for kernel_size in (3, 6, 8)])

    # Interpretation
    hidden = layers.Dense(256, activation='relu')(merged)
    hidden = layers.Dense(10, activation='relu')(hidden)
    outputs = layers.Dense(1, activation='sigmoid', bias_initializer=bias_init)(hidden)

    return Model(inputs=tokens, outputs=outputs, name="Classificator")

# Rebuild the architecture; the trained parameters are restored in the next step.
classifier = make_multichannel_cnn(sl=sequence_length)

# Load the weights from the last checkpoint in classifier.ipynb
classifier.load_weights('{}/models/classificator_best.tf'.format(handle))


# Score the vectorized tweet. For the single tweet used here the result is a
# nested array holding one sigmoid output.
result = classifier.predict(tweet)
print(result)

[[0.4348981]]


In [11]:
# Now run one of the two regressors, depending on the tweet's predicted class
def make_basic(sl=20):
    """
    Build the basic sequential regressor.

    Layer stack: Input -> frozen Embedding (initialised from the module-level
    embedding_matrix, with vocab_size rows and embedding_dim columns) ->
    Dense(64, relu) -> LSTM(64) -> Flatten -> Dense(1).

    :param sl: Input sequence length
    :return: The Sequential model; it is not compiled here.
    """
    model = Sequential()
    model.add(layers.Input(shape=(sl,), dtype=tf.int32))
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               embeddings_initializer=Constant(list(embedding_matrix)),
                               trainable=False,
                               mask_zero=True,
                               name="embedding"))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.LSTM(64))
    model.add(layers.Flatten())
    model.add(layers.Dense(1))
    return model


# Classifier outputs at or below this value are treated as class 0, larger ones as class 1.
threshold = 0.25553209091655876 # Erdayastronaut (from the classifier notebook)

regressor0 = make_basic(sl=sequence_length)
regressor1 = make_basic(sl=sequence_length)

regressor0.load_weights('{}/models/regressor0_best.tf'.format(handle))
regressor1.load_weights('{}/models/regressor1_best.tf'.format(handle))

# `result` is a nested array holding a single probability. Extract it as a
# Python float instead of branching on the truth value of an array; .item()
# raises if `result` ever contains more than one prediction.
class_probability = result.item()

if class_probability <= threshold:
    predicted_class, regressor = 0, regressor0
else:
    predicted_class, regressor = 1, regressor1

print("Predicted class {0}... running regressor {0}:".format(predicted_class))
rating = regressor.predict(tweet)

print()
print()
print("Predicted Rating for tweet: {}".format(tweet_raw))
print(rating)

Predicted class 1... running regressor 1:


Predicted Rating for tweet: Hello, everybody! I love Elon Musk and SpaceX.
[[0.51202065]]
