In [None]:
import tweepy
import twint
import nltk
from nltk.corpus import stopwords
#from nltk.stem import PorterStemmer
from nltk.stem import Cistem
#from nltk.stem.snowball import GermanStemmer
from nltk.tokenize import TweetTokenizer
#import re
import regex as re
import emoji
import datetime
import glob
import io
import os
import random
import string
import unicodedata
import time
import sys
import pandas as pd
import numpy as np
from scipy import stats
import sklearn
from sklearn.preprocessing import QuantileTransformer
from sklearn.mixture import GaussianMixture as GMM
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
from pprint import pprint

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

import tqdm

# Workaround for Jupyter's "This event loop is already running" exception
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Report whether TensorFlow can see a GPU (an empty device name is treated as "none").
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("GPU not available.")
else:
    print(gpu_name)
    print("GPU available.")

# Show the installed numpy version: the LSTM layers below were only found to
# work with numpy 1.19.5.
print(np.__version__)

# Placeholder; nothing in this cell assigns a real value to it.
tweets = None


In [None]:
#handle = "gecko203"
#handle = "FiatPecunia"
handle = "Erdayastronaut"  # Account whose data directory ("<handle>/preproc/...") is read
batch_size = 32 #16 #4 #= 32
buffer_size = 10000  # Shuffle buffer used while reading the CSV files
seed = 42  # Shuffle seed, so that every run reads the rows in the same order
window_size = 4
embedding_dim = 0 #128 # Must be equal to the dimension of the trained embedding; is set later on
num_ns = 4 # Number of negative samples
sequence_length = 20 #10 TODO: Set to the longest tweet!


def _load_split(name):
    """
    Read one pre-processed CSV split of the current account as a batched dataset.

    Only the 'tweet' column (feature) and the 'bin' column (label) are loaded.
    Rows are shuffled with the notebook-level ``seed`` and ``buffer_size`` so
    the order is reproducible between runs.

    :param name: File name without extension inside "<handle>/preproc/"
    :return: A ``tf.data.Dataset`` yielding (features dict, label) batches for one epoch
    """
    return tf.data.experimental.make_csv_dataset(
        "{}/preproc/{}.csv".format(handle, name), batch_size=batch_size,
        label_name="bin", select_columns=['bin', 'tweet'],
        num_epochs=1,
        shuffle_buffer_size=buffer_size, shuffle_seed=seed)


tweets_dataset = _load_split("dataset")

train_dataset = _load_split("train")
test_dataset = _load_split("test")
val_dataset = _load_split("val")

In [None]:
# Show the first training batch: every feature column, then the labels.
for feature_batch, label in train_dataset.take(1):
    for key in feature_batch:
        value = feature_batch[key]
        print(f"{key:20s}: {value}")
    print()
    print(f"{'label':20s}: {label}")

### Load the Word2Vec-Embedding

In [None]:
# Load the pickled embedding table of the current account. The cells below read
# its "word" and "vec" columns.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced locally by a trusted pre-processing step.
emb_df = pd.read_pickle('{}/preproc/gensim_w2v_dict.pkl'.format(handle))
emb_df.head()

In [None]:
print(len(emb_df["word"]))

# Since the StringLookup-Layer or the TextVectorization-Layer prepend two new words ('' and [UNK]),
# we need to "shift" the weight matrix by two by prepending "empty" rows...
# This could lead to problems if we later try to infer on new strings containing
# words which are out of vocabulary... (?)

# Positional access (.iloc) instead of the label 0, so this also works when the
# pickled frame does not carry a 0-based index.
embedding_matrix = emb_df["vec"]
embedding_dim = len(embedding_matrix.iloc[0])
print(embedding_dim)

print(type(emb_df["vec"].iloc[0]))
# Two all-zero rows: one for the padding token '' and one for [UNK].
data = [np.zeros(embedding_dim), np.zeros(embedding_dim)]
embedding_matrix = pd.concat([pd.Series(data), embedding_matrix], ignore_index=True)
print(embedding_matrix.head())

# Maps whitespace-separated words to their vocabulary index and pads/truncates
# every tweet to `sequence_length` ids.
vectorize_layer = TextVectorization(output_sequence_length=sequence_length,
                                    vocabulary=list(emb_df["word"]))

vocab_size = len(vectorize_layer.get_vocabulary())

# Each vocabulary index needs exactly one embedding row; fail here rather than
# later inside the Embedding layer if the two ever get out of step.
assert len(embedding_matrix) == vocab_size, (
    "embedding matrix has {} rows but the vocabulary has {} entries".format(
        len(embedding_matrix), vocab_size))

print(vocab_size)
print(vectorize_layer.get_vocabulary()[:3])


In [None]:
# Quick manual check of the vectorization layer on a hand-written sentence.
# NOTE(review): "honclbrif" is presumably an out-of-vocabulary word used to see
# the [UNK] index - confirm.
data = tf.constant(["frau man honclbrif mal"])
vectorize_layer(data)

In [None]:
# Index -> word list of the vectorization layer; print its size and the first
# 20 entries.
inverse_vocab = vectorize_layer.get_vocabulary()
print(len(inverse_vocab))
print(inverse_vocab[:20])

In [None]:
def vectorize_text(text, label):
    """
    Dataset map function: replace the raw tweet strings by their token ids.

    :param text: Feature dict of one batch; only its 'tweet' entry is used
    :param label: Labels of the batch, passed through untouched
    :return: Tuple (output of ``vectorize_layer`` for the tweets, label)
    """
    token_ids = vectorize_layer(text['tweet'])
    return token_ids, label

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# The full tweet dataset keeps its raw strings; it is only cached and prefetched.
tweets_dataset = tweets_dataset.cache().prefetch(buffer_size=AUTOTUNE)

# The three splits are first mapped to integer token ids, then cached and
# prefetched.
train_dataset = train_dataset.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)

## Finally define and train our classification model...

In [None]:

def make_basic(sl=20):
    """
    Build a small classifier: embedding -> dense layers -> LSTM -> dense head.

    The embedding is initialised from the notebook-level ``embedding_matrix``
    (``vocab_size`` x ``embedding_dim``) and remains trainable.

    :param sl: Input sequence length
    :return: Uncompiled ``tf.keras.Model`` named "basic_classifier" with one sigmoid output
    """
    token_ids = tf.keras.Input(shape=(sl,), dtype=tf.int32, name="tweet")

    # Layers listed in the order in which they are applied to the input.
    pipeline = [
        layers.Embedding(vocab_size-0, embedding_dim,
                         embeddings_initializer=Constant(list(embedding_matrix)),
                         trainable=True,
                         mask_zero=True,
                         name="embedding"),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(256, activation='relu'),
        layers.LSTM(64),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid'),
    ]

    hidden = token_ids
    for layer in pipeline:
        hidden = layer(hidden)

    return tf.keras.Model(inputs=token_ids, outputs=hidden, name="basic_classifier")

def make_basic2(sl=20):
    """
    Just the word2vec embedding and a single output neuron...

    The embedding is initialised from the notebook-level ``embedding_matrix``
    and is frozen (``trainable=False``).

    :param sl: Input sequence length
    :return: Uncompiled ``tf.keras.Model`` named "embedding_classifier" with one sigmoid output
    """
    # Functional API
    # The input carries integer token ids, not strings: the datasets are
    # vectorized before they reach the model, and an Embedding layer can only
    # look up integer indices.
    input1 = tf.keras.Input(shape=(sl,), dtype=tf.int32, name="tweet")
    emb1   = layers.Embedding(vocab_size-0, embedding_dim,
                              # Weights should be initialized after defining the model
                              # due to protobuf's limit of 2GB:
                              # ValueError: Message tensorflow.SavedModel exceeds maximum protobuf size of 2GB: 6768286642
                              embeddings_initializer=Constant(list(embedding_matrix)),
                              trainable=False,
                              name="embedding",
                              mask_zero=True)(input1)
    x = layers.Flatten()(emb1)
    outputs = layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=input1, outputs=outputs, name="embedding_classifier")

    return model

def make_feedforward_lstm(sl=20, output_bias=None):
    """
    Build a classifier: embedding -> bidirectional LSTM -> dense layer -> sigmoid.

    The embedding is initialised from the notebook-level ``embedding_matrix``
    and remains trainable. The LSTM width is taken from the notebook-level
    ``sequence_length`` (not from ``sl``).

    :param sl: Input sequence length
    :param output_bias: Optional initial value for the bias of the output
                        neuron; if None the Keras default (zeros) is used
    :return: Uncompiled ``tf.keras.Model`` named "feedforward_lstm"
    """
    # Only override the bias initializer when a value was given, so that the
    # default call keeps the standard zero-initialised bias.
    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        bias_initializer = 'zeros'

    inputs = tf.keras.Input(shape=(sl,), dtype=tf.int32, name="tweet")
    emb1   = layers.Embedding(vocab_size-0, embedding_dim,
                              # Weights should be initialized after defining the model
                              # due to protobuf's limit of 2GB:
                              # ValueError: Message tensorflow.SavedModel exceeds maximum protobuf size of 2GB: 6768286642
                              embeddings_initializer=Constant(list(embedding_matrix)),
                              trainable=True,
                              name="embedding1",
                              mask_zero=True)(inputs)

    x = layers.Bidirectional(layers.LSTM(sequence_length))(emb1)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    # The requested output bias is now actually applied to the output neuron.
    outputs = layers.Dense(1, activation='sigmoid',
                           bias_initializer=bias_initializer)(x)

    model = Model(inputs=inputs, outputs=outputs, name="feedforward_lstm")

    return model

def make_stacked_lstm(sl=20, output_bias=None):
    """
    Build a classifier with three stacked LSTM layers on top of the embedding.

    The embedding is initialised from the notebook-level ``embedding_matrix``
    and remains trainable. The LSTM widths are derived from ``embedding_dim``
    (a quarter for the bidirectional layer, half for the two following ones).

    :param sl: Input sequence length
    :param output_bias: Optional initial value for the bias of the output
                        neuron; if None the Keras default (zeros) is used
    :return: Uncompiled ``tf.keras.Model`` named "stacked_lstm"
    """
    # Only override the bias initializer when a value was given, so that the
    # default call keeps the standard zero-initialised bias.
    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        bias_initializer = 'zeros'

    inputs = tf.keras.Input(shape=(sl,), dtype=tf.int32, name="tweet")
    emb1   = layers.Embedding(vocab_size-0, embedding_dim,
                              # Weights should be initialized after defining the model
                              # due to protobuf's limit of 2GB:
                              # ValueError: Message tensorflow.SavedModel exceeds maximum protobuf size of 2GB: 6768286642
                              embeddings_initializer=Constant(list(embedding_matrix)),
                              trainable=True,
                              name="embedding1",
                              mask_zero=True)(inputs)

    # Stacked LSTM
    x = layers.Dropout(0.2)(emb1)
    x = layers.Bidirectional((layers.LSTM(int(embedding_dim / 4), return_sequences=True)))(x)
    x = layers.Dropout(0.2)(x)
    x = layers.LSTM(int(embedding_dim / 2), return_sequences=True)(x)
    x = layers.Dropout(0.2)(x)
    x = layers.LSTM(int(embedding_dim / 2))(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    # The requested output bias is now actually applied to the output neuron.
    outputs = layers.Dense(1, activation='sigmoid',
                           bias_initializer=bias_initializer)(x)

    # Own name instead of the copy-pasted "feedforward_lstm", so summaries and
    # the plot file derived from model.name do not clash with the other model.
    model = Model(inputs=inputs, outputs=outputs, name="stacked_lstm")

    return model

def make_multichannel_cnn(sl=20, output_bias=None):
    """
    Build a multichannel CNN classifier on top of one shared embedding.

    Three parallel channels (Conv1D with kernel sizes 3, 6 and 8, each followed
    by dropout, max-pooling and a bidirectional LSTM) read the same embedding;
    their outputs are concatenated and interpreted by two dense layers.
    The embedding is initialised from the notebook-level ``embedding_matrix``
    and remains trainable.

    Architecture idea from
    https://machinelearningmastery.com/develop-n-gram-multichannel-convolutional-neural-network-sentiment-analysis/
    Output bias handling from
    https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

    :param sl: Input sequence length
    :param output_bias: Optional initial value for the bias of the output neuron
    :return: Uncompiled ``tf.keras.Model`` named "Classificator"
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # 1. Embedding: representation of words and their similarity
    token_ids = tf.keras.Input(shape=(sl,), dtype=tf.int32, name="tweet")
    shared_embedding = layers.Embedding(
        vocab_size-0, embedding_dim,
        # Weights should be initialized after defining the model
        # due to protobuf's limit of 2GB:
        # ValueError: Message tensorflow.SavedModel exceeds maximum protobuf size of 2GB: 6768286642
        embeddings_initializer=Constant(list(embedding_matrix)),
        trainable=True,
        name="embedding1",
        mask_zero=True)(token_ids)

    # 2. Convolutional model: feature extraction, one channel per kernel size
    def build_channel(kernel_size):
        features = layers.Conv1D(filters=16, kernel_size=kernel_size,
                                 activation='relu')(shared_embedding)
        features = layers.Dropout(0.5)(features)
        features = layers.MaxPooling1D(pool_size=2)(features)
        features = layers.Bidirectional(
            layers.LSTM(64, dropout=0.2, recurrent_dropout=0.0))(features)
        return layers.Flatten()(features)

    merged = layers.concatenate([build_channel(size) for size in (3, 6, 8)])

    # 3. Fully connected model: interpretation
    interpreted = layers.Dense(256, activation='relu')(merged)
    interpreted = layers.Dense(10, activation='relu')(interpreted)
    outputs = layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias)(interpreted)

    return Model(inputs=token_ids, outputs=outputs, name="Classificator")

def make_multichannel_cnn_nogensim(sl=20, output_bias=None):
    """
    Multichannel CNN classifier that does not use the pre-trained embedding.

    Each of the three channels (Conv1D kernel sizes 3, 6 and 8) owns a freshly
    initialised 100-dimensional embedding, followed by dropout, max-pooling and
    a bidirectional LSTM. The channel outputs are concatenated and interpreted
    by two dense layers with dropout.

    Output bias handling from
    https://www.tensorflow.org/tutorials/structured_data/imbalanced_data

    :param sl: Input sequence length
    :param output_bias: Optional initial value for the bias of the output neuron
    :return: Uncompiled ``tf.keras.Model`` named "Classificator"
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    token_ids = tf.keras.Input(shape=(sl,), dtype=tf.int32, name="tweet")

    def build_channel(kernel_size):
        # Every channel learns its own embedding of the token ids.
        features = layers.Embedding(vocab_size-0, 100)(token_ids)
        features = layers.Conv1D(filters=16, kernel_size=kernel_size,
                                 activation='relu')(features)
        features = layers.Dropout(0.5)(features)
        features = layers.MaxPooling1D(pool_size=2)(features)
        features = layers.Bidirectional(
            layers.LSTM(20, dropout=0.2, recurrent_dropout=0.0))(features)
        return layers.Flatten()(features)

    # Merge
    merged = layers.concatenate([build_channel(size) for size in (3, 6, 8)])

    # Interpretation
    interpreted = layers.Dense(256, activation='relu')(merged)
    interpreted = layers.Dropout(0.2)(interpreted)
    interpreted = layers.Dense(16, activation='relu')(interpreted)
    interpreted = layers.Dropout(0.2)(interpreted)
    outputs = layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias)(interpreted)

    return Model(inputs=token_ids, outputs=outputs, name="Classificator")



#def tweedie_loglikelihood(y, y_hat):
#    """
#    Implements the Tweedie loss function which is better suited to extremely skewed data like ours.
#
#    From: https://towardsdatascience.com/tweedie-loss-function-for-right-skewed-data-2c5ca470678f
#    """
#    p = 2 # power hyper-parameter
#
#    loss = - y * tf.pow(y_hat, 1 - p) / (1 - p) + \
#            tf.pow(y_hat, 2 - p) / (2 - p)
#    return tf.reduce_mean(loss)

def tweedieloss(y_true, y_pred, p=0.0):
    """
    Mean Tweedie deviance between targets and predictions.

    An alternative loss function which should handle datasets with outliers better...
    It did not help with our problem.
    From: https://datascience.stackexchange.com/a/55393

    With the default ``p=0.0`` the expression reduces to the mean squared error.

    :param y_true: Target values
    :param y_pred: Predicted values
    :param p: Tweedie power parameter; 1 and 2 are not supported because the
              formula divides by (1 - p) and (2 - p)
    :return: Scalar tensor with the mean deviance
    :raises ValueError: If ``p`` is 1 or 2
    """
    if p in (1, 2):
        raise ValueError("tweedieloss is undefined for p=1 and p=2 (division by zero)")

    # Labels often arrive as integers; tf.pow needs both operands in the same
    # floating point type as the predictions.
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)

    dev = 2 * (tf.pow(y_true, 2-p)/((1-p) * (2-p)) -
                   y_true * tf.pow(y_pred, 1-p)/(1-p) +
                   tf.pow(y_pred, 2-p)/(2-p))
    return tf.reduce_mean(dev)

In [None]:
### This model did not work - it predicted constant values for every input!
### -> The reason we need to first classify the input and then perform regression depending on its predicted class.

#model = Sequential([
#    layers.Input(shape=(1,), dtype=tf.string),
#    vectorize_layer,
#    #layers.Embedding(vocab_size, embedding_dim,
#    #                 name="embedding"),
#    layers.Embedding(vocab_size-1, embedding_dim,
#                     embeddings_initializer=Constant(embedding_vectors),
#                     trainable=False,
#                     name="embedding"),
#    #layers.GlobalAvgPool1D(),
#    #layers.Bidirectional(layers.LSTM(512)),
#    #layers.LSTM(64),
#    #layers.Bidirectional(layers.Dense(64)),
#    layers.Dense(64, activation='relu'),
#    #layers.Dense(64, activation='relu'),
#    layers.Dense(1)
#])

#model.compile(loss='mean_absolute_error',
#              optimizer=tf.keras.optimizers.Adam(0.001))


# Training hyper-parameters.
epochs = 20 #100
# Number of epochs without improvement tolerated by early stopping (half of `epochs`).
patience = int(epochs * 0.50) #int(epochs * 0.3)
# NOTE(review): in the visible notebook learning_rate, decay_rate and momentum are
# only referenced by the commented-out SGD optimizer; the active Adam optimizer
# uses its own literal learning rate - confirm before tuning these.
learning_rate = 1e-2 * 5 #1e-4 * 5
decay_rate = learning_rate / (epochs*epochs)
momentum = 0.8

# The threshold for deciding whether prediction values are 1 or 0 should be
# the value where we have split the data into both bins.
#x_split:
#threshold = 0.2754579724298949
#threshold = 0.3
threshold = 0.25553209091655876 # Erdayastronaut
#threshold = 0.5

# Initial bias of the output neuron, specific to the selected account.
# NOTE(review): presumably log(positives / negatives) as in the TensorFlow
# imbalanced-data tutorial - confirm how this value was derived.
#initial_bias = [0.53100804]
initial_bias = [2.89037479] # Erdayastronaut

# Build the classifier; the alternative below uses freshly initialised embeddings.
model = make_multichannel_cnn(sl=sequence_length, output_bias=initial_bias)
#model = make_multichannel_cnn_nogensim(sl=sequence_length, output_bias=initial_bias)


# Compute class weights so that the optimizer doesn't get stuck
# in a local minimum and both classes are balanced.
# All training labels are collected into one array first.
labels = np.concatenate([y for x, y in train_dataset], axis=0)
label_classes = np.unique(labels)
# Keyword arguments: newer scikit-learn releases no longer accept `classes`
# and `y` positionally.
classWeights = compute_class_weight(class_weight='balanced',
                                    classes=label_classes, y=labels)
# Key each weight by its actual class label (not by its position) and use
# plain Python types for Keras' `class_weight` argument.
classWeights = {int(cls): float(weight)
                for cls, weight in zip(label_classes, classWeights)}


# Metrics reported during training and evaluation.
# NOTE(review): only BinaryAccuracy uses the custom `threshold`; the confusion
# counts, precision and recall use the Keras default threshold - confirm that
# this difference is intended.
METRICS = [
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
    tf.keras.metrics.AUC(name='prc', curve='PR'),
    tf.metrics.BinaryAccuracy(threshold=threshold)
]

# The model factories end in a sigmoid neuron, i.e. the model already outputs
# probabilities. from_logits must therefore be False; with True the loss would
# apply a second sigmoid on top of the probabilities.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
#opt = tf.keras.optimizers.SGD(lr=learning_rate, decay=decay_rate, momentum=momentum, nesterov=True) #1e-5
#opt = tf.keras.optimizers.Adamax(learning_rate=1e-04, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
opt = tf.keras.optimizers.Adam(learning_rate=1e-3)

model.compile(loss=loss,
              optimizer=opt,
              metrics=METRICS)

model.summary()

# Write a diagram of the architecture to "<model name>.png".
tf.keras.utils.plot_model(model, show_shapes=True, to_file='{}.png'.format(model.name))

In [None]:
# Show the distinct training labels and the class weights computed for them.
print(np.unique(labels))
print(classWeights)

In [None]:
# Split the test dataset into its feature and label parts and run the model on
# the features. When the notebook is executed top to bottom this happens before
# training, i.e. it only checks that the model accepts the input.
test_features = test_dataset.map(lambda x, y: x)
test_labels = test_dataset.map(lambda x, y: y)
model.predict(test_features)

In [None]:
tf.autograph.set_verbosity(0)
#logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Both callbacks watch the same validation metric.
monitored_metric = 'val_binary_accuracy'
checkpoint_path = '{}/models/classificator_best.tf'.format(handle)

# Stop once the monitored metric has not improved for `patience` epochs.
early_stopping = EarlyStopping(monitor=monitored_metric, verbose=1, patience=patience)
#early_stopping = EarlyStopping(monitor='val_precision', mode='max', verbose=1, patience=patience)

# Keep only the weights of the best epoch seen so far.
checkpoint_options = dict(
    monitor=monitored_metric,
    #monitor='val_precision', mode='max',
    save_format='tf',
    #save_format='h5',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
model_checkpoint = ModelCheckpoint(checkpoint_path, **checkpoint_options)

fit_options = dict(
    validation_data=val_dataset,
    class_weight=classWeights,
    callbacks=[early_stopping, model_checkpoint],
    #steps_per_epoch=steps_per_epoch,
    epochs=epochs,
)
history = model.fit(train_dataset, **fit_options)

In [None]:
# The checkpoint callback stores weights only (save_weights_only=True), so
# there is no complete saved model that load_model() could read. Restore the
# best weights into the already built and compiled model instead.
# NOTE: from here on `model` holds the weights of the best checkpoint, not
# those of the last training epoch; `saved_model` is the same object.
best_weights_path = '{}/models/classificator_best.tf'.format(handle)
model.load_weights(best_weights_path)
saved_model = model

# evaluate() reports the loss plus every compiled metric; request a dict and
# pick the accuracy by name instead of unpacking the list into two values.
train_metrics = saved_model.evaluate(train_dataset, verbose=1, return_dict=True)
test_metrics  = saved_model.evaluate(test_dataset, verbose=1, return_dict=True)
train_acc = train_metrics['binary_accuracy']
test_acc  = test_metrics['binary_accuracy']

print('Train Accuracy: %.3f, Test Accuracy: %.3f' % (train_acc, test_acc))

In [None]:
def plot_loss(hist):
    """
    Plot training and validation loss per epoch.

    :param hist: Object with a ``history`` dict containing 'loss' and
                 'val_loss' (as returned by ``model.fit``)
    :return: None; draws on the current matplotlib figure
    """
    plt.plot(hist.history['loss'], label='loss')
    plt.plot(hist.history['val_loss'], label='val_loss')
    plt.ylim([0, .3])
    # Call the label functions. Assigning to plt.xlabel / plt.ylabel would
    # replace them with strings and break every later plt.xlabel(...) call.
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)

plot_loss(history)

# Evaluate the classifier

In [None]:
# Show the first test batch: every entry of the feature mapping, then the labels.
for feature_batch, label in test_dataset.take(1):
    for key in feature_batch:
        value = feature_batch[key]
        print(f"{key:20s}: {value}")
    print()
    print(f"{'label':20s}: {label}")

In [None]:
# Per-epoch values recorded by model.fit().
history_dict = history.history

acc, val_acc = history_dict['binary_accuracy'], history_dict['val_binary_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# Epochs are numbered from 1 on the x axis.
epochs_ = range(1, len(acc) + 1)

# Dots for the training curve, a solid line for the validation curve.
for curve, line_format, curve_label in ((loss, 'bo', 'Training loss'),
                                        (val_loss, 'b', 'Validation loss')):
    plt.plot(epochs_, curve, line_format, label=curve_label)
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Accuracy curves; `acc`, `val_acc` and `epochs_` come from the loss-plot cell.
for curve, line_format, curve_label in ((acc, 'bo', 'Training acc'),
                                        (val_acc, 'b', 'Validation acc')):
    plt.plot(epochs_, curve, line_format, label=curve_label)
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

In [None]:
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
def plot_metrics(history, metrics=('loss', 'prc', 'precision', 'recall')):
    """
    Plot training and validation curves, one subplot per metric.

    :param history: History object returned by ``model.fit``; for every metric
                    both ``<metric>`` and ``val_<metric>`` must be recorded
    :param metrics: Names of the metrics to plot; the subplots are arranged in
                    two columns (2x2 for the four default metrics)
    :return: None; draws on the current matplotlib figure
    """
    n_cols = 2
    n_rows = max(1, -(-len(metrics) // n_cols))  # ceiling division
    for n, metric in enumerate(metrics):
        name = metric.replace("_"," ").capitalize()
        plt.subplot(n_rows, n_cols, n+1)
        plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_'+metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'auc':
            # Zoom in on the upper range when the AUC is plotted.
            plt.ylim([0.8,1])
        # One legend per subplot instead of a single one on the last subplot.
        plt.legend()

plot_metrics(history)

In [None]:
def plot_cm(labels, predictions, p=0.5):
    """
    Draw the confusion matrix as a heatmap and print its four cells.

    :param labels: True labels
    :param predictions: Predicted scores; a score counts as positive when it is greater than ``p``
    :param p: Decision threshold
    :return: None. The printed 'Total' is the sum of the second matrix row,
             i.e. the number of samples whose true label is the second class.
    """
    cm = confusion_matrix(labels, predictions > p)

    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title("Confusion matrix @{:.2f}".format(p))
    plt.ylabel("Actual label")
    plt.xlabel("Predicted label")

    # (caption, row, column) of every cell, printed in reading order.
    cell_captions = (
        ('True Negatives: ', 0, 0),
        ('False Positives: ', 0, 1),
        ('False Negatives: ', 1, 0),
        ('True Positives: ', 1, 1),
    )
    for caption, row, col in cell_captions:
        print(caption, cm[row][col])
    print('Total: ', np.sum(cm[1]))

# Evaluate the current `model` on the test split; the result holds the loss
# followed by the compiled metrics (names in model.metrics_names).
test_results = model.evaluate(
    test_dataset
)
print(test_results)

# Separate features and labels of the test split and predict on the features.
test_features = test_dataset.map(lambda x, y: x)
test_labels = test_dataset.map(lambda x, y: y)
test_predictions = model.predict(test_features)

#print(test_labels)

# Flatten the batched labels into one row per sample.
# NOTE(review): comparing `ex` with `test_predictions` assumes that the dataset
# yields its rows in the same order on every pass - confirm (the datasets are
# shuffled when read and cached afterwards).
ex = test_labels.unbatch()
ex = pd.DataFrame(ex)
print(ex)
print(test_predictions)

# Same for the training split (used by the ROC plot).
train_features = train_dataset.map(lambda x, y: x)
train_labels = train_dataset.map(lambda x, y: y)
ex2 = train_labels.unbatch()
ex2 = pd.DataFrame(ex2)
train_predictions = model.predict(train_features)


# Scatter plot of true labels against predicted scores, with the diagonal as reference.
a = plt.axes(aspect='equal')
plt.scatter(ex, test_predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
lims = [0, 1]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

# Print every test metric next to its name.
for name, value in zip(model.metrics_names, test_results):
    print(name, ': ', value)
print()

# Confusion matrix at the decision threshold chosen for this account.
plot_cm(ex, test_predictions, p=threshold)

In [None]:
def plot_roc(name, labels, predictions, **kwargs):
    """
    Add one ROC curve (in percent) to the current axes.

    The view is limited to false positives in [-0.5, 20] and true positives in
    [80, 100.5], with equal axis scaling.

    :param name: Legend label of the curve
    :param labels: True labels
    :param predictions: Predicted scores
    :param kwargs: Extra keyword arguments forwarded to the plot call (e.g. color, linestyle)
    :return: None
    """
    false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(labels, predictions)

    ax = plt.gca()
    ax.plot(100*false_pos_rate, 100*true_pos_rate, label=name, linewidth=2, **kwargs)
    ax.set_xlabel('False positives [%]')
    ax.set_ylabel('True positives [%]')
    ax.set_xlim([-0.5,20])
    ax.set_ylim([80,100.5])
    ax.grid(True)
    ax.set_aspect('equal')

# Training curve solid, test curve dashed, both in the first colour of the cycle.
plot_roc("Train", ex2, train_predictions, color=colors[0])
plot_roc("Test", ex, test_predictions, color=colors[0], linestyle='--')
plt.legend(loc='lower right')
