### GLOBAL CONFIG

In [None]:
%load_ext autoreload
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle

# --- Run switches -----------------------------------------------------------
PREPROCESS = True  # Do a fresh preprocess
NEW_MODEL = False  # True: build a new classifier; False: load it from MODEL_PATH
MAKE_NEW_EMBEDDING = True  # If False, the stored one will be loaded
SAVE_TRAINED_MODEL = True  # Intended to toggle saving of the trained model
RANDOM_SEED = 456  # Seed passed to both train/validation/test splits
# EMB_MAX_WORDS = None  (currently disabled)

# --- Input / output locations -----------------------------------------------
PREPROCESS_INPUT = './data/training.1600000.processed.noemoticon.csv'
PREPROCESS_OUTPUT = './data/preprocessed.csv'
GLOVE_FILE = './data/glove.6B.50d.txt'
EMB_PKL = './models/emb_layer.pkl'
MODEL_PKL = './models/model.pkl'

# --- Model selection --------------------------------------------------------
MODEL_TYPE = 'sequential'
MODEL_PATH = f'models/trained_{MODEL_TYPE}'  # Path derived from the model type

# Row indices of the tweets whose predictions get explained further down.
TWEETS_TO_EXPLAIN = [111, 156, 933487, 933565]


### Preprocess data and store it

In [None]:
from pre import preprocess

# Re-run preprocessing only when the PREPROCESS switch is on; otherwise the
# file already at PREPROCESS_OUTPUT is reused by the following cells.
if PREPROCESS:
    preprocess(
        i=PREPROCESS_INPUT,
        o=PREPROCESS_OUTPUT,
        slice=None,
    )

### Load preprocessed data

In [None]:
from parse import load_data
from sklearn.model_selection import train_test_split

# Read the preprocessed dataset and pull out the text and label columns.
data = load_data(PREPROCESS_OUTPUT)
X, y = data['tweet'], data['target']

# Two-stage hold-out with the same seed: first set aside 20% as the test
# split, then take 20% of the remainder as the validation split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=RANDOM_SEED
)

# One-hot encode every label set into two classes.
y_train, y_test, y_val = (
    tf.keras.utils.to_categorical(labels, 2)
    for labels in (y_train, y_test, y_val)
)


### Create/load model

In [None]:
%autoreload 2
from TextClassifierModel import new_classifier, load_classifier, save_classifier

# Either build a fresh classifier (and store it right away at MODEL_PATH) or
# load the one previously stored there, depending on the NEW_MODEL switch.
if NEW_MODEL:
    text_classifier = new_classifier(glove_file=GLOVE_FILE, data=data, model_type=MODEL_TYPE)
    save_classifier(text_classifier, MODEL_PATH)
else:
    text_classifier = load_classifier(model_path=MODEL_PATH)

# Keras' Model.summary() prints the table itself and returns None, so wrapping
# it in print() only appended a stray "None" to the output.
# NOTE(review): assumes text_classifier.model is a Keras model — confirm.
text_classifier.model.summary()


### Train

In [None]:
%autoreload 2
from evaluation import plot_history

# Fit on the training split while monitoring the validation split, then plot
# whatever fit() returned via plot_history.
history = text_classifier.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=60,
    epochs=30,
    verbose=1,
)
plot_history(history)


### Save trained model

In [None]:
from TextClassifierModel import save_classifier
# Honour the SAVE_TRAINED_MODEL switch from the global config; previously the
# flag was defined but never consulted, so the model was always overwritten.
if SAVE_TRAINED_MODEL:
    save_classifier(text_classifier, MODEL_PATH)

### Evaluate on the held-out test set (do not run during tuning)

In [None]:
from evaluation import evaluate_model

# Score the classifier on the test split that was set aside before training.
evaluate_model(
    text_classifier,
    X_test,
    y_test,
    verbose=False,
)

### Explain prediction

In [None]:
# Load unprocessed data for readability
%autoreload 2
from pre import load_unprocessed, split_and_preprocess
# Read the raw (unprocessed) input file.
df_orig = load_unprocessed(PREPROCESS_INPUT)

# Select the tweets listed in TWEETS_TO_EXPLAIN; the helper returns a pair,
# kept here as the original and the preprocessed variant.
explain_tweets_orig, explain_tweets_prep = split_and_preprocess(
    df_orig,
    TWEETS_TO_EXPLAIN,
)


In [None]:
from explainability import explain_and_save
# Produce and store explanations for the selected tweets with the current
# classifier and model type.
explain_and_save(
    explain_tweets_orig,
    explain_tweets_prep,
    TWEETS_TO_EXPLAIN,
    text_classifier,
    MODEL_TYPE,
)


In [None]:

from explainability import save_predictions
# Store the classifier's predictions for the selected (preprocessed) tweets.
save_predictions(
    explain_tweets_prep,
    TWEETS_TO_EXPLAIN,
    text_classifier,
)


In [None]:
from explainability import display_html_browser
import codecs
# Show the stored HTML explanation for the first tweet in TWEETS_TO_EXPLAIN.
idx = TWEETS_TO_EXPLAIN[0]

# Read the file inside a context manager so the handle is closed right away;
# the previous codecs.open(...).read() never closed it.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm it matches how the HTML file was written.
with codecs.open(f"data/predictions/html_{idx}.html", 'r') as html_file:
    html = html_file.read()

display_html_browser(html, f"explain_{idx}")

In [None]:
%autoreload 2

from visualize_embeddings import  display_pca_scatter_plot
# Draw the PCA scatter plot from the GloVe vectors file configured above.
display_pca_scatter_plot(
    GLOVE_FILE,
)
