In [None]:
!pip install --upgrade transformers # only run this once per kernel session - dont want to overload kaggle

Explanation of each library we use here:
* numpy - numerical linear algebra library, makes it easy for us to do some array operations
* transformers - huggingface's transformers NLP library, contains the BERT model + tokenizer that we will use
* pickle - a library used for reading pickled files. Our data is pickled, so we need to use this library to open the data
* tensorflow - the ML training library that we will use to train BERT and fine-tune it
* re - regex python library, used in data cleaning
* nltk - natural language toolkit, used in data cleaning

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import transformers as ppb # BERT Model
import pickle # decode pickled data
import tensorflow as tf
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the path of every file found under the Kaggle input directory,
# then print them one per line (same order as the directory walk).
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

First, load all the data into the program. We use the given training and testing set.

In [None]:
# Load the provided train/test splits (features and labels) from their pickle files.
# NOTE: unpickling executes whatever the file encodes, so only load pickles from a
# source you trust.
train_x, train_y = (
    pd.read_pickle("../input/humor-detection/X_train.pickle"),
    pd.read_pickle("../input/humor-detection/y_train.pickle"),
)
test_x, test_y = (
    pd.read_pickle("../input/humor-detection/X_test.pickle"),
    pd.read_pickle("../input/humor-detection/y_test.pickle"),
)

Below is the main model and tokenizer setup.

To tokenize our data, we use huggingface's BertTokenizer (loaded through ``AutoTokenizer``), which handles the BERT-specific steps on top of our own cleaning, such as adding special tokens like \[CLS\]. We still have to do the same data cleaning that we had done previously, so I copied those steps over into this notebook.

The model is the TFBertForSequenceClassification model, which is basically a sequence classifier (like we want). This is better than previous models since the classification step and fine-tuning step are packed into one step, making it easier for us to use and work with.

In [None]:
# The tokenizer and the sequence classifier are both loaded from the same
# pretrained checkpoint name.
checkpoint = "bert-base-uncased"
tokenizer = ppb.AutoTokenizer.from_pretrained(checkpoint)
model = ppb.TFBertForSequenceClassification.from_pretrained(checkpoint)

These data cleaning functions below perform the same functions as were done in the main data processing notebook:
* ``lemmatize()`` - lemmatizes the sentence input ``s``, helps simplify model vocabulary
* ``lower()`` - lowercases all the words in the sentence inputs ``s`` - is a remnant of a previous iteration, but I kept it around since it did no harm
* ``clean()`` - a generalized cleaning function that does the two steps above + removes all numbers from ``data``, list of sentences passed in
* ``tokenize()`` - tokenizes the list of sentences passed in ``text`` using the ``BertTokenizer`` from the transformers library. Returns the encoded batch (e.g. ``input_ids``) as TensorFlow tensors.
* ``process()`` - a combination of cleaning and tokenizing, a function really created for our ease of use

In [None]:
def lemmatize(s):
    """Lemmatize every space-separated word of ``s`` as a verb.

    The sentence is split on single spaces, each piece is passed to the
    WordNet lemmatizer with part-of-speech ``'v'``, and the results are
    joined back together with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(word, 'v') for word in s.split(" "))
    return " ".join(lemmas)
def lower(s):
    """Return a lowercased copy of the sentence ``s``."""
    lowered = s.lower()
    return lowered
def clean(data):
    """Clean every sentence in ``data`` and return the cleaned sentences.

    Each sentence is lemmatized, lowercased and stripped of digit runs, in
    that order. Strings are immutable, so the result of every step has to be
    captured; the input ``data`` itself is left untouched.

    Args:
        data: iterable of sentence strings.

    Returns:
        A new list with one cleaned string per input sentence, in the same
        order as ``data``.
    """
    cleaned = []
    for item in data:
        item = lemmatize(item)
        item = lower(item)
        item = re.sub(r'\d+', '', item)  # remove nums
        cleaned.append(item)
    return cleaned
def tokenize(text):
    """Encode ``text`` with the notebook-level ``tokenizer``.

    Padding and truncation are enabled and the encoded batch is requested as
    TensorFlow tensors; the tokenizer's result is returned unchanged.
    """
    encode_options = dict(padding=True, truncation=True, return_tensors="tf")
    return tokenizer(text, **encode_options)
def process(data):
    """Clean the sentences in ``data`` and tokenize the result.

    Args:
        data: list of sentence strings.

    Returns:
        Whatever ``tokenize()`` returns for the cleaned sentences.
    """
    cleaned = clean(data)
    # Tokenize the output of clean(), not the raw input, so the cleaning
    # step actually takes effect.
    return tokenize(cleaned)

In [None]:
# Run both splits through the same clean + tokenize pipeline (train first, then test).
train_batch, test_batch = (process(split) for split in (train_x, test_x))

Below we set up the TensorFlow model that we're going to use to classify and fine-tune BERT.

In [None]:
# Fine-tuning hyperparameters.
learning_rate, epochs = 2e-5, 10

# The loss is configured with from_logits=True, i.e. it is fed the model's raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-8)

metric1 = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# metric2 and metric3 are constructed here but are not passed to compile() below.
metric2 = tf.keras.metrics.Precision(name="precision")
metric3 = tf.keras.metrics.Recall(name="recall")

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric1],
)

In [None]:
history = model.fit(x=train_batch.input_ids, y=np.array(train_y), epochs=epochs)

In [None]:
model.evaluate(x=test_batch.input_ids, y=np.array(test_y))

In [None]:
from sklearn.metrics import classification_report
y_pred = model.predict(x=test_batch.input_ids)

In [None]:
# The first element of the prediction output holds the per-class scores.
logits = y_pred[0]
print(logits.shape)

# The predicted class is the index of the highest score in each row.
y_pred_bool = np.argmax(logits, axis=1)
print(np.array(test_y).shape)
print(y_pred_bool)
print(logits)

print(classification_report(test_y, y_pred_bool))

In [None]:
# Classify a single example sentence.
# The sentence is wrapped in a list and sent through the same process() pipeline as
# the train/test data, so the model receives a tensor batch containing one sequence.
# Calling tokenizer() on a bare string without return_tensors yields a flat Python
# list of token ids rather than a batch, which is not what model.predict() expects.
sentence = "When my son told me to stop impersonating a flamingo, I had to put my foot down."
tk = process([sentence])
out = model.predict(x=tk.input_ids)
print(np.argmax(out[0], axis=1))

In [None]:
# One row per test example; the DataFrame index is exported as the "Id" column.
submission = pd.DataFrame(y_pred_bool, columns=["Prediction"])
submission.to_csv(
    "predictions.csv",
    index=True,
    index_label="Id",
)