# About

See what BERT is all about. This version is meant to run on a laptop.


### Understanding searches better than ever before

Google's blog post announcing BERT in Search:

https://www.blog.google/products/search/search-language-understanding-bert


In [21]:
import logging

# Root logging setup: INFO and above, each record prefixed with a
# month/day/year timestamp, its level and the emitting logger's name.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Named logger used by the cells below.
logger = logging.getLogger("bert")

In [27]:
import json
import gc

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from pytorch_pretrained_bert.modeling import BertForNextSentencePrediction
from pytorch_pretrained_bert.tokenization import BertTokenizer
# from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear, SCHEDULES
from fastprogress import master_bar, progress_bar
from sklearn.model_selection import StratifiedShuffleSplit

import random
import numpy as np
# Seed every random number generator the notebook touches so that sampling
# and inference are repeatable.
# SEED is assigned here because this cell runs before the configuration cell
# that also defines it; on a fresh kernel the previous code raised NameError
# for both SEED and `device`. Keep this value in sync with the config cell.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask torch directly whether CUDA is usable instead of reading `device`,
# which is only assigned in a later cell.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [6]:

# --- Notebook-wide configuration -------------------------------------------
# Seed handed to the random / numpy / torch generators.
SEED = 42
# Directory passed as `cache_dir` when the pretrained tokenizer and model load.
PYTORCH_PRETRAINED_BERT_CACHE = "../models/bert_en_uncased_L-12_H-768_A-12/1"

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Data Prep

# sample

In [4]:
# Keep only rows whose accepted_answer_id is positive; missing ids compare
# False against 0 and are dropped as well.
has_accepted_answer = questions["accepted_answer_id"] > 0
t1 = questions[has_accepted_answer]

# Draw 600 rows; the fixed random_state makes the draw repeatable.
questions_sample = t1.sample(n=600, random_state=20191102)

In [5]:
# Show the first three sampled rows.
questions_sample.iloc[:3]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,body,accepted_answer_id,answer_count,comment_count,community_owned_date,creation_date,...,seaborn,geospatial,stata,plyr,pie-chart,graphviz,spss,diagram,qlikview,altair
557822,557822,557822,17108288,Why is my CSS :hover @keyframes animation not ...,<p>I am a newbie. Why is this code not working...,17108610.0,1,18,,2013-06-14 12:16:20.233000+00:00,...,0,0,0,0,0,0,0,0,0,0
209952,209952,209952,13656097,Mixing line and scatterplot in ggplot,<p>I've looked around a fair bit but I am stum...,13656277.0,2,0,,2012-12-01 03:35:01.673000+00:00,...,0,0,0,0,0,0,0,0,0,0
275911,275911,275911,395599,Java graphic library for multicoloured text,<p>I would like to know the recommended librar...,395679.0,4,0,,2008-12-27 23:44:33.840000+00:00,...,0,0,0,0,0,0,0,0,0,0


# corpus

In [6]:
# Builds the corpus: question id, title, tags, accepted answer id and the
# accepted answer's body.
def construct_corpus(questions, answers=None):
    """Join each question to the body of its accepted answer.

    Parameters
    ----------
    questions : pandas.DataFrame
        Must provide the columns ``id``, ``title``, ``tags`` and
        ``accepted_answer_id``.
    answers : pandas.DataFrame, optional
        Must provide the columns ``id`` and ``body``. When omitted, the
        module-level ``answers`` frame is looked up at *call* time. A default
        of ``answers=answers`` would instead freeze whatever object existed
        when this ``def`` executed, and fail outright if none existed yet.

    Returns
    -------
    pandas.DataFrame
        Columns ``q_id``, ``q_title``, ``tags``, ``accepted_answer_id`` and
        ``a_body``. Only questions whose ``accepted_answer_id`` matches an
        answer ``id`` are kept (inner join).

    Raises
    ------
    NameError
        If ``answers`` is omitted and no module-level ``answers`` exists.
    """
    if answers is None:
        try:
            answers = globals()["answers"]
        except KeyError:
            raise NameError(
                "name 'answers' is not defined; pass the answers frame explicitly"
            ) from None

    question_cols = questions[["id", "title", "tags", "accepted_answer_id"]].rename(
        columns={"id": "q_id", "title": "q_title"}
    )
    answer_cols = answers[["id", "body"]].rename(
        columns={"id": "a_id", "body": "a_body"}
    )
    # a_id duplicates accepted_answer_id after the join, so it is dropped.
    return question_cols.merge(
        answer_cols, left_on="accepted_answer_id", right_on="a_id", how="inner"
    ).drop(columns="a_id")

In [7]:
# Build the question / accepted-answer corpus for the sampled questions.
# Later cells read `corpus`, so this call has to run on a fresh kernel.
corpus = construct_corpus(questions_sample)
corpus.head(3)

Unnamed: 0,q_id,q_title,tags,accepted_answer_id,a_body
0,17108288,Why is my CSS :hover @keyframes animation not ...,htmlcss3cross-browsercss-animations,17108610.0,<p>here it is:</p>\r\n\r\n<p>u have to add ven...
1,13656097,Mixing line and scatterplot in ggplot,rggplot2,13656277.0,"<p>As @MattBagg has pointed out, this issue is..."
2,395599,Java graphic library for multicoloured text,javagraphics,395679.0,<p>I'm assuming you're rendering text to an ar...


# helper functions

In [11]:
class InputFeatures:
    """Container for the model inputs derived from one example.

    Stores the four constructor arguments unchanged as the attributes
    ``input_ids``, ``input_mask``, ``segment_ids`` and ``target``.
    """

    def __init__(self, input_ids, input_mask, segment_ids, target):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.target = segment_ids, target

In [12]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [13]:
def convert_sentence_pair(titles, descs, max_seq_length, tokenizer):
    """Turn (title, description) pairs into fixed-length BERT input features.

    Each pair is tokenized with ``tokenizer.tokenize``, truncated so that it
    fits together with three special tokens, laid out as
    ``[CLS] title [SEP] description [SEP]`` and zero-padded to
    ``max_seq_length``. The first five examples are logged at INFO level.

    BERT's layout convention for a sequence pair is::

        tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1

    and for a single sequence::

        tokens:   [CLS] the dog is hairy . [SEP]
        type_ids: 0     0   0   0  0     0 0

    ``type_ids`` (``segment_ids`` here) mark which sequence a token belongs
    to. The [SEP] token already separates the sequences, so the type ids are
    not strictly necessary, but they make the notion of "two sequences"
    easier for the model to pick up.

    Returns a list with one ``InputFeatures`` per pair; every entry is created
    with ``target=1``.
    """
    features = []
    logged_fields = (
        ("tokens: %s", "tokens"),
        ("input_ids: %s", "input_ids"),
        ("input_mask: %s", "input_mask"),
        ("segment_ids: %s", "segment_ids"),
    )

    for pair_no, (title, desc) in enumerate(zip(titles, descs)):
        title_tokens = tokenizer.tokenize(title)
        desc_tokens = tokenizer.tokenize(desc)

        # Trim both lists in place; "- 3" leaves room for [CLS], [SEP], [SEP].
        _truncate_seq_pair(title_tokens, desc_tokens, max_seq_length - 3)

        # First segment: [CLS] + title + [SEP], all tagged with type 0.
        tokens = ["[CLS]", *title_tokens, "[SEP]"]
        segment_ids = [0] * len(tokens)

        # Second segment only exists when the description produced tokens.
        if desc_tokens:
            tokens.extend(desc_tokens)
            tokens.append("[SEP]")
            segment_ids.extend([1] * (len(desc_tokens) + 1))

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # 1 marks a real token, 0 marks padding; only real tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Right-pad all three sequences with zeros up to max_seq_length.
        pad = [0] * (max_seq_length - len(input_ids))
        for seq in (input_ids, input_mask, segment_ids):
            seq.extend(pad)
        for seq in (input_ids, input_mask, segment_ids):
            assert len(seq) == max_seq_length

        if pair_no < 5:
            logger.info("*** Example ***")
            shown = {
                "tokens": tokens,
                "input_ids": input_ids,
                "input_mask": input_mask,
                "segment_ids": segment_ids,
            }
            for template, key in logged_fields:
                logger.info(template % " ".join(str(item) for item in shown[key]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                target=1,
            )
        )
    return features

In [14]:
# Load the "bert-base-uncased" vocabulary with lower-casing enabled, using
# the configured cache directory.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    cache_dir=PYTORCH_PRETRAINED_BERT_CACHE,
    do_lower_case=True,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 4457611.70B/s]


In [17]:
# Preview a handful of titles instead of echoing the entire column, which
# floods the notebook output.
corpus.q_title.head(20).tolist()

['Why is my CSS :hover @keyframes animation not working?',
 'Mixing line and scatterplot in ggplot',
 'Java graphic library for multicoloured text',
 'what would be a good way to have a day background and night backgroud switch over with time am pm?',
 'ForeignObject text not appearing',
 'igraph graph.data.frame silently converts factors to character vectors',
 'How can I write a multi-line if block on Bitbucket Pipeline?',
 'C# List elements lifetime Unity3d',
 'Vertex label in JUNG graph visualization',
 'Get a color component from an rgb string in Javascript?',
 'iOS app crashes on device not simulator: Im using an animation with 280+ images',
 'Stage untracked files for commit without staging tracked file changes',
 'Dictionary uses tostring instead of object?',
 'Writing text into and retrieving text from the text file using command line auguments',
 'Overlaying ggplot data layers',
 'How to create an Orbit Chart in R? (Plotly/ggplot2)',
 'Highcharts metric prefix',
 "Using Rapha

In [None]:
# Featurize every question title together with its own accepted answer body.
correct_pairs = convert_sentence_pair(
    titles=corpus["q_title"].tolist(),
    descs=corpus["a_body"].tolist(),
    max_seq_length=200,
    tokenizer=tokenizer,
)

# Model

In [None]:
# Load the pretrained "bert-base-uncased" next-sentence-prediction model from
# the configured cache directory, then move it onto the selected device.
model = BertForNextSentencePrediction.from_pretrained(
    "bert-base-uncased", cache_dir=PYTORCH_PRETRAINED_BERT_CACHE
)
model = model.to(device)

# Eval Correct Pairs

In [28]:
BATCH_SIZE = 128
logger.info("***** Running evaluation *****")

# Stack the per-example feature lists into one long tensor per field.
all_input_ids, all_input_mask, all_segment_ids = (
    torch.tensor([getattr(f, field) for f in correct_pairs], dtype=torch.long)
    for field in ("input_ids", "input_mask", "segment_ids")
)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

# Sequential sampling keeps the scores aligned with the order of correct_pairs.
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE)

logger.info("  Num examples = %d", len(correct_pairs))
logger.info("  Batch size = %d", BATCH_SIZE)

model.eval()

res = []

mb = progress_bar(eval_dataloader)
for input_ids, input_mask, segment_ids in mb:
    input_ids, input_mask, segment_ids = (
        batch_tensor.to(device)
        for batch_tensor in (input_ids, input_mask, segment_ids)
    )

    with torch.no_grad():
        # The model is called as (input_ids, segment_ids, input_mask).
        logits = model(input_ids, segment_ids, input_mask)
        probabilities = nn.functional.softmax(logits, dim=1)
        # Keep column 0 of the softmax for every example in the batch.
        res.append(probabilities[:, 0].detach().cpu().numpy())

# Join the per-batch arrays into one array with one score per example.
res = np.concatenate(res)

11/06/2019 18:23:27 - INFO - bert -   ***** Running evaluation *****
11/06/2019 18:23:27 - INFO - bert -     Num examples = 600
11/06/2019 18:23:27 - INFO - bert -     Batch size = 128


# Find Similar entries

In [None]:
# Pair one question title (row `idx`) with every answer body in the corpus.
idx = 102
query_title = corpus.iloc[idx]["q_title"]
sentence_pairs = convert_sentence_pair(
    [query_title] * len(corpus),
    corpus["a_body"].tolist(),
    max_seq_length=200,
    tokenizer=tokenizer,
)

In [37]:
BATCH_SIZE = 128
logger.info("***** Running evaluation *****")
# Stack the per-example feature lists into one long tensor per field.
all_input_ids = torch.tensor([f.input_ids for f in sentence_pairs], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in sentence_pairs], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in sentence_pairs], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
# Sequential sampling keeps the scores aligned with the order of sentence_pairs.
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=BATCH_SIZE)

# Report the size of the data actually being scored in this cell; this used
# to log len(correct_pairs), a leftover from the cell it was copied from.
logger.info("  Num examples = %d", len(sentence_pairs))
logger.info("  Batch size = %d", BATCH_SIZE)

model.eval()

res = []

mb = progress_bar(eval_dataloader)
for input_ids, input_mask, segment_ids in mb:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    with torch.no_grad():
        # The model is called as (input_ids, segment_ids, input_mask).
        logits = model(input_ids, segment_ids, input_mask)
        # Column 0 of the softmax is kept as the match score.
        # NOTE(review): presumably the "second sequence follows the first"
        # class in pytorch_pretrained_bert's labelling - confirm.
        res.append(nn.functional.softmax(logits, dim=1)[:, 0].detach().cpu().numpy())

# Join the per-batch arrays into one array with one score per example.
res = np.concatenate(res)

11/06/2019 18:36:58 - INFO - bert -   ***** Running evaluation *****
11/06/2019 18:36:58 - INFO - bert -     Num examples = 600
11/06/2019 18:36:58 - INFO - bert -     Batch size = 128


In [39]:
# Order example indices from highest to lowest score, then keep the top ten.
ranked_indices = np.argsort(res)[::-1]
best_matches = ranked_indices[:10]
best_matches

array([560, 528, 102, 364, 581, 312,  78, 558,   5, 574], dtype=int64)