In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Plot styling: apply the seaborn theme first, then swap in the custom palette.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
# Replaces the 'muted' palette selected in sns.set(...) above with the custom colours.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default figure size (width, height). Set after sns.set(...) so the theme call
# cannot override it — TODO confirm whether the installed seaborn touches figsize.
rcParams['figure.figsize'] = 12, 8
# Seed the NumPy and PyTorch random number generators with the same fixed value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Name of the pretrained checkpoint handed to from_pretrained below.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
# Module-level tokenizer; tokenize_sentence and the encoding cells further down use it.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)


In [3]:
def tokenize_sentence(sentence):
    """Turn ``sentence`` into vocabulary ids using the module-level ``tokenizer``.

    The sentence is first split with ``tokenizer.tokenize`` and the resulting
    tokens are mapped through ``tokenizer.convert_tokens_to_ids``; the value
    returned by that second call is passed back unchanged.
    """
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))


In [6]:
from helpers import *

# Read the bz2-compressed JSON-lines quotes file and append two columns holding the
# whitespace-separated word counts of the raw and the cleaned quotation text.
df = pd.read_json(QUOTES_2020_FOR_BERT, lines=True, compression='bz2').assign(
    quotation_length=lambda frame: frame['quotation'].apply(
        lambda text: len(text.split())
    ),
    quotation_clean_length=lambda frame: frame['quotation_clean'].apply(
        lambda text: len(text.split())
    ),
)



In [10]:
# 95th percentile of every numeric column. numeric_only=True is passed explicitly:
# pandas >= 2.0 defaults it to False, which makes quantile() raise on the text
# columns of df (e.g. 'quotation'). The quotation_clean_length value shown here is
# the upper bound later used to filter the quotes.
df.quantile(0.95, numeric_only=True)

numOccurrences            12.0
quotation_length          54.0
quotation_clean_length    54.0
Name: 0.95, dtype: float64

In [17]:
# 5th percentile of every numeric column. numeric_only=True is passed explicitly:
# pandas >= 2.0 defaults it to False, which makes quantile() raise on the text
# columns of df (e.g. 'quotation'). The quotation_clean_length value shown here is
# the lower bound later used to filter the quotes.
df.quantile(0.05, numeric_only=True)

numOccurrences            1.0
quotation_length          6.0
quotation_clean_length    6.0
Name: 0.05, dtype: float64

In [18]:
def keep_range_lengths_quotes(df, field_to_filter_on, min_length, max_length):
    """Select the rows of ``df`` whose ``field_to_filter_on`` value is in range.

    Args:
        df: DataFrame to filter; it is not modified.
        field_to_filter_on: name of the column holding the values to compare.
        min_length: smallest value to keep (inclusive).
        max_length: largest value to keep (inclusive).

    Returns:
        The subset of ``df`` rows with ``min_length <= value <= max_length``,
        with their original index labels.
    """
    lengths = df[field_to_filter_on]
    in_range = (lengths >= min_length) & (lengths <= max_length)
    return df[in_range]

# Keep only quotes whose cleaned word count lies in [6, 54] — the 5th and 95th
# percentile values of quotation_clean_length reported by the quantile cells.
df = keep_range_lengths_quotes(
    df,
    field_to_filter_on='quotation_clean_length',
    min_length=6,
    max_length=54,
)

In [19]:
# (rows, columns) of df as it stands after the length filter.
df.shape

(316436, 11)

In [21]:
# Short example sentence used to inspect what the tokenizer produces.
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

In [23]:
# Encode the sample sentence into a fixed-length sequence of 54 token ids.
# NOTE(review): 54 is the word-count bound used when filtering the quotes, but
# max_length here counts tokenizer tokens including '[CLS]' and '[SEP]', so the
# longest kept quotes may be truncated — confirm this limit is intended.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=54,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',  # replaces the deprecated pad_to_max_length=True
  truncation=True,  # cut at max_length explicitly instead of relying on the implicit fallback
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [25]:
# Map the ids of the first (only) encoded sequence back to token strings so the
# special tokens and the padding are visible.
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]',
 'When',
 'was',
 'I',
 'last',
 'outside',
 '?',
 'I',
 'am',
 'stuck',
 'at',
 'home',
 'for',
 '2',
 'weeks',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [20]:
# 99th percentile of every numeric column. numeric_only=True is passed explicitly:
# pandas >= 2.0 defaults it to False, which makes quantile() raise on the text
# columns of df (e.g. 'quotation').
# NOTE(review): the saved output lists a `length_diff` column that no visible cell
# creates, and this cell's execution count is out of order — re-run from a fresh
# kernel to confirm the numbers.
df.quantile(0.99, numeric_only=True)

numOccurrences            58.0
quotation_length          84.0
quotation_clean_length    83.0
length_diff                1.0
Name: 0.99, dtype: float64

In [8]:
# (rows, columns) of df.
# NOTE(review): the execution count of this cell predates the length filter, so the
# saved output presumably describes the unfiltered frame; a top-to-bottom run will
# show the filtered shape here instead — confirm and move or drop this cell.
df.shape

(349146, 10)

In [16]:
# Show the quotation text of the row(s) whose raw word count equals the column minimum.
df.loc[df['quotation_length'].eq(df['quotation_length'].min()), 'quotation']

261604    # for #
Name: quotation, dtype: object

In [23]:
# Smallest raw word count present in df.
df['quotation_length'].min()

2