<div class="alert alert-success">
<h1> Beer Data Science Assignment </h1>
</div>


In [17]:
from IPython.display import HTML
import pandas as pd
import os
import numpy as np
from datetime import datetime
import wordcloud
import warnings
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
warnings.filterwarnings('ignore')

BASE_PATH = "../"
INPUT_DATA_PATH = os.path.join(BASE_PATH, "data/raw/BeerDataScienceProject.csv")

In [2]:
df_beer_raw = pd.read_csv(INPUT_DATA_PATH, encoding="ISO-8859-1")

In [3]:
df_beer_raw.shape

(528870, 13)

In [4]:
df_beer_raw.dtypes

beer_ABV              float64
beer_beerId             int64
beer_brewerId           int64
beer_name              object
beer_style             object
review_appearance     float64
review_palette        float64
review_overall        float64
review_taste          float64
review_profileName     object
review_aroma          float64
review_text            object
review_time             int64
dtype: object

### Data Cleaning
```In the below cell we can see that there are some columns where data is missing. 
   We need to remove those data points from our data set.
```

In [5]:
df_beer_raw.isna().sum()

beer_ABV              20280
beer_beerId               0
beer_brewerId             0
beer_name                 0
beer_style                0
review_appearance         0
review_palette            0
review_overall            0
review_taste              0
review_profileName      115
review_aroma              0
review_text             119
review_time               0
dtype: int64

In [6]:
df_beer = df_beer_raw.dropna()

In [7]:
df_beer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 508358 entries, 0 to 528796
Data columns (total 13 columns):
beer_ABV              508358 non-null float64
beer_beerId           508358 non-null int64
beer_brewerId         508358 non-null int64
beer_name             508358 non-null object
beer_style            508358 non-null object
review_appearance     508358 non-null float64
review_palette        508358 non-null float64
review_overall        508358 non-null float64
review_taste          508358 non-null float64
review_profileName    508358 non-null object
review_aroma          508358 non-null float64
review_text           508358 non-null object
review_time           508358 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 54.3+ MB


## Data Changes in Raw and Clean dataset

In [8]:
print("Raw data size: ", df_beer_raw.shape[0])
print("Clean data size: ", df_beer.shape[0])
print("Data points difference: ", df_beer_raw.shape[0]-df_beer.shape[0])

Raw data size:  528870
Clean data size:  508358
Data points difference:  20512


### Describe the categorical features

In [9]:
df_beer.describe(include=[np.object])

Unnamed: 0,beer_name,beer_style,review_profileName,review_text
count,508358,508358,508358,508358
unique,14028,104,22563,508007
top,Sierra Nevada Celebration Ale,American IPA,northyorksammy,#NAME?
freq,2998,42092,1732,90


### Describe the numerical features

In [10]:
df_beer.describe(include=[np.number])

Unnamed: 0,beer_ABV,beer_beerId,beer_brewerId,review_appearance,review_palette,review_overall,review_taste,review_aroma,review_time
count,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0,508358.0
mean,7.0174,21824.12084,2534.273207,3.872676,3.768993,3.840805,3.775327,3.827646,1226175000.0
std,2.204528,22124.969107,5237.843825,0.601764,0.682355,0.706408,0.665586,0.715121,75308430.0
min,0.01,5.0,1.0,0.0,1.0,0.0,1.0,1.0,884390400.0
25%,5.3,1673.0,132.0,3.5,3.5,3.5,3.5,3.5,1177202000.0
50%,6.5,13850.0,392.0,4.0,4.0,4.0,4.0,4.0,1241503000.0
75%,8.5,40418.0,1315.0,4.0,4.0,4.5,4.0,4.5,1289073000.0
max,57.7,77310.0,27980.0,5.0,5.0,5.0,5.0,5.0,1326277000.0


In [11]:
NUMERICAL_ATTRIBUTE = ['beer_ABV', 'review_appearance', 'review_palette', 
                       'review_overall', 'review_taste', 'review_aroma', 'review_time']
CATEGORICAL_ATTRIBUTE = ['beer_name', 'beer_style', 'review_profileName', 'review_text']

In [12]:
df_beer.sample(4)

Unnamed: 0,beer_ABV,beer_beerId,beer_brewerId,beer_name,beer_style,review_appearance,review_palette,review_overall,review_taste,review_profileName,review_aroma,review_text,review_time
289283,9.2,47360,35,Samuel Adams Imperial Stout,Russian Imperial Stout,4.5,4.0,3.0,4.0,Naugled,4.0,12oz bottle served in tulip glass A: thick bro...,1240498698
1127,3.5,436,163,Amstel Light,Light Lager,2.5,2.0,2.0,2.5,DoubleJ,2.0,"What's with all the fuss over this beer? Well,...",1184607136
80413,5.6,279,140,Sierra Nevada Porter,American Porter,4.0,3.0,4.0,4.0,RJLarse,4.5,I picked up a couple of bottles of Sierra Neva...,1163376203
356653,5.9,6368,1177,Masala Mama India Pale Ale,American IPA,5.0,4.5,4.5,4.0,kbeals,4.0,Growler sent to me by younger35. A- Dark amber...,1273412648


<div class="alert alert-info">
Convert the review timestamp to datetime format
</div>

In [13]:
df_beer['d_review_time'] = df_beer.review_time.apply(lambda t: datetime.fromtimestamp(t))

In [14]:
df_beer['d_review_time_year'] = df_beer['d_review_time'].dt.year
df_beer['d_review_time_month'] = df_beer['d_review_time'].dt.month

<div class="alert alert-success">
Q7) How do find similar beer drinkers by using written reviews only?   
</div>


In [18]:
import torch
import transformers as ppb

model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')


In [19]:
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

I0531 18:28:04.546885 140234029037376 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/chetan/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0531 18:28:06.023221 140234029037376 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at /home/chetan/.cache/torch/transformers/a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
I0531 18:28:06.026210 140234029037376 configuration_utils.py:321] Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…

I0531 18:30:58.036954 140234029037376 file_utils.py:440] storing https://cdn.huggingface.co/distilbert-base-uncased-pytorch_model.bin in cache at /home/chetan/.cache/torch/transformers/ae9df7a8d658c4f3e1917a471a8a21cf678fa1d4cb91e7702dfe0598dbdcf354.c2015533705b9dff680ae707e205a35e2860e8d148b45d35085419d74fe57ac5
I0531 18:30:58.039834 140234029037376 file_utils.py:443] creating metadata file for /home/chetan/.cache/torch/transformers/ae9df7a8d658c4f3e1917a471a8a21cf678fa1d4cb91e7702dfe0598dbdcf354.c2015533705b9dff680ae707e205a35e2860e8d148b45d35085419d74fe57ac5
I0531 18:30:58.043862 140234029037376 filelock.py:318] Lock 140230454146720 released on /home/chetan/.cache/torch/transformers/ae9df7a8d658c4f3e1917a471a8a21cf678fa1d4cb91e7702dfe0598dbdcf354.c2015533705b9dff680ae707e205a35e2860e8d148b45d35085419d74fe57ac5.lock
I0531 18:30:58.045972 140234029037376 modeling_utils.py:650] loading weights file https://cdn.huggingface.co/distilbert-base-uncased-pytorch_model.bin from cache at /home




In [None]:
tokenized = df_beer['review_text'].progress_apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

HBox(children=(FloatProgress(value=0.0, max=508358.0), HTML(value='')))

W0531 18:31:24.101449 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:24.136695 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (657 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:24.997313 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:26.435384 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:31:31.703445 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (657 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:31.730535 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:32.271724 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:32.327774 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:31:36.870691 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (813 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:36.928255 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:36.969678 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (632 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:38.065981 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:31:45.759685 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:46.681937 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (1012 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:48.030011 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (621 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:48.661880 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (1129 > 512). Running this sequence through the model will result in indexing errors
W0531 

W0531 18:31:59.862231 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (730 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:59.884137 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
W0531 18:31:59.929752 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:00.291922 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:32:05.947100 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:06.515937 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:06.663372 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:06.744937 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:32:14.944621 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:15.765446 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:15.885201 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:16.453570 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (660 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:32:24.123324 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:24.450961 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:24.472759 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (627 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:24.660334 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:32:34.575630 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (644 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:34.926573 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:35.351104 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:36.011059 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:32:41.364641 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (928 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:41.371722 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:41.470720 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (628 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:41.591218 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (600 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:32:50.001140 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:50.496281 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (674 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:53.788206 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
W0531 18:32:53.837247 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:33:08.150704 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (657 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:08.466449 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:09.066299 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (908 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:09.585616 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (611 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:33:22.476176 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:22.967377 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:23.173871 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (718 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:23.205635 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:33:34.038899 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:34.269447 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:34.382219 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:34.814750 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:33:37.103584 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:37.113833 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:37.136465 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:37.167402 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:33:43.002972 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (677 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:43.066970 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:43.229315 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (626 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:43.446762 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (592 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:33:50.280625 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:50.728350 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:50.974656 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
W0531 18:33:51.077563 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (760 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:34:03.708184 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (706 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:03.830718 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (861 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:04.505418 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:04.692383 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:34:16.703673 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:16.808173 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:17.054844 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:17.254893 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:34:22.305567 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:22.352591 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:22.375408 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:22.463145 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:34:32.031160 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (608 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:32.040334 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (647 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:32.070108 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:32.560196 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (678 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:34:40.506129 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:41.653726 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:41.697454 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:41.943778 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (809 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:34:55.396182 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:55.570881 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (673 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:55.601911 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors
W0531 18:34:55.659365 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (703 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:35:04.512338 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:04.572793 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:05.950249 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:06.128890 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:35:14.933783 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (1170 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:15.364720 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:15.602710 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:16.053889 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors
W0531 1

W0531 18:35:28.346060 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:28.664114 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (793 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:28.789460 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (745 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:29.319364 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:35:35.834122 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:35.982301 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (996 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:39.546598 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:39.688960 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:35:57.555815 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:57.716820 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:57.871606 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (663 > 512). Running this sequence through the model will result in indexing errors
W0531 18:35:57.884720 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (609 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:36:07.966910 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (568 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:08.908946 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (732 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:11.314030 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:11.610371 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:36:18.777862 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:19.295760 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (848 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:19.382186 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:19.407066 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:36:24.774930 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (606 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:25.212912 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:25.370967 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:25.581736 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (701 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:36:33.550114 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (879 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:34.290900 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (684 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:34.494427 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (659 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:34.699803 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:36:40.746817 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:40.765391 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (628 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:41.390394 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:41.697847 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:36:49.407923 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:49.542794 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:49.564069 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:49.604738 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:36:59.279743 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (812 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:59.297543 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:59.453108 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (609 > 512). Running this sequence through the model will result in indexing errors
W0531 18:36:59.708306 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:37:10.283729 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:10.326447 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (511 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:10.500725 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (655 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:10.973905 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (511 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:37:17.003967 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (625 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:17.244385 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:17.431776 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:17.672846 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (697 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:37:25.310031 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:25.384355 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:25.520509 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (620 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:25.542099 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (780 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:37:32.619159 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:33.066320 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (600 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:33.706670 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (585 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:34.104824 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:37:49.564293 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (894 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:49.598153 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (893 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:49.684541 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
W0531 18:37:50.149985 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (767 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:38:04.439390 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:04.488739 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (597 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:04.532851 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (568 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:04.627201 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:38:10.968880 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:11.011753 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:11.270526 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:11.412119 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:38:15.638200 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:15.650139 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (682 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:15.735110 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (739 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:15.763884 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:38:19.832945 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:19.867576 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (931 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:20.035834 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (511 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:20.186301 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:38:25.747117 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:25.795144 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (580 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:25.828790 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:25.840884 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (733 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:38:38.127847 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (806 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:38.492005 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:38.601561 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (1172 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:38.865117 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
W0531 1

W0531 18:38:52.156810 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:52.990591 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (719 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:53.687242 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors
W0531 18:38:54.529766 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:39:04.892177 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:04.906079 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:05.527801 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:06.276893 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (580 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:39:21.437840 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:22.744479 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:23.225472 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:23.553769 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:39:35.165585 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (717 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:35.242957 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:35.638623 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:35.797445 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:39:40.237327 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:40.373967 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:40.521862 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:40.636848 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:39:46.961959 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:47.141761 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:47.382919 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:47.483626 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (946 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:39:58.200406 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (654 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:58.676370 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:59.174953 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors
W0531 18:39:59.592022 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (592 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:40:06.470893 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (585 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:06.583846 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (625 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:06.678116 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:06.818988 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (639 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:40:09.933014 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (511 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:09.953961 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (965 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:09.968742 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:10.394033 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (727 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:40:18.874690 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:18.955321 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:19.251368 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (568 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:19.278692 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:40:30.496677 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:30.505784 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (576 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:31.570160 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:32.064994 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:40:42.800273 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (930 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:43.132705 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:44.040002 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (788 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:44.173050 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (828 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:40:52.947500 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (821 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:52.977100 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (585 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:53.604777 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors
W0531 18:40:54.213124 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:41:05.273574 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:05.301176 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (1149 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:05.397472 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (685 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:05.422306 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
W0531 1

W0531 18:41:11.683372 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:11.693974 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (681 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:12.367853 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:12.752774 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:41:23.295950 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (673 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:23.311210 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (812 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:25.327743 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (681 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:26.708126 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:41:38.828225 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:39.036908 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:39.199679 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (724 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:39.535647 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:41:50.904599 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:50.938245 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:50.966412 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
W0531 18:41:51.154350 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:42:03.337794 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:03.400922 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:03.863047 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:04.755135 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (668 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:42:08.072679 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:08.214580 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (886 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:08.763877 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (656 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:09.023857 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:42:21.598022 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:21.697729 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (696 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:21.743963 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:21.774906 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:42:32.513154 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (851 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:33.185612 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:33.485337 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:33.955113 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:42:53.548917 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (652 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:53.801850 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (646 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:54.481084 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
W0531 18:42:54.555144 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (715 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:43:08.118282 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (677 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:08.479152 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (680 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:09.899101 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:10.589162 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:43:23.641185 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:23.810994 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:24.315104 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (811 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:24.524308 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (650 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:43:36.998832 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:37.093760 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:37.117819 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:37.168440 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:43:48.989497 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:49.559569 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:50.487701 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (696 > 512). Running this sequence through the model will result in indexing errors
W0531 18:43:50.754710 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:43:59.860052 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:00.070180 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:00.136275 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:00.245453 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:44:07.292400 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (597 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:07.664928 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:08.042098 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:08.146974 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:44:16.347238 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (750 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:16.520546 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (668 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:17.123515 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (511 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:17.342660 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (611 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:44:31.260136 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (608 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:31.786679 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:31.892643 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:32.094770 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:44:39.426794 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (958 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:39.516041 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (915 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:39.531314 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:39.564131 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (766 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:44:51.434939 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:51.802670 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:51.924689 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 512). Running this sequence through the model will result in indexing errors
W0531 18:44:52.063013 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
W0531 1

W0531 18:44:59.994809 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:00.136860 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (783 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:01.242002 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:01.298995 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:45:10.044406 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:10.126690 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:10.198223 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:10.252980 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (727 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:45:24.076907 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:24.125777 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:24.453989 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:24.464870 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:45:30.725669 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (770 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:31.316503 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:31.328983 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:31.422228 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:45:37.936993 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (992 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:38.302213 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (654 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:38.313766 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:38.519330 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:45:48.231218 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:48.462619 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:48.614767 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors
W0531 18:45:48.838437 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (859 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:46:03.520614 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:04.303773 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (614 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:04.403604 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:04.462110 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (608 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:46:14.245799 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:14.800537 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:14.893448 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (643 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:15.045573 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:46:25.247129 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:27.227656 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:27.486471 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:27.859849 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:46:40.969363 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:41.030996 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:41.082074 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:41.428333 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:46:54.491382 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (592 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:54.535124 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (676 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:54.784492 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
W0531 18:46:56.056298 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:47:08.889888 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (733 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:09.286589 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (876 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:10.079952 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:10.284166 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (710 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:47:20.429700 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:20.451905 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (595 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:20.537789 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:20.613799 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (763 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:47:30.805335 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:30.859256 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (610 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:31.878901 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:31.994085 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:47:39.373923 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:39.455591 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (694 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:39.632688 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:39.655484 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:47:45.894396 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:45.969395 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:46.467522 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (688 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:46.561611 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:47:54.564697 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (609 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:54.611520 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (641 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:54.647380 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
W0531 18:47:55.048845 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (703 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:48:05.712404 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (617 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:05.850759 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:05.902292 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:05.988123 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:48:11.822147 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:11.979711 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:11.990559 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (727 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:12.105643 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (757 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:48:26.980822 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:27.138043 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (777 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:27.378746 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:27.629357 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:48:38.411855 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (609 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:38.572872 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:38.666569 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:38.729635 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:48:48.116572 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:48.140364 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:48.428652 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:48.790024 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:48:56.523749 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (621 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:56.722252 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:56.793153 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors
W0531 18:48:57.085441 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (709 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:49:10.740348 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (659 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:10.781272 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (909 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:10.830339 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:10.896939 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:49:15.129358 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (650 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:15.381384 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (614 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:15.513591 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:15.604538 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (511 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:49:19.565897 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:19.772792 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:19.982968 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (739 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:20.138754 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
W0531 18

W0531 18:49:26.304068 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (760 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:26.362171 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:26.406711 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors
W0531 18:49:26.914590 140234029037376 tokenization_utils.py:1934] Token indices sequence length is longer than the specified maximum sequence length for this model (640 > 512). Running this sequence through the model will result in indexing errors
W0531 18

## Padding
After tokenization, tokenized is a list of sentences -- each sentences is represented as a list of tokens. We want BERT to process our examples all at once (as one batch). It's just faster that way. For that reason, we need to pad all lists to the same size, so we can represent the input as one 2-d array, rather than a list of lists (of different lengths).

In [None]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

## Masking
If we directly send padded to BERT, that would slightly confuse it. We need to create another variable to tell it to ignore (mask) the padding we've added when it's processing its input. That's what attention_mask is:

In [None]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

The model() function runs our sentences through BERT. The results of the processing will be returned into last_hidden_states.

In [None]:
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
## Create dataset for faiss indexing

In [None]:
list_train = [features[i:i+100] for i in range(0,len(features),100)]

In [None]:
import faiss

dimension = 512
nlist = 5  # number of clusters
quantiser = faiss.IndexFlatL2(dimension)  
index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)


In [None]:
for x in list_train:
    db_vectors = embed(x).numpy()
    print(index.is_trained)   # False
    index.train(db_vectors)  # train on the database vectors
    print(index.ntotal)   # 0
    index.add(db_vectors)   # add the vectors and update the index
    print(index.is_trained)  # True
    print(index.ntotal)