In [1]:
import os
import pickle

import torch
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from transformers import BertModel, BertTokenizer
import re
import csv   

import sys
sys.path.append("..")
from src.bert import *


def get_context(token_ids, target_position, sequence_length=128):
    """
    Given a text containing a target word, return the sentence snippet which surrounds the target word
    (and the target word's position in the snippet).

    :param token_ids: list of token ids (for an entire line of text)
    :param target_position: index of the target word's position in `tokens`
    :param sequence_length: desired length for output sequence (e.g. 128, 256, 512)
    :return: (context_ids, new_target_position)
                context_ids: list of token ids for the output sequence
                new_target_position: index of the target word's position in `context_ids`
    """
    # -2 as [CLS] and [SEP] tokens will be added later; /2 as it's a one-sided window
    window_size = int((sequence_length - 2) / 2)
    context_start = max([0, target_position - window_size])
    padding_offset = max([0, window_size - target_position])
    padding_offset += max([0, target_position + window_size - len(token_ids)])

    context_ids = token_ids[context_start:target_position + window_size]
    context_ids += padding_offset * [0]

    new_target_position = target_position - context_start

    return context_ids, new_target_position

def get_usage_vectors( model,
                       batch_input_ids,
                       batch_tokens,
                       batch_snippets,
                       batch_pos, 
                       batch_decades):
    
    with torch.no_grad():
        # collect list of input ids into a single batch tensor
        input_ids_tensor = torch.tensor(batch_input_ids)
        if torch.cuda.is_available():
            input_ids_tensor = input_ids_tensor.to('cuda')

            
        #print(input_ids_tensor)
        # run usages through language model
        outputs = model(input_ids_tensor,  output_hidden_states=True )
        #print(len(outputs.hidden_states)) # items in the tuple = 1 + num layers
        if torch.cuda.is_available():
            hidden_states = [l.detach().cpu().clone().numpy() for l in outputs[2]]
        else:
            #print("fjekl")
            hidden_states = [l.clone().numpy() for l in outputs.hidden_states]

        # get usage vectors from hidden states
        hidden_states = np.stack(hidden_states)  # (13, B, |s|, 768)
        #print('Expected hidden states size: (13, B, |s|, 768). Got {}'.format(hidden_states.shape))
        # usage_vectors = np.sum(hidden_states, 0)  # (B, |s|, 768)
        # usage_vectors = hidden_states.view(hidden_states.shape[1],
        #                                    hidden_states.shape[2],
        #                                    -1)
        usage_vectors = np.sum(hidden_states[1:, :, :, :], axis=0)
        
        # usage_vectors = hidden_states.reshape((hidden_states.shape[1], hidden_states.shape[2], -1))
        #print(usage_vectors.shape)
        return usage_vectors
    
def get_feature_vectors( model,
                        feature_model,
                       batch_input_ids,
                       batch_tokens,
                       batch_snippets,
                       batch_pos, 
                       batch_decades):
    
    with torch.no_grad():
        # collect list of input ids into a single batch tensor
        input_ids_tensor = torch.tensor(batch_input_ids)
        if torch.cuda.is_available():
            input_ids_tensor = input_ids_tensor.to('cuda')

            
        #print(input_ids_tensor)
        # run usages through language model
        outputs = model(input_ids_tensor,  output_hidden_states=True )
        #print(len(outputs.hidden_states)) # items in the tuple = 1 + num layers
        if torch.cuda.is_available():
            hidden_states = [l.detach().cpu().clone().numpy() for l in outputs[2]]
        else:
            #print("fjekl")
            hidden_states = [l.clone().numpy() for l in outputs.hidden_states]

        # get usage vectors from hidden states
        hidden_states = np.stack(hidden_states)  # (13, B, |s|, 768)
        print('Expected hidden states size: (13, B, |s|, 768). Got {}'.format(hidden_states.shape))
        # usage_vectors = np.sum(hidden_states, 0)  # (B, |s|, 768)
        # usage_vectors = hidden_states.view(hidden_states.shape[1],
        #                                    hidden_states.shape[2],
        #                                    -1)
        
        #print(hidden_states.shape)
        #print(hidden_states[1:, :, :, :].shape)
        usage_vectors = hidden_states[8, :, :, :] # get the 8th layer, which the model expects as input
        
        # usage_vectors = hidden_states.reshape((hidden_states.shape[1], hidden_states.shape[2], -1))
        #print(usage_vectors.shape)
        
        
        
        return usage_vectors


def collect_from_coha(target_words,
                      decades,
                      sequence_length,
                      pretrained_weights='models/bert-base-uncased',
                      coha_dir='data/coha',
                      output_path=None,
                      buffer_size=1024):
    """
    Collect usages of target words from the COHA dataset.

    :param target_words: list of words whose usages are to be collected
    :param decades: list of year integers (e.g. list(np.arange(1910, 2001, 10)))
    :param sequence_length: the number of tokens in the context of a word occurrence
    :param pretrained_weights: path to model folder with weights and config file
    :param coha_dir: path to COHA directory (containing `all_1810.txt`, ..., `all_2000.txt`)
    :param output_path: path to output file for `usages` dictionary. If provided, data is stored
                        in this file incrementally (use e.g. to avoid out of memory errors)
    :param buffer_size: (max) number of usages to process in a single model run
    :return: usages: a dictionary from target words to lists of usage tuples
                     lemma -> [(vector, sentence, word_position, decade), (v, s, p, d), ...]
    """

    # load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    model = BertModel.from_pretrained(pretrained_weights)
    if torch.cuda.is_available():
        model.to('cuda')
        
        
    buchanan = torch.load('../trained_models/model.plsr.buchanan.allbuthomonyms.5k.300components.500max_iters')


    # build word-index vocabulary for target words
    i2w = {}
    for t, t_id in zip(target_words, tokenizer.encode(' '.join(target_words))[1:-1]): #changed for new hugginface api
        i2w[t_id] = t

    # buffers for batch processing
    batch_input_ids = []
    batch_tokens = []
    batch_pos = []
    batch_snippets = []
    batch_decades = []

    usages = defaultdict(list)  # w -> (vector, sentence, word_position, decade)
    
    # do collection
    
    print(len(decades))
    for T, decade in enumerate(decades):
        # one time interval at a time
        print('Decade {}...'.format(decade))

        
        ### gabriella changes
        ### my coha is organized differently. 
        ### the decades have random numbers for the alphabet index places , so i have to use regex
        ### to ignore that. 
        print(coha_dir)
        print(decade)
        my_regex = r'text_' + re.escape(str(decade)) + 's.*'

        #print("running through decade ", decade)

        # iterate through directories
        for decade_dir in os.listdir(coha_dir):

            if re.match(my_regex, decade_dir):
                # get all the text files for that decade
                # iterate through text files for this decade
                this_decade_files = os.listdir(os.path.join(coha_dir, decade_dir))
                for F, filename in enumerate(tqdm(this_decade_files)):
                    #print(filename)
                    
                    with open(os.path.join(coha_dir, decade_dir, filename), 'r') as f:
                        lines = f.readlines()
                        #print("gets here")

                        # get the usages from this file
                        for L, line in enumerate(lines):
                            #print("gets to line: ", L)



                            # tokenize line and convert to token ids
                            tokens = tokenizer.encode(line)

                            for pos, token in enumerate(tokens):
                                #print(token)
                                # store usage info of target words only
                                if token in i2w:
                                    context_ids, pos_in_context = get_context(tokens, pos, sequence_length)

                                    input_ids = [101] + context_ids + [102]


                                    # convert later to save storage space
                                    snippet = tokenizer.convert_ids_to_tokens(context_ids)
                                    #print(i2w[token])
                                    #print(' '.join(snippet))

                                    # add usage info to buffers
                                    batch_input_ids.append(input_ids)
                                    batch_tokens.append(i2w[token])
                                    batch_pos.append(pos_in_context)
                                    batch_snippets.append(snippet)
                                    batch_decades.append(decade)

#                                 print("batch size ", len(batch_input_ids))
#                                 print("lines left ", len(lines) - L)
#                                 print("files left in this decade" , len(this_decade_files) - F)
#                                 print("decades left", len(decades) - T)
                                    
                                # if the buffers are full...             or if we're at the end of the dataset
                                if (len(batch_input_ids) >= buffer_size) or (L == len(lines) - 1 and T == len(decades) - 1 and F==len(this_decade_files)):
                                    
                                    usage_vectors = get_feature_vectors(
                                        model,
                                        buchanan,
                                        batch_input_ids,
                                        batch_tokens,
                                        batch_snippets, 
                                        batch_pos, 
                                        batch_decades )


                                    # store usage tuples in a dictionary: lemma -> (vector, snippet, position, decade)
                                    #print(len(batch_input_ids))
                                    for b in np.arange(len(batch_input_ids)):
                                        #print(usage_vectors.shape)
                                        layer_8_bert_vector = usage_vectors[b, batch_pos[b]+1, :] # get the right position
                                        
                                        word, feature_vector = buchanan.predict_from_single_context_vector(batch_tokens[b], layer_8_bert_vector)
                                        
                                        usages[batch_tokens[b]].append(
                                             (feature_vector, batch_snippets[b], batch_pos[b], batch_decades[b]))
                                    
#                                     print(usages)
                        
                                    # finally, empty the batch buffers
                                    batch_input_ids, batch_tokens, batch_pos, batch_snippets, batch_decades = [], [], [], [], []
                        
        print("saving usages for decade")
        if os.path.exists(output_path):
            append_write = 'ab' # append if already exists
        else:
            append_write = 'wb' # make a new file if not

        # and store data incrementally
        if output_path:
            #print(append_write)
            with open(output_path, append_write) as f:
                pickle.dump(usages, file=f)
    

    return usages


  from .autonotebook import tqdm as notebook_tqdm


In [39]:
# Target words: we want to collect tokens of each of these words from COHA

targets = ['net', 'virtual', 'disk', 'card', 'optical', 'virus',
           'signal', 'mirror', 'energy', 'compact', 'leaf',
           'brick', 'federal', 'sphere', 'coach', 'spine', 'parent', 'sleep']

decades = [decade for decade in np.arange(1910, 2009, 10)]
print(len(decades))

buffer_size=1024
sequence_length=128


coha_dir = '/home/shared/corpora/Corpus of Historical American English/TEXTS'
bert_dir = 'bert-base-uncased'
output_dir = '../data/cwr4lsc/feature_prediction'

10


In [40]:
### collect just the usages and not the vectors. 

for decade in decades:
    collect_from_coha(targets,
                      [decade],
                      sequence_length=sequence_length,
                      pretrained_weights=bert_dir,
                      coha_dir=coha_dir,
                      output_path='{}/usages_with_vectors_16_len{}_{}.dict'.format(output_dir, sequence_length, decade),
                      buffer_size=buffer_size)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1910...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1910


  0%|                                                                                                                         | 0/3355 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (7482 > 512). Running this sequence through the model will result in indexing errors
  8%|█████████▍                                                                                                     | 284/3355 [00:18<02:28, 20.69it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 18%|████████████████████▍                                                                                          | 616/3355 [00:43<01:15, 36.51it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 28%|███████████████████████████████▌                                                                               | 955/3355 [01:09<02:54, 13.78it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 36%|███████████████████████████████████████                                                                       | 1193/3355 [01:39<14:42,  2.45it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 37%|████████████████████████████████████████▋                                                                     | 1241/3355 [02:04<08:18,  4.24it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 39%|██████████████████████████████████████████▍                                                                   | 1294/3355 [02:33<28:06,  1.22it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 40%|████████████████████████████████████████████▏                                                                 | 1349/3355 [03:08<15:55,  2.10it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 42%|█████████████████████████████████████████████▉                                                                | 1400/3355 [03:41<09:51,  3.31it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 44%|███████████████████████████████████████████████▊                                                              | 1460/3355 [04:14<03:51,  8.18it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 45%|█████████████████████████████████████████████████▊                                                            | 1520/3355 [04:48<34:47,  1.14s/it]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 47%|███████████████████████████████████████████████████▌                                                          | 1571/3355 [05:17<11:41,  2.54it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3355/3355 [05:45<00:00,  9.71it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1920...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1920


  0%|                                                                                                                        | 0/11557 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5408 > 512). Running this sequence through the model will result in indexing errors
 14%|███████████████▎                                                                                            | 1639/11557 [00:23<01:31, 108.14it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 28%|██████████████████████████████▋                                                                             | 3278/11557 [00:52<00:40, 206.01it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 51%|███████████████████████████████████████████████████████▉                                                     | 5936/11557 [01:21<03:11, 29.29it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|████████████████████████████████████████████████████████████████████▍                                        | 7258/11557 [01:47<09:15,  7.74it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 64%|█████████████████████████████████████████████████████████████████████▌                                       | 7372/11557 [02:19<34:08,  2.04it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 64%|█████████████████████████████████████████████████████████████████████▉                                       | 7412/11557 [02:52<45:10,  1.53it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 64%|██████████████████████████████████████████████████████████████████████▏                                      | 7445/11557 [03:19<19:57,  3.43it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|██████████████████████████████████████████████████████████████████████▋                                      | 7494/11557 [03:46<42:17,  1.60it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|███████████████████████████████████████████████████████████████████████                                      | 7539/11557 [04:18<38:01,  1.76it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 66%|███████████████████████████████████████████████████████████████████████▌                                     | 7592/11557 [04:52<50:12,  1.32it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 66%|████████████████████████████████████████████████████████████████████████▍                                    | 7674/11557 [05:22<18:16,  3.54it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 68%|█████████████████████████████████████████████████████████████████████████▌                                   | 7805/11557 [05:45<02:49, 22.13it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 81%|███████████████████████████████████████████████████████████████████████████████████████▌                    | 9373/11557 [06:09<00:20, 104.08it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████      | 10922/11557 [06:33<00:06, 96.62it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11557/11557 [06:46<00:00, 28.41it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1930...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1930


  0%|                                                                                                                        | 0/10352 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
 10%|██████████▌                                                                                                   | 996/10352 [00:20<06:30, 23.94it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 17%|██████████████████▋                                                                                          | 1776/10352 [00:41<05:26, 26.29it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 30%|█████████████████████████████████▏                                                                           | 3149/10352 [01:06<05:19, 22.57it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 51%|██████████████████████████████████████████████████████▊                                                     | 5259/10352 [01:30<00:25, 197.91it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 56%|████████████████████████████████████████████████████████████▋                                                | 5761/10352 [01:56<21:04,  3.63it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 57%|█████████████████████████████████████████████████████████████▊                                               | 5873/10352 [02:25<26:30,  2.82it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 58%|██████████████████████████████████████████████████████████████▋                                              | 5959/10352 [02:56<26:22,  2.78it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 58%|███████████████████████████████████████████████████████████████▎                                             | 6018/10352 [03:24<37:53,  1.91it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 59%|████████████████████████████████████████████████████████████████▎                                            | 6108/10352 [03:53<14:51,  4.76it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 60%|█████████████████████████████████████████████████████████████████▏                                           | 6192/10352 [04:22<27:38,  2.51it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 60%|█████████████████████████████████████████████████████████████████▊                                           | 6245/10352 [04:40<28:49,  2.37it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 61%|██████████████████████████████████████████████████████████████████▎                                          | 6298/10352 [05:06<22:20,  3.02it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 62%|███████████████████████████████████████████████████████████████████▉                                         | 6453/10352 [05:31<09:17,  7.00it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 69%|███████████████████████████████████████████████████████████████████████████                                 | 7194/10352 [05:56<00:27, 112.85it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 80%|██████████████████████████████████████████████████████████████████████████████████████▏                     | 8262/10352 [06:14<00:20, 101.71it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 88%|███████████████████████████████████████████████████████████████████████████████████████████████▍            | 9144/10352 [06:31<00:10, 116.12it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏| 10279/10352 [06:51<00:01, 71.73it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10352/10352 [06:59<00:00, 24.69it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1940...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1940


  0%|                                                                                                                        | 0/11343 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (768 > 512). Running this sequence through the model will result in indexing errors
 17%|██████████████████▍                                                                                          | 1925/11343 [00:20<03:25, 45.87it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 35%|██████████████████████████████████████▌                                                                      | 4015/11343 [00:51<01:34, 77.55it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 51%|███████████████████████████████████████████████████████▍                                                     | 5770/11343 [01:18<01:11, 77.82it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 61%|██████████████████████████████████████████████████████████████████▏                                          | 6893/11343 [01:42<13:25,  5.53it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 62%|███████████████████████████████████████████████████████████████████▍                                         | 7013/11343 [02:11<19:40,  3.67it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 62%|███████████████████████████████████████████████████████████████████▉                                         | 7068/11343 [02:37<15:47,  4.51it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|████████████████████████████████████████████████████████████████████▋                                        | 7146/11343 [03:04<31:58,  2.19it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|█████████████████████████████████████████████████████████████████████                                        | 7185/11343 [03:31<38:48,  1.79it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 64%|█████████████████████████████████████████████████████████████████████▋                                       | 7251/11343 [03:57<16:06,  4.24it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|██████████████████████████████████████████████████████████████████████▎                                      | 7317/11343 [04:27<48:55,  1.37it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|██████████████████████████████████████████████████████████████████████▊                                      | 7369/11343 [04:52<24:56,  2.66it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 66%|███████████████████████████████████████████████████████████████████████▌                                     | 7453/11343 [05:13<28:25,  2.28it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 66%|███████████████████████████████████████████████████████████████████████▉                                     | 7484/11343 [05:33<25:58,  2.48it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 76%|██████████████████████████████████████████████████████████████████████████████████▎                         | 8641/11343 [05:59<00:23, 114.19it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 88%|███████████████████████████████████████████████████████████████████████████████████████████████▎            | 10012/11343 [06:21<00:16, 82.01it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11343/11343 [06:43<00:00, 28.08it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1950...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1950


  0%|                                                                                                                        | 0/11935 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors
 14%|███████████████▋                                                                                             | 1722/11935 [00:18<06:35, 25.79it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 26%|████████████████████████████▎                                                                               | 3123/11935 [00:42<00:56, 156.13it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 42%|█████████████████████████████████████████████▎                                                              | 5002/11935 [01:06<00:44, 154.48it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 57%|█████████████████████████████████████████████████████████████                                               | 6749/11935 [01:31<00:50, 102.05it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 62%|███████████████████████████████████████████████████████████████████▍                                         | 7379/11935 [01:54<12:48,  5.92it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 62%|████████████████████████████████████████████████████████████████████                                         | 7454/11935 [02:25<05:12, 14.36it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|████████████████████████████████████████████████████████████████████▊                                        | 7534/11935 [02:54<40:19,  1.82it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 64%|█████████████████████████████████████████████████████████████████████▋                                       | 7624/11935 [03:21<04:59, 14.42it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 64%|██████████████████████████████████████████████████████████████████████▏                                      | 7683/11935 [03:47<15:55,  4.45it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|██████████████████████████████████████████████████████████████████████▌                                      | 7733/11935 [04:14<47:23,  1.48it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|██████████████████████████████████████████████████████████████████████▉                                      | 7770/11935 [04:37<24:42,  2.81it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 66%|███████████████████████████████████████████████████████████████████████▌                                     | 7832/11935 [05:03<04:40, 14.62it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 67%|████████████████████████████████████████████████████████████████████████▌                                    | 7952/11935 [05:28<07:21,  9.03it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 68%|██████████████████████████████████████████████████████████████████████████                                   | 8111/11935 [05:54<01:44, 36.56it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 80%|███████████████████████████████████████████████████████████████████████████████████████▏                     | 9544/11935 [06:15<00:24, 97.88it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 90%|████████████████████████████████████████████████████████████████████████████████████████████████▍          | 10762/11935 [06:36<00:10, 109.26it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 11786/11935 [06:53<00:01, 91.36it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11935/11935 [07:02<00:00, 28.26it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1960...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1960


  0%|                                                                                                                        | 0/10113 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors
 15%|███████████████▉                                                                                             | 1484/10113 [00:19<01:46, 81.25it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 29%|███████████████████████████████▋                                                                             | 2940/10113 [00:44<06:15, 19.11it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 42%|█████████████████████████████████████████████▏                                                              | 4233/10113 [01:07<00:33, 174.23it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 54%|███████████████████████████████████████████████████████████▏                                                 | 5489/10113 [01:30<00:46, 99.87it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 57%|██████████████████████████████████████████████████████████████▏                                              | 5768/10113 [01:58<35:31,  2.04it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 58%|███████████████████████████████████████████████████████████████                                              | 5851/10113 [02:24<17:37,  4.03it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 59%|███████████████████████████████████████████████████████████████▊                                             | 5918/10113 [02:52<29:24,  2.38it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 59%|███████████████████████████████████████████████████████████████                                            | 5965/10113 [03:19<1:20:11,  1.16s/it]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 60%|████████████████████████████████████████████████████████████████▉                                            | 6022/10113 [03:46<54:14,  1.26it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 60%|█████████████████████████████████████████████████████████████████▋                                           | 6089/10113 [04:14<14:03,  4.77it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 61%|██████████████████████████████████████████████████████████████████▏                                          | 6144/10113 [04:43<41:03,  1.61it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 62%|███████████████████████████████████████████████████████████████████▎                                         | 6251/10113 [05:12<06:33,  9.80it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|████████████████████████████████████████████████████████████████████▏                                        | 6324/10113 [05:31<25:46,  2.45it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 71%|█████████████████████████████████████████████████████████████████████████████▊                               | 7218/10113 [05:54<00:30, 95.94it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 82%|█████████████████████████████████████████████████████████████████████████████████████████▏                   | 8274/10113 [06:13<00:19, 96.12it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 9407/10113 [06:33<00:07, 94.08it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10113/10113 [06:48<00:00, 24.74it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1970...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1970


  0%|                                                                                                                         | 0/9419 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1233 > 512). Running this sequence through the model will result in indexing errors
 13%|██████████████▏                                                                                               | 1215/9419 [00:15<04:42, 29.08it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 25%|███████████████████████████▎                                                                                 | 2360/9419 [00:38<00:44, 158.79it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 33%|████████████████████████████████████▏                                                                         | 3103/9419 [00:57<02:29, 42.34it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 44%|████████████████████████████████████████████████▊                                                             | 4177/9419 [01:17<01:58, 44.21it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 55%|███████████████████████████████████████████████████████████▌                                                 | 5143/9419 [01:36<00:40, 106.58it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|█████████████████████████████████████████████████████████████████████▏                                        | 5926/9419 [01:53<02:50, 20.49it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|█████████████████████████████████████████████████████████████████████▋                                        | 5967/9419 [02:23<35:20,  1.63it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|██████████████████████████████████████████████████████████████████████▉                                       | 6078/9419 [02:49<23:08,  2.41it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 65%|███████████████████████████████████████████████████████████████████████▍                                      | 6121/9419 [03:16<18:08,  3.03it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 66%|████████████████████████████████████████████████████████████████████████▎                                     | 6192/9419 [03:43<10:34,  5.09it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 66%|████████████████████████████████████████████████████████████████████████▊                                     | 6232/9419 [04:09<28:49,  1.84it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 67%|█████████████████████████████████████████████████████████████████████████▎                                    | 6275/9419 [04:35<35:22,  1.48it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 67%|█████████████████████████████████████████████████████████████████████████▊                                    | 6324/9419 [04:52<06:01,  8.57it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 68%|██████████████████████████████████████████████████████████████████████████▍                                   | 6373/9419 [05:21<16:12,  3.13it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 68%|██████████████████████████████████████████████████████████████████████████▉                                   | 6415/9419 [05:47<37:56,  1.32it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 72%|███████████████████████████████████████████████████████████████████████████████▌                              | 6816/9419 [06:05<00:27, 94.07it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 81%|████████████████████████████████████████████████████████████████████████████████████████▌                    | 7658/9419 [06:21<00:16, 105.81it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 88%|████████████████████████████████████████████████████████████████████████████████████████████████▊             | 8295/9419 [06:36<00:15, 70.89it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 8902/9419 [06:51<00:06, 76.39it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9419/9419 [07:05<00:00, 22.15it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1980...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1980


  0%|                                                                                                                        | 0/11106 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4048 > 512). Running this sequence through the model will result in indexing errors
  9%|█████████▊                                                                                                  | 1007/11106 [00:10<01:02, 162.41it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 18%|███████████████████▉                                                                                         | 2035/11106 [00:32<02:38, 57.31it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 26%|████████████████████████████▎                                                                                | 2886/11106 [00:50<02:22, 57.88it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 33%|████████████████████████████████████                                                                         | 3672/11106 [01:09<01:37, 76.01it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 42%|█████████████████████████████████████████████▊                                                               | 4665/11106 [01:30<04:07, 25.99it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 55%|███████████████████████████████████████████████████████████▏                                                | 6083/11106 [01:52<00:42, 119.17it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 63%|█████████████████████████████████████████████████████████████████████                                        | 7034/11106 [02:12<01:10, 57.51it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 73%|███████████████████████████████████████████████████████████████████████████████▊                             | 8128/11106 [02:31<00:59, 50.15it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 83%|██████████████████████████████████████████████████████████████████████████████████████████▉                  | 9271/11106 [02:50<00:19, 93.62it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 91%|██████████████████████████████████████████████████████████████████████████████████████████████████▏         | 10102/11106 [03:09<00:53, 18.62it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 92%|██████████████████████████████████████████████████████████████████████████████████████████████████▊         | 10167/11106 [03:38<08:01,  1.95it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 92%|███████████████████████████████████████████████████████████████████████████████████████████████████▊        | 10268/11106 [04:04<02:07,  6.57it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 10313/11106 [04:30<07:40,  1.72it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 10348/11106 [04:57<13:11,  1.04s/it]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 10406/11106 [05:25<03:28,  3.36it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 10437/11106 [05:47<03:15,  3.42it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 10475/11106 [06:11<08:29,  1.24it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏     | 10513/11106 [06:31<04:10,  2.36it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 10542/11106 [06:50<04:02,  2.33it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 10579/11106 [07:12<03:19,  2.65it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████    | 10699/11106 [07:33<00:12, 31.69it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11106/11106 [07:43<00:00, 23.96it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 1990...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1990


  0%|                                                                                                                         | 0/9778 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors
  8%|████████▍                                                                                                      | 748/9778 [00:14<03:37, 41.45it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 14%|███████████████▋                                                                                              | 1391/9778 [00:34<01:34, 88.59it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 19%|█████████████████████▍                                                                                        | 1901/9778 [00:54<02:09, 60.67it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 26%|███████████████████████████▉                                                                                 | 2506/9778 [01:13<01:01, 117.78it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 31%|█████████████████████████████████▉                                                                            | 3020/9778 [01:30<02:51, 39.48it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 38%|██████████████████████████████████████████                                                                    | 3740/9778 [01:50<02:39, 37.79it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 45%|█████████████████████████████████████████████████▏                                                            | 4374/9778 [02:10<02:22, 38.04it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 49%|█████████████████████████████████████████████████████▊                                                        | 4786/9778 [02:26<01:57, 42.57it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 54%|███████████████████████████████████████████████████████████▊                                                  | 5312/9778 [02:44<01:23, 53.35it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 60%|█████████████████████████████████████████████████████████████████▉                                            | 5866/9778 [03:03<02:04, 31.46it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 68%|██████████████████████████████████████████████████████████████████████████▌                                   | 6628/9778 [03:21<00:49, 63.97it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 72%|███████████████████████████████████████████████████████████████████████████████▌                              | 7072/9778 [03:42<02:56, 15.31it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 75%|██████████████████████████████████████████████████████████████████████████████████▍                           | 7328/9778 [04:05<02:43, 14.95it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 77%|████████████████████████████████████████████████████████████████████████████████████▊                         | 7534/9778 [04:27<02:48, 13.33it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 79%|███████████████████████████████████████████████████████████████████████████████████████▏                      | 7752/9778 [04:50<02:09, 15.60it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 81%|█████████████████████████████████████████████████████████████████████████████████████████▍                    | 7949/9778 [05:13<03:47,  8.04it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 83%|██████████████████████████████████████████████████████████████████████████████████████████▊                   | 8077/9778 [05:33<14:47,  1.92it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 83%|███████████████████████████████████████████████████████████████████████████████████████████▏                  | 8111/9778 [05:59<17:01,  1.63it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 83%|███████████████████████████████████████████████████████████████████████████████████████████▌                  | 8138/9778 [06:22<16:45,  1.63it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 86%|██████████████████████████████████████████████████████████████████████████████████████████████                | 8361/9778 [06:47<01:30, 15.67it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 87%|███████████████████████████████████████████████████████████████████████████████████████████████▌              | 8500/9778 [07:06<02:41,  7.91it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 88%|█████████████████████████████████████████████████████████████████████████████████████████████████             | 8628/9778 [07:26<01:23, 13.77it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 90%|██████████████████████████████████████████████████████████████████████████████████████████████████▊           | 8784/9778 [07:46<04:28,  3.70it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 94%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 9229/9778 [08:07<00:24, 22.32it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉  | 9595/9778 [08:28<00:07, 25.31it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9778/9778 [08:44<00:00, 18.63it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1
Decade 2000...
/home/shared/corpora/Corpus of Historical American English/TEXTS
2000


  0%|                                                                                                                        | 0/13795 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors
  7%|███████▏                                                                                                      | 898/13795 [00:11<04:14, 50.76it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 11%|████████████▍                                                                                                | 1581/13795 [00:33<07:55, 25.71it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 14%|███████████████▌                                                                                             | 1970/13795 [00:57<11:32, 17.08it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 16%|█████████████████▎                                                                                           | 2185/13795 [01:18<13:50, 13.99it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 18%|███████████████████▊                                                                                         | 2504/13795 [01:42<11:12, 16.79it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 20%|█████████████████████▊                                                                                       | 2768/13795 [02:04<08:11, 22.42it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 23%|████████████████████████▌                                                                                    | 3109/13795 [02:26<09:31, 18.71it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 26%|████████████████████████████▏                                                                               | 3593/13795 [02:48<01:33, 108.77it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 31%|█████████████████████████████████▏                                                                          | 4234/13795 [03:06<01:33, 101.82it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 37%|███████████████████████████████████████▊                                                                     | 5046/13795 [03:25<03:14, 44.87it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 39%|██████████████████████████████████████████▌                                                                  | 5392/13795 [03:47<06:30, 21.50it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 43%|██████████████████████████████████████████████▌                                                              | 5890/13795 [04:07<02:31, 52.32it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 45%|█████████████████████████████████████████████████▌                                                           | 6275/13795 [04:24<03:06, 40.23it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 50%|██████████████████████████████████████████████████████▉                                                      | 6956/13795 [04:43<02:36, 43.70it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 55%|████████████████████████████████████████████████████████████▍                                                | 7652/13795 [05:00<02:15, 45.27it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 62%|███████████████████████████████████████████████████████████████████▌                                         | 8550/13795 [05:19<01:14, 69.97it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 67%|█████████████████████████████████████████████████████████████████████████▎                                   | 9285/13795 [05:41<03:16, 22.91it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 70%|████████████████████████████████████████████████████████████████████████████▎                                | 9664/13795 [06:04<01:52, 36.67it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 74%|███████████████████████████████████████████████████████████████████████████████▊                            | 10194/13795 [06:27<01:36, 37.39it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 77%|███████████████████████████████████████████████████████████████████████████████████▏                        | 10632/13795 [06:50<01:55, 27.29it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 82%|████████████████████████████████████████████████████████████████████████████████████████▍                   | 11290/13795 [07:15<00:55, 44.78it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 84%|██████████████████████████████████████████████████████████████████████████████████████████▍                 | 11556/13795 [07:36<00:57, 39.01it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 86%|█████████████████████████████████████████████████████████████████████████████████████████████▍              | 11927/13795 [07:52<00:43, 43.04it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 12408/13795 [08:10<00:27, 50.86it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 93%|████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 12811/13795 [08:27<00:26, 36.81it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 13222/13795 [08:47<00:14, 38.82it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌| 13736/13795 [09:11<00:01, 35.91it/s]

Expected hidden states size: (13, B, |s|, 768). Got (13, 1024, 128, 768)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13795/13795 [09:21<00:00, 24.57it/s]


saving usages for decade


In [None]:
# import pickle
# import argparse
# import numpy as np
# from usage_collector import collect_from_coha

# parser = argparse.ArgumentParser()
# parser.add_argument('--seqlen', type=int, default=128)
# parser.add_argument('--bertdir', type=str, default='models/bert-base-uncased')
# parser.add_argument('--cohadir', type=str, default='data/coha')
# parser.add_argument('--outdir', type=str, default='data')
# parser.add_argument('--buffer', type=int, default=1024)

# args = parser.parse_args()

# targets = ['net', 'virtual', 'disk', 'card', 'optical', 'virus',
#            'signal', 'mirror', 'energy', 'compact', 'leaf',
#            'brick', 'federal', 'sphere', 'coach', 'spine', 'parent', 'sleep']

# print('{}\nSEQUENCE LENGTH: {}\n{}'.format('-' * 30, args.seqlen, '-' * 30))

# # decades = list(np.arange(1910, 2001, 10))
# # decades = list(np.arange(1810, 1811, 10))

# for decade in np.arange(1910, 2009, 10):
#     collect_from_coha(targets,
#                       [decade],
#                       sequence_length=args.seqlen,
#                       pretrained_weights=args.bertdir,
#                       coha_dir=args.cohadir,
#                       output_path='{}/concat/usages_16_len{}_{}.dict'.format(args.outdir, args.seqlen, decade),
#                       buffer_size=args.buffer)

#     # # Save usages
#     # with open('{}/concat/usages_16_len{}_{}.dict'.format(args.outdir, args.seqlen, decade), 'wb') as f:
#     #     pickle.dump(usages, file=f)
#     # usages = None


In [None]:
with open('/Users/gabriellachronis/Box Sync/src/cwr4lsc/test_output', 'rb') as file:

    # dump information to that file
    data = pickle.load(file)