In [1]:
!conda install tqdm

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 22.9.0
  latest version: 23.7.4

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Retrieving notices: ...working... done


In [4]:
import os
import pickle

import torch
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from transformers import BertModel, BertTokenizer
import re
import csv   



def get_context(token_ids, target_position, sequence_length=128):
    """
    Cut a fixed-size window of token ids around a target token.

    The window runs from `half_window` ids before the target up to (but not including)
    `half_window` ids after it, where `half_window` is half of `sequence_length - 2`.
    Whenever the window sticks out past the start or the end of `token_ids`, the missing
    ids are made up for with zeros, which are always appended at the END of the window.
    The input list itself is left untouched.

    :param token_ids: list of token ids (for an entire line of text)
    :param target_position: index of the target word in `token_ids`
    :param sequence_length: desired length of the final model input (e.g. 128, 256, 512),
                            counting the two special tokens the caller adds afterwards
    :return: (context_ids, new_target_position)
                context_ids: list of token ids for the window, zero-padded on the right
                new_target_position: index of the target word inside `context_ids`
    """
    # two slots are reserved for [CLS] and [SEP]; what is left is split into two halves
    half_window = int((sequence_length - 2) / 2)

    left_edge = max(0, target_position - half_window)
    right_edge = target_position + half_window

    # number of ids the window is short of on each side of the line
    missing_left = max(0, half_window - target_position)
    missing_right = max(0, right_edge - len(token_ids))

    # slicing copies, so extending the snippet does not modify the caller's list
    snippet_ids = token_ids[left_edge:right_edge]
    snippet_ids.extend([0] * (missing_left + missing_right))

    return snippet_ids, target_position - left_edge

def get_usage_vectors( model,
                       batch_input_ids,
                       batch_tokens,
                       batch_snippets,
                       batch_pos, 
                       batch_decades,
                       pad_token_id=0):
    """
    Run a batch of equal-length token-id sequences through the model and return, for every
    token position, the sum of the hidden states of all layers except the first one
    (the embedding output).

    :param model: model called as `model(input_ids, attention_mask=..., output_hidden_states=True)`
                  whose output exposes the per-layer hidden states either as `.hidden_states`
                  or as its third element (e.g. a `BertModel`)
    :param batch_input_ids: list of B token-id lists, all of the same length |s|
    :param batch_tokens: not used here; kept so existing callers keep working
    :param batch_snippets: not used here; kept so existing callers keep working
    :param batch_pos: not used here; kept so existing callers keep working
    :param batch_decades: not used here; kept so existing callers keep working
    :param pad_token_id: id that marks padding positions; these positions are masked out of
                         attention so that real tokens do not attend to padding (default 0)
    :return: numpy array of shape (B, |s|, hidden_size)
    """
    # put the batch on the device the model's weights live on; fall back to the
    # "cuda if available" rule when the model exposes no parameters
    first_param = next(model.parameters(), None) if hasattr(model, 'parameters') else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        # collect list of input ids into a single batch tensor
        input_ids_tensor = torch.tensor(batch_input_ids, dtype=torch.long, device=device)

        # 1 for real tokens, 0 for padding
        attention_mask = (input_ids_tensor != pad_token_id).long()

        # run usages through language model
        outputs = model(input_ids_tensor,
                        attention_mask=attention_mask,
                        output_hidden_states=True)

        # tuple with (1 + num layers) tensors, each of shape (B, |s|, hidden_size)
        hidden_states = outputs.hidden_states if hasattr(outputs, 'hidden_states') else outputs[2]

        # sum every layer but the first, accumulating on the model's device so that the
        # full stack of layers is never materialised in host memory
        summed = torch.zeros_like(hidden_states[0])
        for layer_output in hidden_states[1:]:
            summed += layer_output

    return summed.detach().cpu().numpy()


def collect_from_coha(target_words,
                      decades,
                      sequence_length,
                      pretrained_weights='models/bert-base-uncased',
                      coha_dir='data/coha',
                      output_path=None,
                      buffer_size=1024):
    """
    Collect usages of target words from the COHA dataset.

    For every decade, all sub-directories of `coha_dir` whose name starts with
    `text_<decade>s` are scanned. Each line of each file is tokenized, and for every
    occurrence of a target word a fixed-size context window is cut out, run through the
    model, and the resulting vector of the target position is stored.

    :param target_words: list of words whose usages are to be collected. Words that the
                         tokenizer does not map to exactly one token id are skipped
                         (a message is printed), because occurrences are matched on a
                         single token id.
    :param decades: list of year integers (e.g. list(np.arange(1910, 2001, 10)))
    :param sequence_length: the number of tokens in the context of a word occurrence
    :param pretrained_weights: path to model folder with weights and config file
    :param coha_dir: path to COHA directory (containing one `text_<decade>s...` directory
                     of text files per decade)
    :param output_path: path to output file for `usages` dictionary. If provided, the
                        dictionary collected so far is pickled to this file after every
                        decade. NOTE: if the file already exists the dump is APPENDED, so
                        the file may hold several pickled objects; read it back with
                        repeated `pickle.load` calls, and remove stale files before a re-run.
    :param buffer_size: (max) number of usages to process in a single model run
    :return: usages: a dictionary from target words to lists of usage tuples
                     lemma -> [(vector, sentence, word_position, decade), (v, s, p, d), ...]
    """
    # make sure the output location exists before any expensive work is done
    if output_path:
        output_parent = os.path.dirname(output_path)
        if output_parent:
            os.makedirs(output_parent, exist_ok=True)

    # load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    model = BertModel.from_pretrained(pretrained_weights)
    if torch.cuda.is_available():
        model.to('cuda')

    # build word-index vocabulary for target words, one word at a time so that a word
    # split into several word pieces cannot shift the ids of the words that follow it
    i2w = {}
    for word in target_words:
        word_ids = tokenizer.encode(word, add_special_tokens=False)
        if len(word_ids) != 1:
            print('Skipping target word {!r}: tokenized into {} pieces instead of 1'.format(
                word, len(word_ids)))
            continue
        i2w[word_ids[0]] = word

    # buffers for batch processing
    batch_input_ids = []
    batch_tokens = []
    batch_pos = []
    batch_snippets = []
    batch_decades = []

    usages = defaultdict(list)  # w -> (vector, sentence, word_position, decade)

    def flush_buffers():
        """Run the buffered usages through the model, store the results, empty the buffers."""
        if not batch_input_ids:
            return

        usage_vectors = get_usage_vectors(
            model,
            batch_input_ids,
            batch_tokens,
            batch_snippets,
            batch_pos,
            batch_decades)

        # store usage tuples in a dictionary: lemma -> (vector, snippet, position, decade)
        buffered = zip(batch_tokens, batch_snippets, batch_pos, batch_decades)
        for b, (word, snippet, pos_in_context, usage_decade) in enumerate(buffered):
            # +1 skips the leading [CLS]; copy() so the stored vector does not keep the
            # whole (B, |s|, hidden) batch array alive in memory
            usage_vector = usage_vectors[b, pos_in_context + 1, :].copy()
            usages[word].append((usage_vector, snippet, pos_in_context, usage_decade))

        # finally, empty the batch buffers
        for buffer in (batch_input_ids, batch_tokens, batch_pos, batch_snippets, batch_decades):
            buffer.clear()

    # do collection
    print(len(decades))
    for decade in decades:
        # one time interval at a time
        print('Decade {}...'.format(decade))

        # the decade directories carry extra characters after `text_<decade>s`,
        # so they are matched on their prefix only
        print(coha_dir)
        print(decade)
        my_regex = r'text_' + re.escape(str(decade)) + 's.*'

        # iterate through directories
        for decade_dir in os.listdir(coha_dir):
            if not re.match(my_regex, decade_dir):
                continue

            # iterate through text files for this decade
            this_decade_files = os.listdir(os.path.join(coha_dir, decade_dir))
            for filename in tqdm(this_decade_files):
                with open(os.path.join(coha_dir, decade_dir, filename), 'r') as f:
                    lines = f.readlines()

                # get the usages from this file
                for line in lines:
                    # tokenize line and convert to token ids; special tokens are left out
                    # here because [CLS]/[SEP] are added around each context window below
                    tokens = tokenizer.encode(line, add_special_tokens=False)

                    for pos, token in enumerate(tokens):
                        # store usage info of target words only
                        if token not in i2w:
                            continue

                        context_ids, pos_in_context = get_context(tokens, pos, sequence_length)

                        # 101 = [CLS], 102 = [SEP]
                        input_ids = [101] + context_ids + [102]

                        # add usage info to buffers
                        batch_input_ids.append(input_ids)
                        batch_tokens.append(i2w[token])
                        batch_pos.append(pos_in_context)
                        batch_snippets.append(tokenizer.convert_ids_to_tokens(context_ids))
                        batch_decades.append(decade)

                        # if the buffers are full, process them
                        if len(batch_input_ids) >= buffer_size:
                            flush_buffers()

        # process whatever is left in the buffers so that the last (partial) batch of the
        # decade is not lost and is part of what gets saved below
        flush_buffers()

        print("saving usages for decade")

        # and store data incrementally
        if output_path:
            # append if the file already exists, make a new file if not
            append_write = 'ab' if os.path.exists(output_path) else 'wb'
            with open(output_path, append_write) as f:
                pickle.dump(usages, file=f)

    return usages


In [5]:
# Target words: every occurrence of each of these words is collected from COHA.
targets = [
    'net', 'virtual', 'disk', 'card', 'optical', 'virus',
    'signal', 'mirror', 'energy', 'compact', 'leaf', 'brick',
    'federal', 'sphere', 'coach', 'spine', 'parent', 'sleep',
]

# Start year of every decade from 1910 up to and including 2000.
decades = list(np.arange(1910, 2009, 10))
print(len(decades))

# Max number of usages per model run, and token length of each model input.
buffer_size = 1024
sequence_length = 128

# Corpus location, pretrained model to load, and directory the usage files are written to.
coha_dir = '/home/shared/corpora/Corpus of Historical American English/TEXTS'
bert_dir = 'bert-base-uncased'
output_dir = '../data/cwr4lsc'

10


In [6]:
# Collect the usages (including their vectors) one decade at a time; every decade is
# handled by its own call and written to its own output file.
for decade in decades:
    decade_output_path = '{}/usages_with_vectors_16_len{}_{}.dict'.format(
        output_dir, sequence_length, decade)

    collect_from_coha(
        targets,
        [decade],
        sequence_length=sequence_length,
        pretrained_weights=bert_dir,
        coha_dir=coha_dir,
        output_path=decade_output_path,
        buffer_size=buffer_size,
    )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1910...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1910


  0%|                                                                                                                                                                                  | 0/3355 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (7482 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3355/3355 [05:33<00:00, 10.06it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1920...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1920


  0%|                                                                                                                                                                                 | 0/11557 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5408 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11557/11557 [06:31<00:00, 29.55it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1930...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1930


  0%|                                                                                                                                                                                 | 0/10352 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10352/10352 [06:43<00:00, 25.67it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1940...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1940


  0%|                                                                                                                                                                                 | 0/11343 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (768 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11343/11343 [06:27<00:00, 29.26it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1950...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1950


  0%|                                                                                                                                                                                 | 0/11935 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11935/11935 [06:46<00:00, 29.37it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1960...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1960


  0%|                                                                                                                                                                                 | 0/10113 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (693 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10113/10113 [06:33<00:00, 25.72it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1970...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1970


  0%|                                                                                                                                                                                  | 0/9419 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1233 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9419/9419 [06:54<00:00, 22.70it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1980...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1980


  0%|                                                                                                                                                                                 | 0/11106 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4048 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11106/11106 [07:21<00:00, 25.15it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 1990...
/home/shared/corpora/Corpus of Historical American English/TEXTS
1990


  0%|                                                                                                                                                                                  | 0/9778 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9778/9778 [08:30<00:00, 19.16it/s]


saving usages for decade


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1
Decade 2000...
/home/shared/corpora/Corpus of Historical American English/TEXTS
2000


  0%|                                                                                                                                                                                 | 0/13795 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (642 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13795/13795 [09:10<00:00, 25.06it/s]


saving usages for decade


In [11]:
# Scratch check: write two rows to a CSV file named 'file' in the working directory.
rows = [
    [1, 2, 3],
    [4, 5, 6],
]

# newline='' hands line-ending control to the csv module, as its documentation requires;
# without it some platforms write an extra carriage return per row.
with open('file', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(rows)

In [None]:
# import pickle
# import argparse
# import numpy as np
# from usage_collector import collect_from_coha

# parser = argparse.ArgumentParser()
# parser.add_argument('--seqlen', type=int, default=128)
# parser.add_argument('--bertdir', type=str, default='models/bert-base-uncased')
# parser.add_argument('--cohadir', type=str, default='data/coha')
# parser.add_argument('--outdir', type=str, default='data')
# parser.add_argument('--buffer', type=int, default=1024)

# args = parser.parse_args()

# targets = ['net', 'virtual', 'disk', 'card', 'optical', 'virus',
#            'signal', 'mirror', 'energy', 'compact', 'leaf',
#            'brick', 'federal', 'sphere', 'coach', 'spine', 'parent', 'sleep']

# print('{}\nSEQUENCE LENGTH: {}\n{}'.format('-' * 30, args.seqlen, '-' * 30))

# # decades = list(np.arange(1910, 2001, 10))
# # decades = list(np.arange(1810, 1811, 10))

# for decade in np.arange(1910, 2009, 10):
#     collect_from_coha(targets,
#                       [decade],
#                       sequence_length=args.seqlen,
#                       pretrained_weights=args.bertdir,
#                       coha_dir=args.cohadir,
#                       output_path='{}/concat/usages_16_len{}_{}.dict'.format(args.outdir, args.seqlen, decade),
#                       buffer_size=args.buffer)

#     # # Save usages
#     # with open('{}/concat/usages_16_len{}_{}.dict'.format(args.outdir, args.seqlen, decade), 'wb') as f:
#     #     pickle.dump(usages, file=f)
#     # usages = None


In [None]:
# Read one pickled object back from the test output file.
# NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
test_output_path = '/Users/gabriellachronis/Box Sync/src/cwr4lsc/test_output'
with open(test_output_path, 'rb') as file:
    data = pickle.load(file)