## study the preprocess code here

In [1]:
#  Copyright (c) Microsoft Corporation. 
#  Licensed under the MIT license. 
"""
preprocess input data into feature and stores binary as python shelve DB
each chunk is gzipped JSON string
"""
import argparse
import gzip
import json
import subprocess as sp
import shelve
import os
from os.path import dirname, exists, join

import torch
from lsp_model import GPT2Tokenizer
from tqdm import tqdm

from env import END_OF_TEXT_TOKEN
from gpt2_training.train_utils import InputFeatures_train as InputFeatures


def _get_file_len(corpus):
    n_line = int(sp.check_output(f"wc -l {corpus}".split(),
                                 universal_newlines=True).split()[0])
    return n_line


def _norm_text(text):
    w, *toks = text.strip().split()
    try:
        w = float(w)
    except Exception:
        toks = [w] + toks
        w = 1.0
    return w, ' '.join(toks)


def _get_inputs_from_text(text, tokenizer):
    srcs, tgt = text.strip().split('\t')
    weights = []
    inputs = []
    for src in srcs.split(' EOS '):
        src_weight, src = _norm_text(src)
        context_id = tokenizer.encode(src)
        weights.append(src_weight)
        inputs.append(context_id)
    tgt_weight, tgt = _norm_text(tgt)
    if tgt_weight != 0:
        response_id = tokenizer.encode(tgt)
        weights.append(tgt_weight)
        inputs.append(response_id)
    return weights, inputs


def _make_features(id_, weights, inputs, tokenizer, max_len):
    end_of_text_id = tokenizer.encoder[END_OF_TEXT_TOKEN]
    features = []
    sents = []
    ws = []
    len_ = 0
    i = 0
    if True:
        if len(weights)==2 and weights[-1]>0:
            persona_id=int(weights[-1])
    for ids, w in zip(inputs, weights):
        if len(ids) > max_len:
            if len(sents) >= 2:
                if True:

                    feat = _make_feature(id_ + i, sents, ws, end_of_text_id,persona_id)
                else:
                    feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
                if feat is not None:
                    features.append(feat)
                    i += 1
            len_ = 0
            sents = []
            ws = []
            continue
        elif len_ > max_len:
            if True:
                feat = _make_feature(id_ + i, sents, ws, end_of_text_id,persona_id)
            else:
                feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
            if feat is not None:
                features.append(feat)
                i += 1
            len_ = len(sents[-1]) + 1
            sents = sents[-1:]
            ws = ws[-1:]
        len_ += (len(ids) + 1)
        sents.append(ids)
        ws.append(w)
    if len(sents) >= 2:
        if True:
            feat = _make_feature(id_ + i, sents, ws, end_of_text_id,persona_id)
        else:
            feat = _make_feature(id_ + i, sents, ws, end_of_text_id)

        if feat is not None:
            features.append(feat)

    return features


def _make_feature(id_, sents, ws, eos,persona_id=None):
    if all(w == 0 for w in ws[1:]):
        return None
    input_ids = [i for s in sents for i in s+[eos]][:-1]
    lm_labels = []
    weights = []
    token_type_ids = []  # this becomes round ids
    for i, (s, w) in enumerate(zip(sents, ws)):
        if i == 0:
            lm_labels += [-1] * len(s)
            weights += [0.0] * len(s)
            token_type_ids += [0] * len(s)
            continue

        token_type_ids += [i] * (len(s) + 1)
        if w == 0.0:
            lm_labels += [-1] * (len(s) + 1)
            weights += [0.0] * (len(s) + 1)
        else:
            lm_labels += (s + [eos])
            weights += [w] * (len(s) + 1)

    # handle trailing -1's
    i = len(lm_labels) - 1
    while i >= 0:
        if lm_labels[i] != -1:
            break
        i -= 1
    input_ids = input_ids[:i+1]
    lm_labels = lm_labels[:i+1]
    weights = weights[:i+1]
    token_type_ids = token_type_ids[:i+1]

    # pad to multiples of 8
    while len(input_ids) % 8 != 0:
        input_ids.append(0)
        token_type_ids.append(0)
        lm_labels.append(-1)
        weights.append(0.0)

    position_ids = list(range(len(input_ids)))
    assert (len(input_ids) == len(position_ids) == len(token_type_ids)
            == len(lm_labels) == len(weights))
    assert len(input_ids) % 8 == 0
    if len(input_ids) == 0:
        import pdb
        pdb.set_trace()
    feature = InputFeatures(id_, input_ids, position_ids, token_type_ids,
                            lm_labels, weights,persona_id=persona_id)
    return feature


def main(args):
    toker = GPT2Tokenizer.from_pretrained('gpt2')
    attrs = []
    if args.reverse:
        attrs.append('reverse')
    if args.two_turn:
        attrs.append('2turn')
    if attrs:
        db_path = (f'{args.corpus[:-4]}.{args.max_seq_len}len.'
                   f'{".".join(attrs)}.db/db')
    else:
        db_path = f'{args.corpus[:-4]}.{args.max_seq_len}len.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))
    with open(args.corpus, "r", encoding="utf-8") as reader, \
            shelve.open(db_path, 'n') as db:
        chunk = []
        n_chunk = 0
        n_example = 0
        for line in tqdm(reader, total=_get_file_len(args.corpus)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db[f'chunk_{n_chunk}'] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs = _get_inputs_from_text(line, toker)
                # at this point, the weights are still what we need. 
                
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
                if len(weights) < 2:
                    continue
                features = _make_features(n_example, weights, inputs,
                                          toker, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue
        # save last chunk
        db[f'chunk_{n_chunk}'] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))
    # save relevant information to reproduce
    meta = {'n_example': n_example,
            'chunk_size': args.chunk_size,
            'max_seq_len': args.max_seq_len,
            'reverse': args.reverse,
            'two_turn': args.two_turn}
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
    torch.save(toker, join(dirname(db_path), 'tokenizer.pt'))



In [2]:
toker = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
tokenizer=toker
text="0.0 what ' s it like to be attractive ? to like walk into a room and have girls actually looking at you ?	0 reddit is definitely the best place to ask this question"
weights, inputs = _get_inputs_from_text(text, tokenizer)

In [4]:
weights,inputs

([0.0],
 [[10919,
   705,
   264,
   340,
   588,
   284,
   307,
   10966,
   5633,
   284,
   588,
   2513,
   656,
   257,
   2119,
   290,
   423,
   4813,
   1682,
   2045,
   379,
   345,
   5633]])

In [5]:
n_example=0

features = _make_features(n_example, weights, inputs,toker, 128)[0]

IndexError: list index out of range

In [6]:

print('conv_id:',end='\t')
print(features.conv_id)

print('input_ids:',end='\t')
print(features.input_ids )

print('position_ids:',end='\t')
print(features.position_ids )

print('token_type_ids:',end='\t')
print(features.token_type_ids )

print('lm_labels:',end='\t')
print(features.lm_labels )

print('weights:',end='\t')
print(features.weights )

print('persona_id:',end='\t')
print(features.persona_id )
print(features.input_len)


conv_id:	

NameError: name 'features' is not defined

In [38]:
text="0.0 if you could have one ' do over ' in your life , what would you do differently ?	34 find dif   ds     ways to fund myself - no matter how desperate ,f f f  there was always a better way ."
weights, inputs = _get_inputs_from_text(text, tokenizer)
n_example=0

features = _make_features(n_example, weights, inputs,toker, 128)[0]
print('weights:',end='\t')
print(features.weights )

weights:	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 0.0, 0.0, 0.0]


In [None]:
# check this one: 

#https://github.com/huggingface/transformers/tree/v0.6.2


In [9]:
tokens=[10919,   318,   262,  5290,  1693,   345,  1683,   750,  5633, 50256,
          7890, 22967,   764,     0,     0,     0]
tokens=[10919,   318,   262,  5290,  1517,  2130,   468, 25623,   284,   345,
          5633, 50256, 46248, 25623,   284,  5170,  2130,   284,   502,   736,
           287,   262,  4317,    82,   618,  1312,   373,   379,   257,  2318,
           588, 16839,   812,  2084,   764, 50256]
print(toker.decode(tokens))

what is the worst thing someone has confessed to you ?<|endoftext|>someone confessed to killing someone to me back in the 70s when i was at a bar like digit years ago .<|endoftext|>


In [24]:
len(tokens)

56

In [25]:
len(token_ids)

34

In [22]:
token_ids=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
token_ids[15]

0

In [56]:
b=torch.tensor([[[1,1,1,1],[1,1,1,1],[1,1,1,1]],[[1,1,1,1],[1,1,1,1],[1,1,1,1]]]) # 2x3x4
a=torch.tensor([[0,1,1],[1,1,0]]).reshape()


In [57]:
a.size(),b.size()

(torch.Size([2, 3, 1]), torch.Size([2, 3, 4]))

In [59]:
a.dtype

torch.int64

In [61]:
a.type(torch.float16)

tensor([[[0.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [0.]]], dtype=torch.float16)

In [58]:
a * b

tensor([[[0, 0, 0, 0],
         [1, 1, 1, 1],
         [1, 1, 1, 1]],

        [[1, 1, 1, 1],
         [1, 1, 1, 1],
         [0, 0, 0, 0]]])

In [12]:
#  Copyright (c) Microsoft Corporation. 
#  Licensed under the MIT license. 
'''
 * @Desc: train GPT2 from scratch/ fine tuning.
          Modified based on Huggingface GPT-2 implementation
'''

import json
import os
import sys
import argparse
import logging
import time
import tqdm
import datetime
import torch

import numpy as np

from os.path import join
from torch.distributed import get_rank, get_world_size

from lsp_model import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, Adam
from gpt2_training.train_utils import load_model, boolean_string, set_lr, get_eval_list_same_length
from gpt2_training.eval_utils import eval_model_loss

from data_loader import BucketingDataLoader, DynamicBatchingLoader, DistributedBucketingDataLoader


from gpt2_training.distributed import all_reduce_and_rescale_tensors, all_gather_list


logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
logger = logging.getLogger(__name__)

INF = 100000000
CACHE_EMPTY_STEP = 10000
EVAL_STEP = 100000

#########################################################################
# Prepare Parser
##########################################################################

parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', type=str,
                    help='pretrained model name or path to local checkpoint')
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--max_seq_length", type=int, default=128)

parser.add_argument("--skip_eval", action='store_true',
                    help='If true, skip evaluation.')
parser.add_argument("--init_checkpoint", type=str)
parser.add_argument("--train_input_file", type=str)
parser.add_argument("--eval_input_file", type=str)
parser.add_argument("--continue_from", type=int, default=0)

parser.add_argument("--train_batch_size", type=int, default=4,
                    help="batch size now means per GPU per step")
parser.add_argument("--gradient_accumulation_steps", type=int, default=2,
                    help="to increase effective batch size "
                         "and reduce synchronization")
parser.add_argument("--eval_batch_size", type=int, default=4)
parser.add_argument("--learning_rate", type=float, default=1e-5)
parser.add_argument("--num_optim_steps", type=int, default=1000000,
                    help="new API specifies num update steps")
parser.add_argument("--valid_step", type=int, default=10000,
                    help="how many optim steps between validations")
parser.add_argument("--warmup_proportion", type=float, default=0.1)
parser.add_argument("--warmup_steps", type=int, default=16000)

parser.add_argument("--normalize_data", type=boolean_string, default=True)
parser.add_argument("--fp16", type=boolean_string, default=True)
parser.add_argument("--lr_schedule", type=str,
                    choices=['noam', 'noamwd', 'BERT', 'None'], default='noam')
parser.add_argument("--loss_scale", type=float, default=0)
parser.add_argument("--no_token_id", type=boolean_string, default=True)

parser.add_argument("--output_dir", type=str)
parser.add_argument("--log_dir", type=str)
parser.add_argument('--pbar', type=boolean_string, default=True, help='turn on progress bar')

# distributed
parser.add_argument('--local_rank', type=int, default=-1,
                    help='for torch.distributed')
parser.add_argument('--config', help='JSON config file')


# do normal parsing
args = parser.parse_args()




usage: ipykernel_launcher.py [-h] [--model_name_or_path MODEL_NAME_OR_PATH]
                             [--seed SEED] [--max_seq_length MAX_SEQ_LENGTH]
                             [--skip_eval] [--init_checkpoint INIT_CHECKPOINT]
                             [--train_input_file TRAIN_INPUT_FILE]
                             [--eval_input_file EVAL_INPUT_FILE]
                             [--continue_from CONTINUE_FROM]
                             [--train_batch_size TRAIN_BATCH_SIZE]
                             [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
                             [--eval_batch_size EVAL_BATCH_SIZE]
                             [--learning_rate LEARNING_RATE]
                             [--num_optim_steps NUM_OPTIM_STEPS]
                             [--valid_step VALID_STEP]
                             [--warmup_proportion WARMUP_PROPORTION]
                             [--warmup_steps WARMUP_STEPS]
                             [--normalize_dat

SystemExit: 2

In [10]:
import torch

In [10]:
torch.zeros([5],dtype=torch.long, device='cuda:0')

tensor([0, 0, 0, 0, 0], device='cuda:0')

In [None]:
token_persona_emb_ids=torch.ones(10,dtype=torch.long)
token_persona_emb_ids = persona_ids.reshape((-1,1)) * token_persona_emb_ids 


In [11]:
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X_train)

X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [23]:
np.mean(X_scaled)

4.9343245538895844e-17

In [24]:
np.std(X_scaled)

1.0

In [13]:
import numpy as np
from sklearn.preprocessing import StandardScaler

data = np.random.randint(25, size=(4, 4))
standardized_data = StandardScaler().fit_transform(data)



In [14]:
data

array([[ 8,  6,  8, 15],
       [17, 23, 22,  9],
       [12, 23,  3, 10],
       [22,  6, 16,  8]])

In [15]:
standardized_data

array([[-1.28280871, -1.        , -0.58275249,  1.67125804],
       [ 0.4276029 ,  1.        ,  1.33690278, -0.55708601],
       [-0.52262577,  1.        , -1.26834366, -0.18569534],
       [ 1.37783158, -1.        ,  0.51419338, -0.92847669]])

In [22]:
StandardScaler().fit_transform(X_train[2,:].reshape(-1, 1))

array([[ 0.        ],
       [ 1.22474487],
       [-1.22474487]])

In [17]:
standardized_data

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])