In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from tqdm.auto import tqdm
import tensorflow as tf

import _pickle as pickle
def save(file,name, folder = ""):
    """Pickle an object to ``<name>.pickle``.

    Parameters
    ----------
    file : object
        The Python object to serialize (despite the name, not a file handle).
    name : str
        Base name of the output file, without the ``.pickle`` extension.
    folder : str, optional
        When non-empty, the file is written to ``./<folder>/<name>.pickle``;
        otherwise it is written to ``<name>.pickle`` in the working directory.
        The folder is not created here and must already exist.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The context manager guarantees the handle is flushed and closed, even if
    # pickling fails. The previous `outfile.close` (no parentheses) only
    # referenced the method and never called it.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile)
    
def load(name, folder = ""):
    """Load and return the object pickled in ``<name>.pickle``.

    Parameters
    ----------
    name : str
        Base name of the pickle file, without the ``.pickle`` extension.
    folder : str, optional
        When non-empty, the file is read from ``./<folder>/<name>.pickle``;
        otherwise from ``<name>.pickle`` in the working directory.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Notes
    -----
    Unpickling can execute arbitrary code; only load files you created
    yourself or otherwise trust.
    """
    if folder != "":
        path = './'+folder+'/'+name+'.pickle'
    else:
        path = name+'.pickle'
    # The context manager closes the handle; the previous `outfile.close`
    # (no parentheses) never actually called the method.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

## Data Parse

In [None]:
def get_metadata(path = 'arxiv-metadata-oai-snapshot.json'):
    """Lazily yield the raw lines of a JSON-lines metadata file.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the snapshot file name used in this
        notebook, so existing ``get_metadata()`` calls keep working.

    Yields
    ------
    str
        One line of the file at a time, including its trailing newline.
        Lines are not parsed here; callers decode them (e.g. ``json.loads``).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(path, 'r', encoding = 'utf-8') as f:
        yield from f

In [None]:
# Create the line generator; the file is only opened and read once it is iterated.
metadata = get_metadata()

In [None]:
# Fields copied from every metadata record, one output column per field.
keys = [
    'id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
    'report-no', 'categories', 'license', 'abstract', 'versions',
    'update_date', 'authors_parsed',
]

# Column-oriented accumulator: field name -> list of values, one per paper.
titles_tags_dict = dict((field, []) for field in keys)
count = 0  # not updated by the loop below
# `total` is only a progress-bar hint, not a limit on the number of lines read.
for raw_line in tqdm(metadata, total = 1700000):
    record = json.loads(raw_line)
    # Dict order follows `keys`, so fields are read in the same order as listed.
    for field, column in titles_tags_dict.items():
        column.append(record[field])

In [None]:
# Turn the per-field lists into a DataFrame with one row per parsed record.
df = pd.DataFrame(titles_tags_dict)

In [None]:
# Preview the first rows as a quick sanity check of the parsed columns.
df.head()

In [None]:
# Cache the parsed frame as data.pickle so the JSON does not need to be re-parsed.
save(df, 'data')

## Data Load

In [None]:
# Reload the cached frame written by the parsing section (data.pickle).
df = load('data')
# Shuffle every row. The seed is fixed so the row order is identical after a
# kernel restart: the batching cell further down starts from a fixed batch
# index, which is only meaningful if batch i always holds the same rows.
df = df.sample(n = df.shape[0], random_state = 0)

In [None]:
# Lookup table read from categ.csv (semicolon-separated, latin-1 encoded).
categ = pd.read_csv('categ.csv', sep = ';', encoding = 'latin-1')

# Map each value of the 'category' column to the 'description' on the same row;
# if a category appears more than once, the last row wins.
dcat = dict(zip(categ['category'], categ['description']))
    
def apply_categ(x):
    """Translate a space-separated string of category codes into descriptions.

    Each code is looked up in the module-level ``dcat`` mapping. Codes that
    are not present in ``dcat`` are skipped.

    Parameters
    ----------
    x : str
        Category codes separated by single spaces.

    Returns
    -------
    str
        The descriptions of the known codes, in input order, joined by
        ``"; "``. Empty string when no code is known.
    """
    # An explicit membership test replaces the previous bare `except:`, which
    # silently swallowed every error (including a missing `dcat`) rather than
    # only the unknown-code case.
    descriptions = [dcat[code] for code in x.split(' ') if code in dcat]
    return "; ".join(descriptions)
# Add a 'category' column holding the descriptions for each row's 'categories' codes.
df['category'] = df['categories'].apply(apply_categ)

In [None]:
# Conditioning text per paper: "<category descriptions> | <title>".
ins = [" | ".join(pair) for pair in zip(df['category'], df['title'])]
# Text to be generated per paper: the abstract, as a NumPy array.
ous = df['abstract'].values

In [None]:
from transformers import GPT2TokenizerFast
# Load the pretrained "gpt2" fast tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Use the end-of-sequence token as the padding token so that padded encoding
# (padding = 'max_length' in the batching cell) has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def write_to_tf_records(inputs, outputs, save_path):
    """Write tokenized examples to a TFRecord file.

    Parameters
    ----------
    inputs : mapping
        Indexed with 'input_ids', 'token_type_ids' and 'attention_mask'; each
        entry is iterated row by row.
    outputs : iterable
        Target rows, iterated in lockstep with the three input entries.
    save_path : str
        Destination path handed to ``tf.io.TFRecordWriter``.

    One ``tf.train.Example`` is written per zipped row, with the four int64
    features 'input_ids', 'token_type_ids', 'attention_mask' and 'outputs'.
    """
    def _int64_feature(values):
        # Wrap an iterable of integers as an int64-list feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    with tf.io.TFRecordWriter(save_path) as writer:
        rows = zip(inputs['input_ids'], inputs['token_type_ids'],
                   inputs['attention_mask'], outputs)
        for ids, ty, ma, out in rows:
            example = tf.train.Example(features=tf.train.Features(feature={
                'input_ids': _int64_feature(ids),
                'token_type_ids': _int64_feature(ty),
                'attention_mask': _int64_feature(ma),
                'outputs': _int64_feature(out),
            }))
            writer.write(example.SerializeToString())

In [None]:
max_len = 512
bs = 10000
# Ceiling division: number of batches needed to cover every row. The previous
# `df.shape[0] // bs + 1` produced one extra, empty batch whenever the row
# count was an exact multiple of `bs`, and the 2-D slicing below fails on an
# empty array.
nb = (df.shape[0] + bs - 1) // bs
# NOTE(review): batches before this index are skipped, presumably because they
# were written by an earlier run - set to 0 to (re)generate every batch.
first_batch = 23
for i in tqdm(range(first_batch, nb)):
    a = ins[bs*i:bs*(i+1)]
    b = ous[bs*i:bs*(i+1)]
    # Encode (conditioning text, abstract) pairs to max_len + 1 tokens; only the
    # second element (the abstract) is truncated when a pair is too long.
    toks = tokenizer.batch_encode_plus(list(zip(a, b)),
                                  add_special_tokens  = True, truncation = 'only_second',padding = 'max_length',
                                  max_length = max_len + 1, return_token_type_ids = True, verbose = True)

    token_ids = np.array(toks['input_ids'])

    # Model inputs are the first max_len positions of each encoded row ...
    inputs = {
        'input_ids' : token_ids[:, :max_len].astype('int32'),
        'token_type_ids' : np.array(toks['token_type_ids'])[:, :max_len].astype('int32'),
        'attention_mask' : np.array(toks['attention_mask'])[:, :max_len].astype('int32')
    }

    # ... and the targets are the same ids shifted left by one position, so
    # outputs[:, t] is the token that follows inputs['input_ids'][:, t].
    outputs = token_ids[:,1:].astype('int32')

    print(inputs['input_ids'].shape)

    write_to_tf_records(inputs, outputs, './tf_records/batch_'+str(i)+'.tfrecord')

In [None]:
def _parse_function(record):
    """Decode one serialized example into a ``(features, target)`` pair.

    The record is parsed as four fixed-length int64 features of length 512:
    'input_ids', 'token_type_ids', 'attention_mask' and 'outputs'.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a dict with the keys 'input_ids',
        'attention_mask' and 'token_type_ids', and ``y`` is the 'outputs'
        feature.
    """
    max_len = 512
    stored_features = ('input_ids', 'token_type_ids', 'attention_mask', 'outputs')
    feature_description = {
        feature: tf.io.FixedLenFeature(shape = (max_len), dtype = tf.int64)
        for feature in stored_features
    }

    parsed = tf.io.parse_single_example(record, feature_description)

    X = {
        feature: parsed[feature]
        for feature in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    y = parsed['outputs']
    return X, y

def get_dataset( gcs_pattern, batch_size = 256):
    """Build a repeating, batched ``tf.data`` pipeline over TFRecord files.

    Parameters
    ----------
    gcs_pattern : str
        Glob pattern, expanded with ``tf.io.gfile.glob``, selecting the
        record files to read.
    batch_size : int, optional
        Number of parsed examples per batch (default 256).

    Returns
    -------
    tf.data.Dataset
        Records parsed with ``_parse_function``, repeated indefinitely,
        batched, and prefetched with an automatically tuned buffer.
    """
    record_files = tf.io.gfile.glob(gcs_pattern)
    pipeline = (
        tf.data.TFRecordDataset(record_files)
        .map(_parse_function, num_parallel_calls=8)
        .repeat()
        .batch(batch_size)
    )
    return pipeline.prefetch(tf.data.experimental.AUTOTUNE)
      

## Generation

In [2]:
from transformers import GPT2TokenizerFast
# Load the pretrained "gpt2" fast tokenizer for the generation section.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
# Hyperparameters. `batch_size` and `lr` are not used in this cell.
batch_size = 64
# batch_size = 4
# NOTE(review): sequence length is 128 here, whereas the TFRecord writing and
# parsing cells use 512 - confirm which length the checkpoint was trained with.
max_len = 128
lr = 0.000001

# Symbolic Keras inputs, one per tensor passed to the GPT-2 model.
# Creation order determines the auto-generated layer names (input_1, input_2, ...).
inputs = {
        'input_ids' : tf.keras.Input(shape = (max_len,), dtype = tf.int32),
        'attention_mask' : tf.keras.Input(shape = (max_len,), dtype = tf.int32),
        'token_type_ids' : tf.keras.Input(shape = (max_len,), dtype = tf.int32)
    }
# Pretrained GPT-2 with a language-modeling head.
decoder = TFGPT2LMHeadModel.from_pretrained('gpt2')
x = decoder(input_ids = inputs['input_ids'], attention_mask = inputs['attention_mask'], token_type_ids = inputs['token_type_ids'])
# Keep only the first element of the model output (presumably the LM logits - TODO confirm).
x = x[0]
# Wrap the decoder in a Keras functional model; `decoder` is the same underlying
# object, so weights loaded through `model` are also used by `decoder`.
model = tf.keras.Model(inputs, x)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


In [4]:
# Print the layer-by-layer structure and parameter counts of the wrapped model.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tfgp_t2lm_head_model (TFGPT2LMH TFCausalLMOutputWith 124439808   input_1[0][0]                    
                                                                 input_2[0][0]         

In [5]:
# Restore weights from a local checkpoint into `model` (presumably the fine-tuned weights - TODO confirm).
model.load_weights('./checkpoint/gpt_1947_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x20551ce99c8>

In [9]:
# Prompts follow the "<category description> | <text>" pattern.
# Alternative prompts - uncomment exactly one to use it instead of the active one below.
# prompt_text = ["Computation and Language (Computational Linguistics and Natural Language) | Topic Recognition and Understanding for zero Shot Transformers, a sentence pair classification model based on transformers language models to perform aspect based sentiment analysis."]
# prompt_text = ["Computation and Language (Computational Linguistics and Natural Language) | Transformers are a new technique in NLP. At Ekimetrics we use Transformers extensively and have pushed the boudaries of the state of the art NLP for business. We propose a new technoque called TRUST (Topic Recognition and Understanding with zero Shot Transformers) to recognize topics in social media texts using Zero Shot Learning and masked self supervised learning (BERT)."]
# prompt_text = ["Computation and Language (Computational Linguistics and Natural Language) | We propose a new state of the art framework to perform topic based sentiment analysis tasks, based on a sentence pair classification Transformers architecture that we named TRUST (Topic Recognition and Understanding for zero Shot Transformers)."]

# Active prompt. (Previously an earlier `prompt = [...]` assignment was
# immediately overwritten by this one and therefore had no effect.)
prompt_text = ["Atomic Physics | We introduce a novel way to dissociate carbon dioxyde into dioxygen and carbon monnoxyde by stimulating the radial excitation of molecule."]

# Token ids of the prompt as a TensorFlow tensor; stored under the original
# name `prompt`, which previously also ended up holding the ids.
prompt = tokenizer.batch_encode_plus(prompt_text, add_special_tokens = True, return_tensors="tf")['input_ids']

# Sample two continuations of up to 400 tokens. `pad_token_id` is passed
# explicitly (the EOS id, which is what the library falls back to anyway) to
# avoid the "Setting `pad_token_id` ..." warning; all other arguments are unchanged.
generated = decoder.generate(input_ids=prompt, max_length=400, min_length=12,
                     do_sample=True, early_stopping=None, num_beams=5,
                     temperature=3, top_k=3, top_p=None, repetition_penalty=1.1,
                     bad_words_ids=None, bos_token_id=None,
                     pad_token_id=tokenizer.eos_token_id,
                     eos_token_id=None, length_penalty=None, no_repeat_ngram_size=None,
                     num_return_sequences=2, attention_mask=None,
                     decoder_start_token_id=None, use_cache=None)

# Convert each generated id sequence back to text.
decoded = tokenizer.batch_decode(generated)

for elt in decoded:
    print(elt)
    print('\n')

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


Atomic Physics | We introduce a novel way to dissociate carbon dioxyde into dioxygen and carbon monnoxyde by stimulating the radial excitation of molecule.
  This approach is able to produce high-temperatures, which can be used in
quantum computing applications such as photovoltaics and nanoscale sensing.
  In this paper we propose a new method to disentangle molecules from their chemical counterparts
and show that it has significant advantages over conventional methods.
  The main advantage of this technique is its ability to generate low temperatures at room temperature.
We also demonstrate that it does not require an external magnetic field for
dissipations to occur. Our results are consistent with previous work showing the
effectiveness when applied to quantum computer systems.
  It should also be pointed out that these findings do not necessarily mean that there
is no physical connection between atoms or nucleotides; rather, they suggest
that one may have different reactions depen