In [1]:
!pip install -q transformers datasets accelerate bitsandbytes

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch




Read model 1 outputs and combine them:

In [2]:
import os
import pandas as pd

# Directory containing the CSV files
directory = '../model1/outputs/'

# List all CSV files in the directory
csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

# Read and concatenate all CSV files
combined_df = pd.concat([pd.read_csv(os.path.join(directory, file), on_bad_lines='skip', engine='python') for file in csv_files])

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(os.path.join(directory, 'full_combined_outputs_model1.csv'), index=False)

Load model 1 outputs and fine-tune model 1 again (replace setting:)

In [3]:
import pandas as pd
data = pd.read_csv("../model1/outputs/full_combined_outputs_model1.csv", on_bad_lines='skip', engine='python')

In [4]:
data

Unnamed: 0,id,generated_text
0,0ae533a8710b8ce93c1c66ba1908dd1d88e6e985,"When ""American Idol"" season 12 premiered lower..."
1,e8950caed2a0bc2fb2138cef8ff1a15100ced40e,A Kentucky museum devoted to Corvettes is read...
2,caaf22ada96d8cb2c22cf541093bed4d4d5a3ebf,The U.S. Air Force began grounding a third of ...
3,36904449f8ba24e664edd0deed363f973cb0c821,A suggestive letter from an inmate and a box o...
4,5e592e4bd81a0c366c3276a264b0ac9391339183,CBS suspended basketball analyst Greg Anthony ...
...,...,...
99995,2b3dfad3b66774711c308576ff22e2af6853d37c,Former President George W. Bush on Thursday re...
99996,6b2f176b7dd1f87318042417960e07aeb041b0e7,Children suffering from attention-deficit hype...
99997,b17fdb4b170f7533d3826dfab5c3fbe515907c0f,Latino voters strongly support President Barac...
99998,63b4933e88fecf09a67a965e76fcd9db633fa291,Despite being ridiculed when they first appear...


In [5]:
train_data = data[['generated_text']]

In [6]:
from datasets import Dataset
dataset = Dataset.from_pandas(train_data, preserve_index=False)

In [7]:
def keep_article(example):
    return {"text": example["generated_text"].strip()}

dataset = dataset.map(keep_article, remove_columns=dataset.column_names)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [8]:
model_dir = "../model1/opt125m-news-model-base"
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

In [9]:
def tokenize(example):
    result = tokenizer(example["text"], truncation=True, max_length=1024, padding="max_length")
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [10]:
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]

In [11]:
model = AutoModelForCausalLM.from_pretrained(model_dir)
model.config.pad_token_id = tokenizer.pad_token_id

In [12]:
training_args = TrainingArguments(
    output_dir="../model2a",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    lr_scheduler_type="cosine",
    learning_rate=1e-4,
    warmup_steps=100,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    logging_steps=100,
    eval_steps=500,
    save_total_limit=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
500,1.0238,0.968247
1000,1.0009,0.950559
1500,0.9867,0.938367
2000,0.9807,0.927481
2500,0.9757,0.920703
3000,0.957,0.911825
3500,0.9534,0.901576
4000,0.9493,0.895112
4500,0.9351,0.886706
5000,0.9345,0.879466


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=16875, training_loss=0.847552766248915, metrics={'train_runtime': 43110.3549, 'train_samples_per_second': 6.263, 'train_steps_per_second': 0.391, 'total_flos': 1.4109769728e+17, 'train_loss': 0.847552766248915, 'epoch': 3.0})

In [14]:
trainer.save_model("../model2a")
tokenizer.save_pretrained("../model2a")

('../model2a\\tokenizer_config.json',
 '../model2a\\special_tokens_map.json',
 '../model2a\\vocab.json',
 '../model2a\\merges.txt',
 '../model2a\\added_tokens.json',
 '../model2a\\tokenizer.json')

Inference Time

In [15]:
# use T4?
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

save_dir = "../model2a"

# Load model
model = AutoModelForCausalLM.from_pretrained(
    save_dir,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(save_dir)

`torch_dtype` is deprecated! Use `dtype` instead!


In [16]:
def generate_text(input_text, max_new_tokens=700, temperature=0.5):
  input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

  output = model.generate(
      input_ids,
      max_new_tokens=max_new_tokens,
      do_sample=True,
      temperature=temperature
  )

  return tokenizer.decode(output[0], skip_special_tokens=True)

In [17]:
generate_text("The government has announced a new policy on renewable energy", 750, 0.8)

'The government has announced a new policy on renewable energy -- a move that could save the nation $13 billion\nThe U.S. Environmental Protection Agency announced a major overhaul of its energy policy this week, with a major overhaul aimed at reducing greenhouse gas emissions. The EPA\'s new policy calls for renewable energy to be used as a source of energy by 2020 and 2030. "We are committed to making sure we have an effective strategy to achieve our goal," said Michael Brown, director of the agency\'s Office of Management and Budget. "This is a significant step forward in addressing climate change." The new policy also includes a pledge not to increase production of coal or oil over the next decade. It will require all states to report their carbon dioxide levels annually to the EPA, which has already begun implementing the changes. The changes come after years of debate between environmentalists and government officials about how best to reduce emissions. In 2010, the EPA issued a 