In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
import gc

In [None]:
import os
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

In [2]:
torch.manual_seed(42)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-large').cuda()
model.resize_token_embeddings(len(tokenizer))
class CustomDataset(Dataset):
    def __init__(self, prompt,instruction,desired_output, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for p,i,d in zip(prompt,instruction,desired_output):
            p = str(p)
            i = str(i)
            d = str(d)
            prompt = '# '+p.lstrip('\n') + '\n# '+i + '\n\"\"\"' + d
            encodings_dict = tokenizer('<|startoftext|>' + prompt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/3.02G [00:00<?, ?B/s]

In [3]:
df = pd.read_csv('/home/delta/Downloads/netflix_titles.csv')
prompt = df['director']
instruction = ['generate a description for the above director' for i in range(len(prompt))]
target = df['description']
max_length = max([len(tokenizer.encode(str(t))) for t in target]) + \
             max([len(tokenizer.encode(str(i))) for i in instruction]) + \
             max([len(tokenizer.encode(str(p))) for p in prompt])
dataset = CustomDataset(prompt,instruction,target,tokenizer,max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
gc.collect()
torch.cuda.empty_cache()
training_args = TrainingArguments(output_dir='./results', 
                                  num_train_epochs=1, 
                                  logging_steps=100, 
                                  save_steps=5000,
                                  per_device_train_batch_size=1, 
                                  per_device_eval_batch_size=1,
                                  warmup_steps=10, 
                                  weight_decay=0.05, 
                                  logging_dir='./logs', 
                                  report_to = 'none',
                                  deepspeed ='ds_config_zero3.json')


In [4]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [None]:
%%bash
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT

In [5]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 7926
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 7926


Step,Training Loss
100,1.2815
200,0.9059
300,0.9293
400,0.9096
500,0.932
600,0.9304
700,0.9403
800,0.9155
900,0.896
1000,0.9183


Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json
Model weights saved in ./results/checkpoint-5000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7926, training_loss=0.8866011417185812, metrics={'train_runtime': 1778.5932, 'train_samples_per_second': 4.456, 'train_steps_per_second': 4.456, 'total_flos': 4817414582323200.0, 'train_loss': 0.8866011417185812, 'epoch': 1.0})

In [6]:
prompt = 'Kirsten Johnson'
instruction = 'generate a description for the above director'
prompt = '# '+prompt.lstrip('\n') + '\n# '+instruction + '\n\"\"\"'
prompt_start = prompt.rfind(instruction)+len(instruction)+4
print(prompt)

# Kirsten Johnson
# generate a description for the above director
"""


In [7]:
generated = tokenizer(f"<|startoftext|>{prompt}", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
    output = tokenizer.decode(sample_output, skip_special_tokens=True,
                              clean_up_tokenization_spaces=True)
    output = output[prompt_start:]
    print(output)
    print('')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The tragic death at an apartment complex impacts not just residents but their landlords as she navigates three husbands, one ex on the run from police harassment.

In a fictional case involving a criminal and a gang at a local community college, four teen and one classmate battle a vicious and ruthless group that refuses a truce.

After his daughter abruptly drops off the farm because she's taking birth-injection drugs or is dead, a mother-to-be is sent on a mad cap that gives her two extra lives.

As the young lives their parents – along with all living creatures on all 4 corners of Earth, is threatened. She seeks solace with magical adventures in the small, mysterious island home.

A talented but ambitious teen from Boston struggles to juggle her dreams to play soccer or fall in with a criminal cartel – but soon ends the team she falls under to.

An aspiring model agrees to dance, where her talent doesn't follow when the photographer whose job she wants to get a photo sets her up for