<a href="https://colab.research.google.com/github/grackner/dsc_261_final_project/blob/grackner/phi_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Phi Text Generation

Model card: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct?library=transformers

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import uuid
import pandas as pd
import kagglehub
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the Phi-3 mini instruct tokenizer and the causal-LM weights (float16).
# NOTE(review): trust_remote_code=True executes code shipped with the model
# repository — confirm it is still required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    dtype='float16',
    trust_remote_code=True,
)

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Move the model onto the selected device
model.to(device)

In [None]:
# Smoke test: send a single chat prompt through the model and print the reply
messages = [{"role": "user", "content": "Who is Michael Jordan?"}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(model.device)

outputs = model.generate(max_new_tokens=100, use_cache=False, **inputs)
# Slice off the prompt tokens so only the newly generated tokens are decoded
print(
    tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:]
    )
)

In [None]:
# Empty frame that collects one generated article (plus its identifier) per row
phi_df = pd.DataFrame(data=None, columns=["uuid", "generated_article"])

In [None]:
def generate_article(max_new_tokens=100, temperature=0.9, top_p=0.95, top_k=50):
  """Sample one news-article body from the loaded chat model.

  Builds a single-turn chat prompt, runs sampled generation with the
  module-level `model` and `tokenizer`, and returns only the newly
  generated text (the prompt is not included).

  Args:
    max_new_tokens: Upper bound on the number of tokens to generate.
    temperature: Sampling temperature passed to `model.generate`.
    top_p: Nucleus-sampling threshold passed to `model.generate`.
    top_k: Top-k sampling cutoff passed to `model.generate`.

  Returns:
    The decoded continuation as a string, with special tokens removed.
  """
  messages = [
    {
        "role": "user",
        "content": "Write the body of a news article, do not include a headline or title. It can be about any topic"
    },
  ]
  inputs = tokenizer.apply_chat_template(
          messages,
          add_generation_prompt=True,
          tokenize=True,
          return_dict=True,
          return_tensors="pt",
  ).to(model.device)

  # NOTE(review): use_cache=False is kept from the original call; it is
  # presumably a workaround for a caching issue — confirm it is still needed.
  outputs = model.generate(
      **inputs,
      max_new_tokens=max_new_tokens,
      use_cache=False,
      do_sample=True,
      temperature=temperature,
      top_p=top_p,
      top_k=top_k,
  )
  # The returned sequence starts with the prompt tokens; keep only what follows.
  prompt_len = inputs["input_ids"].shape[-1]
  # skip_special_tokens=True keeps special tokens (e.g. end-of-turn markers)
  # out of the stored article text.
  response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
  return response

In [None]:
# Generate articles
n = 100 # Number of datapoints to create
# NOTE(review): generate_article() samples (do_sample=True) and no seed is set
# in this notebook, so every run produces different articles; call
# torch.manual_seed(...) before this loop if reproducibility is required.
# range(n) yields exactly n rows (labels 0..n-1); the earlier range(0, n + 1)
# produced n + 1 rows.
for i in range(n):
  print(i)
  # Add unique identifier for the row
  phi_df.loc[i, 'uuid'] = str(uuid.uuid4())
  # Rows are written as they are produced, so articles generated before an
  # interruption remain in phi_df.
  response = generate_article()
  phi_df.loc[i, 'generated_article'] = response

In [None]:
# Preview the first five rows of the generated dataset
phi_df.head(5)

In [None]:
# Show the full text of the article stored under index label 1
phi_df.loc[1, "generated_article"]

In [None]:
# Persist the generated articles to a CSV file in the working directory
phi_df.to_csv(path_or_buf="phi_outputs.csv")

In [None]:
## Load dataset (disabled): download the CNN/DailyMail summarization data from Kaggle and read its CSV splits
# path = kagglehub.dataset_download("gowrishankarp/newspaper-text-summarization-cnn-dailymail")

# print("Path to dataset files:", path)

# train_df = pd.read_csv(path + "/cnn_dailymail/train.csv")
# test_df = pd.read_csv(path + "/cnn_dailymail/test.csv")
# val_df = pd.read_csv(path + "/cnn_dailymail/validation.csv")

In [None]:
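## Loop through the train dataset (disabled draft: it calls `processor`, which no cell above defines; switch to `tokenizer` before re-enabling)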
# for index, row in train_df.head(1).iterrows():
#   print(index)
#   # Get article
#   article = row["article"]
#   # Cut down to 100 words
#   article = " ".join(article.split()[:100])
#   # Format the query
#   messages = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "text", "text": f"Write a news article do not include a title"}
#             # {"type": "text", "text": f"Write an article with a similar style to the following article example from CNN: {article}"}
#         ]
#     },
#   ]
#   # Store query in df # TODO: Should happen in data cleaning?
#   print(article)
#   train_df.loc[index, 'query_article'] = messages[0]['content'][0]['text']
#   ## Run query through inference
#   inputs = processor.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     tokenize=True,
#     return_dict=True,
#     return_tensors="pt",
#   ).to(model.device)

#   outputs = model.generate(**inputs, max_new_tokens=100)
#   response = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:])
#   # Store answer in df
#   train_df.loc[index, 'model_output'] = response