<a href="https://colab.research.google.com/github/grackner/dsc_261_final_project/blob/main/phi_text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Phi Text Generation

url: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct?library=transformers

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import uuid
import pandas as pd
import kagglehub
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Load the Phi-3 mini instruct checkpoint: tokenizer first, then the
# causal-LM weights in half precision. Both calls opt in to the
# repository's custom code via trust_remote_code.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    dtype='float16',
)

In [None]:
# Pick CUDA when it is available, otherwise stay on the CPU, and move the
# model's parameters onto that device.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
# Smoke test: send one short chat prompt through the model and print the reply.
messages = [{"role": "user", "content": "Who is Michael Jordan?"}]

# Render the chat template straight to tensors placed on the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=100, use_cache=False)

# Decode only the tokens that follow the prompt in the first returned sequence.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

In [None]:
# Empty results table: one row per generated article, keyed by a uuid string.
PHI_COLUMNS = ['uuid', 'topic', 'generated_article']
phi_df = pd.DataFrame(columns=PHI_COLUMNS)

In [None]:
# Candidate article topics in "Section - Subsection" form. Topics are sampled
# uniformly from this list, so every entry must appear exactly once; a repeated
# entry would be drawn proportionally more often.
topics = [
    'US - Crime + Justice',
    # World
    'World - Africa', 'World - Americas', 'World - Asia', 'World - Australia', 'World - China',
    'World - Europe', 'World - India', 'World - Middle East', 'World - United Kingdom',
    # Politics
    'Politics - CNN Polls', 'Politics - Elections',
    # Business
    'Business - Tech', 'Business - Media', 'Business - Markets', 'Business - Pre-markets',
    'Business - After-Hours', 'Business - Investing', 'Business - Markets Now',
    # Health
    'Health - Fitness', 'Health - Food', 'Health - Sleep', 'Health - Mindfulness', 'Health - Relationships',
    # Entertainment
    'Entertainment - Movies', 'Entertainment - Television', 'Entertainment - Celebrity',
    # Tech
    'Tech - Innovate', 'Tech - Foreseeable Future', 'Tech - Innovative Cities',
    # Style
    'Style - Arts', 'Style - Design', 'Style - Fashion', 'Style - Architecture', 'Style - Luxury', 'Style - Beauty',
    # Travel
    'Travel - Destinations', 'Travel - Food & Drink', 'Travel - Lodging and Hotels', 'Travel - News',
    # Sports
    'Sports - Pro Football', 'Sports - College Football', 'Sports - Basketball', 'Sports - Baseball',
    'Sports - Soccer', 'Sports - Olympics', 'Sports - Hockey',
    # Science
    'Science - Space', 'Science - Life', 'Science - Medicine', 'Science - Climate', 'Science - Solutions',
    'Science - Weather',
]

In [None]:
def generate_article(topic):
  """Generate one synthetic news article about ``topic`` with the loaded model.

  Builds a single-turn chat prompt asking for a CNN/DailyMail-style article,
  samples a continuation from the module-level ``model`` and returns only the
  newly generated text.

  Args:
    topic: Subject of the article; interpolated into the prompt as-is.

  Returns:
    The generated article as a string, with special tokens removed and
    leading/trailing whitespace stripped.
  """
  messages = [
    {
        "role": "user",
        "content": f"""Write a full news article in the style of CNN or DailyMail.
                    The story should sound realistic, factual, and human-written.
                    Use natural journalistic language with short and medium-length sentences.
                    Start with a strong lead paragraph summarizing who, what, where, and when.
                    Then expand with quotes, context, background, and a final paragraph about next steps or reactions.
                    Include realistic numbers, dates, and locations.
                    The article should be about {topic}.
                    Add 1–3 short quotes attributed to plausible people (officials, witnesses, or experts).
                    Use neutral tone — no opinions, exaggeration, or bullet points.
                    Output only the article text (no headline, no lists, no explanation, no “to summarize”).
                    End cleanly after several paragraphs.
                  """
    },
  ]
  inputs = tokenizer.apply_chat_template(
          messages,
          add_generation_prompt=True,
          tokenize=True,
          return_dict=True,
          return_tensors="pt",
  ).to(model.device)

  # Sampling (rather than greedy decoding) so repeated calls for the same topic
  # give different articles.
  # NOTE(review): use_cache=False is kept from the original; presumably a
  # workaround for a cache incompatibility in the remote-code model — confirm
  # before enabling the KV cache.
  outputs = model.generate(
      **inputs,
      max_new_tokens=750,
      use_cache=False,
      do_sample=True,
      temperature=0.9,
      top_p=0.95,
      top_k=50,
  )

  # Keep only the tokens that follow the prompt in the first returned sequence.
  prompt_length = inputs["input_ids"].shape[-1]
  generated_ids = outputs[0][prompt_length:]

  # Drop special tokens so chat/end-of-sequence markers do not end up inside
  # the stored article text.
  response = tokenizer.decode(generated_ids, skip_special_tokens=True)
  return response.strip()

In [None]:
# Generate articles
rng = np.random.default_rng()
n = 4 # Number of datapoints to create

# Collect rows in a list and build the DataFrame once at the end, instead of
# enlarging the frame one cell at a time with .loc.
records = []
for i in range(n):  # range(n), not range(n + 1), so exactly n rows are created
  print(i)
  # Pick a topic uniformly at random
  topic = topics[int(rng.integers(low=0, high=len(topics)))]
  records.append({
      'uuid': str(uuid.uuid4()),  # unique identifier for the row
      'topic': topic,
      'generated_article': generate_article(topic),
  })

# Passing columns explicitly keeps the schema even when n == 0.
phi_df = pd.DataFrame(records, columns=['uuid', 'topic', 'generated_article'])

In [None]:
# Preview the first five generated rows
phi_df.head(n=5)

In [None]:
# Show the full text of the article stored under index label 3
phi_df.at[3, 'generated_article']

In [None]:
# Write the generated articles to a CSV file in the working directory
phi_df.to_csv(path_or_buf="phi_outputs.csv")

In [None]:
## Load dataset
# path = kagglehub.dataset_download("gowrishankarp/newspaper-text-summarization-cnn-dailymail")

# print("Path to dataset files:", path)

# train_df = pd.read_csv(path + "/cnn_dailymail/train.csv")
# test_df = pd.read_csv(path + "/cnn_dailymail/test.csv")
# val_df = pd.read_csv(path + "/cnn_dailymail/validation.csv")

In [None]:
## Loop through the train dataset
# for index, row in train_df.head(1).iterrows():
#   print(index)
#   # Get article
#   article = row["article"]
#   # Cut down to 100 words
#   article = " ".join(article.split()[:100])
#   # Format the query
#   messages = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "text", "text": f"Write a news article do not include a title"}
#             # {"type": "text", "text": f"Write an article with a similar style to the following article example from CNN: {article}"}
#         ]
#     },
#   ]
#   # Store query in df # TODO: Should happen in data cleaning?
#   print(article)
#   train_df.loc[index, 'query_article'] = messages[0]['content'][0]['text']
#   ## Run query through inference
#   inputs = processor.apply_chat_template(
#     messages,
#     add_generation_prompt=True,
#     tokenize=True,
#     return_dict=True,
#     return_tensors="pt",
#   ).to(model.device)

#   outputs = model.generate(**inputs, max_new_tokens=100)
#   response = processor.decode(outputs[0][inputs["input_ids"].shape[-1]:])
#   # Store answer in df
#   train_df.loc[index, 'model_output'] = response