# M1L3 Screencasts

## M1L3SC1: What Can LLMs Do Today? Real Use Cases Across Providers

### Step 1: Setting Up Your Environment

In [None]:
#!pip install openai  #removed pip install command

from openai import OpenAI

import os

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the placeholder is only a fallback for quick demos.
api_key = os.environ.get("OPENAI_API_KEY") or 'your-openai-api-key'  # or replace the placeholder with your actual API key

# Shared client used by every OpenAI helper defined below.
client = OpenAI(api_key=api_key)

### Step 2: Text Summarization

In [None]:
def summarize_text(input_text: str,
                   model: str = "gpt-4o-mini") -> str:
    """
    Return a concise summary of `input_text` using a chat-completions model.

    Relies on the module-level `client`, which must be an instance of
    openai.OpenAI(), e.g. client = OpenAI().

    Args:
        input_text: The text to summarize.
        model: Name of the chat model to query.

    Returns:
        The model's summary with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": "You are a concise and accurate summarizer."},
            {"role": "user",
             "content": f"Summarize the following text:\n\n{input_text}"}
        ],
        temperature=0.3,        # low temperature keeps the summary close to the source
        max_tokens=150          # upper bound on the length of the generated summary
    )
    # `message.content` is optional in the API response; fall back to "" so
    # that `.strip()` can never be called on None.
    content = response.choices[0].message.content
    return (content or "").strip()


# Sample passage for the summarization demo, split across lines for readability;
# the adjacent literals are concatenated into one string.
long_text = (
    "Large language models (LLMs) are a type of artificial intelligence model "
    "that can understand, generate, and manipulate human language. "
    "They are trained on massive amounts of text data and are able to perform "
    "a wide variety of tasks, including text summarization, code generation, "
    "language translation, and logical reasoning. "
    "LLMs are becoming increasingly popular and are being used in a wide range "
    "of applications, such as chatbots, virtual assistants, and content "
    "creation tools. "
    "Some of the most popular LLMs include GPT-4, Claude, and Gemini."
)

# Summarize the passage with the default model and show the result.
summary = summarize_text(long_text)
print("Summary:", summary)


### Step 3: Coding Assistance

In [None]:
def calculate_factorial(n):
    """
    Return n! (the product 1 * 2 * ... * n) for a non-negative whole number.

    The product is computed iteratively, so large values of `n` do not hit
    Python's recursion limit.

    Args:
        n: A non-negative whole number (0 and 1 both give 1).

    Returns:
        The factorial of `n` as an int.

    Raises:
        ValueError: If `n` is negative or is not a whole number. With the
            recursive formulation such inputs never reach the base case.
    """
    if n < 0:
        raise ValueError("factorial is not defined for negative numbers")
    if n != int(n):
        raise ValueError("factorial is only defined for whole numbers")

    result = 1
    for factor in range(2, int(n) + 1):
        result *= factor
    return result

### Step 4: Language Translation

_Note: The coding-assistance example in Step 3 would be handled within a real IDE, so there's no direct code execution for it here._

In [None]:
def translate_text(input_text: str,
                   target_language: str = "Spanish",
                   model: str = "gpt-4o-mini") -> str:
    """
    Translate `input_text` into `target_language` using an OpenAI chat model.

    Relies on the module-level `client` (an openai.OpenAI instance).

    Args:
        input_text: The text to translate; sent as the user message.
        target_language: Language named in the system prompt as the target.
        model: Name of the chat model to query.

    Returns:
        The translated text with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": "You are a professional translator. "
                        f"Translate any user message into {target_language}."},
            {"role": "user", "content": input_text}
        ],
        temperature=0.3,            # low temperature keeps the translation faithful
        max_tokens=120              # upper bound on the length of the translation
    )
    # `message.content` is optional in the API response; fall back to "" so
    # that `.strip()` can never be called on None.
    content = response.choices[0].message.content
    return (content or "").strip()

# Demo: translate a sample English sentence using translate_text's defaults
# (target language "Spanish") and print the result.
english_text = "The quick brown fox jumps over the lazy dog."
translated_text = translate_text(english_text)
print("Translated Text:", translated_text)

### Step 5: Logical Reasoning

In [None]:
def logical_reasoning_question(question: str,
                               model: str = "gpt-4o-mini") -> str:
    """
    Ask a free-form logical-reasoning question and return the model's answer.

    Relies on the module-level `client` (an openai.OpenAI instance). The system
    prompt asks the model to reason step by step and explain itself.

    Args:
        question: The question to pose; sent as the user message.
        model: Name of the chat model to query.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",
             "content": "You are a highly logical assistant. "
                        "Think step-by-step and explain your reasoning."},
            {"role": "user", "content": question}
        ],
        temperature=0.2,            # low temperature for focused, consistent reasoning
        max_tokens=200              # upper bound on the length of the answer
    )
    # `message.content` is optional in the API response; fall back to "" so
    # that `.strip()` can never be called on None.
    content = response.choices[0].message.content
    return (content or "").strip()

# Demo: pose a simple distance = speed x time question with the default model
# and print the model's explanation.
question = "If a train travels at 60 km/h for 2 hours, how far does it travel?"
answer = logical_reasoning_question(question)
print("Logical Answer:", answer)

What Can LLMs Do Today? Real Use Cases Across Providers
Summary: Large language models (LLMs) are AI models designed to understand and generate human language, trained on extensive text data. They can perform tasks like text summarization, code generation, language translation, and logical reasoning. LLMs are widely used in applications such as chatbots, virtual assistants, and content creation tools, with notable examples including GPT-4, Claude, and Gemini.
Translated Text: El rápido zorro marrón salta sobre el perro perezoso.
Logical Answer: To determine how far the train travels, we can use the formula for distance, which is:

\[ \text{Distance} = \text{Speed} \times \text{Time} \]

In this case, the speed of the train is 60 km/h, and the time it travels is 2 hours. 

Now, we can plug in the values:

\[ \text{Distance} = 60 \, \text{km/h} \times 2 \, \text{h} \]

Calculating this gives:

\[ \text{Distance} = 120 \, \text{km} \]

Therefore, the train travels 120 kilometers.


## M1L3SC2: What Can Vision-Language Models Do? Image + Text in Action

### Step 1: Setting Up for Vision-Language Tasks

In [None]:
#!pip install transformers torch pillow #remove pip command

# Import necessary libraries
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
from io import BytesIO
from transformers import BlipProcessor, BlipForConditionalGeneration


# Define function for fetching images from online
def fetch_image(url: str, timeout: int = 10):
    """
    Fetch the image at `url` and return it as an RGB PIL.Image.

    Any failure (network problem, HTTP error status, undecodable image data)
    is printed together with the URL and reported to the caller as None;
    nothing is re-raised.

    * A browser-like User-Agent is sent, since some CDNs answer 403/404 without one.
    * HTTP error statuses are turned into exceptions so the printed message
      shows the real reason.
    * The image is converted to RGB to avoid "cannot write mode RGBA as JPEG"
      later on.
    """
    user_agent = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    )
    request_headers = {"User-Agent": user_agent}

    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions handled below
        response.raise_for_status()
        raw_bytes = BytesIO(response.content)
        picture = Image.open(raw_bytes)
        return picture.convert("RGB")
    except Exception as err:
        print(f"[fetch_image] {err}  -  {url}")
        return None

### Step 2: Image Captioning with BLIP

In [None]:
# Caption a sample image with the BLIP base captioning checkpoint
model_name_blip = "Salesforce/blip-image-captioning-base"
processor_blip = BlipProcessor.from_pretrained(model_name_blip)
model_blip = BlipForConditionalGeneration.from_pretrained(model_name_blip)

# Sample image used for the captioning demo
image_url_blip = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
image_blip = fetch_image(image_url_blip)

if not image_blip:
    # fetch_image returned None, so there is nothing to caption
    print("Failed to process image for BLIP.")
else:
    inputs_blip = processor_blip(image_blip, return_tensors="pt")
    outputs_blip = model_blip.generate(
        **inputs_blip,
        max_new_tokens=50,
        do_sample=True,  # sampling is enabled, so the caption can vary between runs
    )
    caption_blip = processor_blip.decode(outputs_blip[0], skip_special_tokens=True)
    print("Generated Caption with BLIP:", caption_blip)

Generated Caption with BLIP: a woman and a yellow dog are sitting at the beach on a bright warm day


### Step 3: Zero-Shot Image Classification with CLIP

In [None]:
# Zero-shot classification: let CLIP pick the candidate label that best matches the image
model_name_clip = "openai/clip-vit-base-patch32"
processor_clip = CLIPProcessor.from_pretrained(model_name_clip)
model_clip = CLIPModel.from_pretrained(model_name_clip)

# Candidate labels and the image to classify
labels_clip = ["a cat", "a dog", "a bird"]
image_url_clip = "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg"
image_clip = fetch_image(image_url_clip)

if not image_clip:
    # fetch_image returned None, so there is nothing to classify
    print("Failed to process image for CLIP.")
else:
    inputs_clip = processor_clip(
        text=labels_clip,
        images=image_clip,
        return_tensors="pt",
        padding=True,
    )
    outputs_clip = model_clip(**inputs_clip)

    # Softmax over the label axis turns the image-text scores into probabilities
    logits_per_image_clip = outputs_clip.logits_per_image
    probs_clip = logits_per_image_clip.softmax(dim=1)

    pred_label_clip = labels_clip[probs_clip.argmax()]
    print(f"Predicted Label with CLIP: {pred_label_clip}")

Predicted Label with CLIP: a cat


### Step 4: Visual Question Answering (VQA)

In [None]:
# Simulated VQA with CLIP: CLIP cannot generate an answer to a question, it can
# only score how well each candidate text matches the image. The "answer" is
# therefore the best-matching entry from a list of candidate answers.
model_name_clip_vqa = "openai/clip-vit-base-patch32"
processor_clip_vqa = CLIPProcessor.from_pretrained(model_name_clip_vqa)
model_clip_vqa = CLIPModel.from_pretrained(model_name_clip_vqa)

question_clip_vqa = "What is in this image?"  # Shown for context only; CLIP does not consume the question
# Candidate answers are defined in this cell so it does not depend on variables
# from the classification cell.
candidate_answers_clip_vqa = ["a cat", "a dog", "a bird"]
image_url_clip_vqa = "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg"  # Same sample image as the classification step
image_clip_vqa = fetch_image(image_url_clip_vqa)

if image_clip_vqa:
    # Score every candidate answer against the image. Passing a single text
    # would make the softmax trivially 1.0 and the argmax always 0.
    inputs_clip_vqa = processor_clip_vqa(
        text=candidate_answers_clip_vqa,
        images=image_clip_vqa,
        return_tensors="pt",
        padding=True,
    )
    outputs_clip_vqa = model_clip_vqa(**inputs_clip_vqa)
    # Softmax over the candidate axis gives one probability per candidate answer
    probs_vqa = outputs_clip_vqa.logits_per_image.softmax(dim=1)
    answer_clip_vqa = candidate_answers_clip_vqa[probs_vqa.argmax().item()]
    print("Answer with CLIP (simulated for VQA):", answer_clip_vqa)
else:
    print("Failed to process image for CLIP VQA.")

Answer with CLIP (simulated for VQA): a cat
