In [None]:
import torch
import matplotlib
import numpy as np
import cv2
import os
from PIL import Image

# #stable diffusion
from diffusers import StableDiffusionPipeline

# #stable diffusion between two prompts
from stable_diffusion_videos import StableDiffusionWalkPipeline


#whisper and a tool to download Youtube videos
import whisper
import pytube

import warnings
warnings.filterwarnings('ignore')

# Object Detection

## General Detection

- Object detection is the task of locating and classifying objects in an image or video.


- YOLO is a state-of-the-art object detection library and is completely open-source.


- Click [here](https://github.com/ultralytics/yolov5) to learn more about it. 


- You can easily deploy YOLO for your own object detection problems. [Here](https://www.youtube.com/watch?v=tFNJGim3FXw) is an amazing tutorial for you.


In [None]:
# Fetch the pretrained small YOLOv5 checkpoint through torch.hub.
# force_reload=True skips the local hub cache and downloads a fresh copy.
model = torch.hub.load(
    'ultralytics/yolov5',
    'yolov5s',
    force_reload=True,
)

In [None]:
#cap = cv2.VideoCapture(0)
cap = cv2.VideoCapture("nascar.mp4")
try:
    while cap.isOpened():
        # grab the frame
        ret, frame = cap.read()

        # read() reports failure (ret False / frame None) once the file is
        # exhausted, while isOpened() keeps returning True. Stop here instead
        # of looping on empty reads until 'q' is pressed.
        if not ret or frame is None:
            break

        # detect the objects and show the annotated frame
        results = model(frame)
        cv2.imshow('YOLO', np.squeeze(results.render()))

        # press 'q' to stop early
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # always free the capture and close the window, even if detection raised
    cap.release()
    cv2.destroyAllWindows()

## Custom Detection

In [None]:
# Load YOLOv5 with the custom weights stored in best_model.pt.
# force_reload=True skips the local hub cache and downloads a fresh copy.
model = torch.hub.load(
    'ultralytics/yolov5',
    'custom',
    path='best_model.pt',
    force_reload=True,
)

In [None]:
# Run the detector on a single test image given by its path
img = 'dataset/test/images/510x.jpg'
results = model(img)

# render() yields the annotated image data; drop the singleton dimension,
# wrap it in a PIL image and open it in the default viewer
annotated = np.squeeze(results.render())
img = Image.fromarray(annotated)
img.show()

In [None]:
cap = cv2.VideoCapture('dataset/myvideo.mp4')
try:
    while cap.isOpened():
        ret, frame = cap.read()

        # read() reports failure (ret False / frame None) once the file is
        # exhausted, while isOpened() keeps returning True. Stop here instead
        # of looping on empty reads until 'q' is pressed.
        if not ret or frame is None:
            break

        # Make detections and show the annotated frame
        results = model(frame)
        cv2.imshow('YOLO', np.squeeze(results.render()))

        # press 'q' to stop early
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # always free the capture and close the window, even if detection raised
    cap.release()
    cv2.destroyAllWindows()

# Generating Images from Text: Stable Diffusion

- Text-to-image deep learning architecture. The model was trained on 2.6 billion images at an approximate cost of **$600,000**.
- If you want to try it in your browser, visit [HuggingFace](https://huggingface.co/spaces/stabilityai/stable-diffusion)
- If you hit a login issue, uncomment and run the code below.
- Be creative and try out different long, descriptive prompts. For example, you can get inspiration from [here](https://mpost.io/best-100-stable-diffusion-prompts-the-most-beautiful-ai-text-to-image-prompts/).

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# Checkpoint shared by the Stable Diffusion cells
model_id = "CompVis/stable-diffusion-v1-4"

# Load the fp16 revision in half precision and move the pipeline onto the GPU
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
).to("cuda")

pipe.enable_attention_slicing()

In [None]:
#enter a prompt
prompt = "ultrarealistic, (native american old man ) portrait, cinematic lighting,\
award winning photo, no color, 80mm lense –beta –upbeta –upbeta"

pipe(prompt).images[0]  

In [None]:
# Text prompt; adjacent literals are joined into one string with no separator
prompt = (
    "the living room of a cozy wooden house with a fireplace, wallpaper, warm, digital art. "
    "art by james gurney and larry elmore"
)

# The first generated image is the cell's output
pipe(prompt).images[0]

In [None]:
# The variable must be named `prompt` so the call below renders this text
# rather than a prompt left over from an earlier cell.
# Adjacent literals inside parentheses replace the backslash continuation,
# which breaks the string if any whitespace follows the backslash.
prompt = (
    "kneeling cat knight, portrait, finely detailed armor,"
    "intricate design, silver, silk, cinematic lighting, 4k"
)

# The first generated image is the cell's output
pipe(prompt).images[0]

## Create an Artwork

- Things are getting interesting. If we can generate images from prompts, why not generate images between two different prompts, just like interpolation?


- Enter two prompts and check the video in *stable_diffusion_videos* folder. Increase *"num_images"* to get more and more interesting prompt evolution


- Check out the videos I already generated in "stable_diffusion_videos" folder. Try out different prompts.

In [None]:
# Start and end prompts for the walk, and how many frames to place between them
prompt1 = 'old rusty 18th century building in desert'
prompt2 = 'a futuristic building, vivid colors, 4K'
num_images = 60

# Load the walk pipeline in half precision (fp16 revision) and move it to the GPU
pipeline = StableDiffusionWalkPipeline.from_pretrained(
    model_id,
    revision="fp16",
    torch_dtype=torch.float16,
)
pipeline = pipeline.to("cuda")
pipeline.enable_attention_slicing()

# Arguments for walk(), collected in one place
walk_settings = dict(
    prompts=[prompt1, prompt2],
    seeds=[42, 1337],
    # number of images to generate in between prompts
    num_interpolation_steps=num_images,
    # use multiples of 64 if > 512. Multiples of 8 if < 512.
    height=512,
    width=512,
    # where images/videos will be saved
    output_dir='stable_diffusion_videos',
    # subdirectory of output_dir where images/videos will be saved
    name='building_test',
    # higher adheres to prompt more, lower lets model take the wheel
    guidance_scale=5,
    # number of diffusion steps per image generated. 50 is good default
    num_inference_steps=50,
)
video_path = pipeline.walk(**walk_settings)

# Whisper

- The model is by OpenAI and was trained on 680,000 hours of multilingual audio. If you sleep 8 hours a day and live 80 years, you can talk/listen for at most 16*365*80 = 467,200 hours.
- Check the repo [here](https://github.com/openai/whisper)
- Put any recording in the current folder and change the filename

## Transcribe Audio

In [None]:
# Recording to transcribe
filename = "file.m4a"

# Load the "small" Whisper checkpoint and transcribe the recording
model = whisper.load_model("small")
result = model.transcribe(filename)

# Print the transcribed text
transcript = result["text"]
print(transcript)

## Download and Transcribe Youtube Videos

- Download and transcribe any YouTube video. Just paste the link below and see what happens. Let's try out a fairly long video about T-cells and B-cells.

In [None]:
from IPython.display import YouTubeVideo

# Embed the player for the given video id as the cell output
YouTubeVideo('eOStU5kaCpk', height=600, width=800)

In [None]:
# download and get the audio
video = 'https://www.youtube.com/watch?v=eOStU5kaCpk'
data = pytube.YouTube(video)

# Pick the audio-only stream and save it locally.
# download() returns the path of the file it wrote. Use that path rather than
# rebuilding it as data.title + '.mp4': the saved filename is derived from a
# filesystem-safe form of the title, so titles containing punctuation would
# otherwise point at a file that does not exist.
audio = data.streams.get_audio_only()
data_name = audio.download()

# let Whisper do the rest
model = whisper.load_model("base")
result = model.transcribe(data_name)
print(result["text"])

# Hugging Face Transformers

- [Hugging Face](https://huggingface.co/) is an open-source community that provides state-of-the-art deep learning models, mostly based on Transformers, to developers and researchers.


- Hugging Face also offers a cloud-based platform called Hugging Face Hub that allows users to share, train, and deploy deep learning models. To learn how to use it here is their [official tutorial](https://youtube.com/playlist?list=PLo2EIpI_JMQvWfQndUesu0nPBAtZ9gP1o)



- Almost every task in HuggingFace is wrapped up in the pipeline function. 

In [None]:
from transformers import pipeline

## Text Summarization

In [None]:
# Summarization pipeline backed by the BART CNN checkpoint
summarizer = pipeline("summarization", model = "facebook/bart-large-cnn")

# The transcript of a long video can exceed the model's maximum input length,
# which makes the call fail. truncation=True clips the input to that limit,
# so the summary covers the beginning of the transcript.
summarizer(result["text"], truncation=True)

## Translation

In [None]:
# Translation pipeline using the Helsinki-NLP tr-en checkpoint
tr_translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-tc-big-tr-en",
)

In [None]:
# Sample passage to translate
text="İstanbul Valiliği’nin açıklamasında 17 Ağustos 1999 Marmara depremi \
nedeniyle 2006’da kurulan İstanbul Proje Koordinasyon Birimi’nin (İPKB) projesi kapsamındaki çalışamalarda \
93 riskli okuldan 76 okulun yıkılıp yeniden yapılması,\
17’sinin de güçlendirilmesi kararlaştırıldı."

# Pass the passage through the translator and show the translated string.
# Without this call the passage is defined but never translated.
tr_translator(text)[0]["translation_text"]

## Image Captioning

In [None]:
# Image captioning pipeline using the ViT-GPT2 checkpoint
image_to_text = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)

In [None]:
# Open the sample photo; the bare name on the last line displays it
baby_image_path = "babies.jpg"
baby_image = Image.open(baby_image_path)
baby_image

In [None]:
# Caption the photo and show the text of the first result
baby_captions = image_to_text(baby_image)
baby_captions[0]["generated_text"]

In [None]:
# Open the sample photo; the bare name on the last line displays it
car_image_path = "cars_image.jpg"
car_image = Image.open(car_image_path)
car_image

In [None]:
# Caption the photo and show the text of the first result
car_captions = image_to_text(car_image)
car_captions[0]["generated_text"]

## Project Ideas

You can try out the exercises in HuggingFace

- Transcribe the conversation between multiple people and process it.


- Scrape the news from a given URL and summarize it.


- Grab a Youtube video in a different language, transcribe, translate and summarize it.

# ChatGPT

- Generative Pretrained **Transformer** (GPT) is a deep learning model with over 175 billion parameters.
- ChatGPT was trained on 570GB of textual data.
- It cost $50 million to train ChatGPT.
- Visit [https://chat.openai.com/chat](https://chat.openai.com/chat)