# Import necessary libraries

In [None]:
%%bash

# Video downloader
pip install pytube

# The extras spec is quoted so the shell can never treat [torch] as a glob pattern
pip3 -qqq install "transformers[torch]" torch datasets

pip3 -qqq install gdcm
pip3 -qqq install pydicom
pip -qqq install faiss-gpu
pip -qqq install pinecone-client
pip -qqq install scenedetect

In [None]:
import os
import faiss
import torch
import skimage
import pinecone
import requests
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from pytube import YouTube
from PIL import Image
from io import BytesIO
import IPython.display
from datasets import load_dataset
from collections import OrderedDict
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from sklearn.decomposition import PCA
import time
import cv2

# Dataset

## Download dataset

In [None]:
df = pd.read_csv('https://query.data.world/s/cagbol5yuk7ykt2sswer5vdcn5yxeg?dws=00000')

In [None]:
df.head()

In [None]:
len(df)

In [None]:
# Take an independent copy of the first 200 rows: columns are added to df_small
# later, and assigning into a slice of `df` triggers SettingWithCopyWarning.
df_small = df[:200].copy()

In [None]:
len(df_small)

If you run the YouTube downloader you may get this error:

```AttributeError: 'NoneType' object has no attribute 'span'```

I patched this error by modifying `{home}/.local/lib/python3.7/site-packages/pytube/cipher.py`, line 411, changing

```transform_plan_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1)```

to

```transform_plan_raw = js```

After that, everything works fine.
Hope this solves your problem.

In [None]:
def check_valid_URLs(video_id, timeout=10):
  """Return True when YouTube's oEmbed endpoint answers HTTP 200 for `video_id`.

  Args:
    video_id: YouTube video id; it is appended to the oEmbed checker URL.
    timeout: seconds to wait for the HTTP response before giving up.

  Returns:
    True if the response status code is 200; False for any other status code
    or when the request itself fails (timeout, connection error, ...).
  """
  checker_url = "https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v="
  video_url = checker_url + str(video_id)

  try:
    response = requests.get(video_url, timeout=timeout)
  except requests.RequestException:
    # One unreachable URL must not abort the validation of all the other rows
    return False
  return response.status_code == 200
def download_video(video_id, save_path="/content/videos/"):
  """Download the highest-resolution progressive mp4 stream of a YouTube video.

  Args:
    video_id: YouTube video id used to build the watch URL.
    save_path: directory the file is written to.

  Returns:
    'Video downloaded successfully!' on success, "Some Error!" otherwise.
    Callers compare against these exact strings, so they are kept unchanged.
  """
  # link of the video to be downloaded
  link = f"https://www.youtube.com/watch?v={video_id}"

  try:
    yt = YouTube(link)
    # Progressive mp4 streams only, best resolution first
    stream = (
        yt.streams
        .filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    if stream is None:
      # first() yields None when no stream matches the filter
      print(f"No progressive mp4 stream found for {video_id}")
      return "Some Error!"

    stream.download(output_path=save_path)
    return 'Video downloaded successfully!'
  except Exception as error:
    # Report why the download failed instead of silently swallowing everything;
    # KeyboardInterrupt is no longer caught, so the loop can be interrupted.
    print(f"Failed to download {video_id}: {error!r}")
    return "Some Error!"

In [None]:
import os
def my_mkdirs(folder):
  """Create `folder` together with any missing parents; no-op if it already exists."""
  # exist_ok avoids the check-then-create race of a separate os.path.exists test
  os.makedirs(folder, exist_ok=True)
my_mkdirs('/content/videos/')

In [None]:
# Check every video id of df_small against YouTube and record the outcome per row
valid_urls = []
for position, video_id in enumerate(df_small['video id']):
  validation = check_valid_URLs(video_id)
  print(f'validation of {position} url in dataframe is: {validation}')
  valid_urls.append(validation)

In [None]:
df_small['is_valid'] = valid_urls

In [None]:
# Keep only rows whose URL check passed; copy so that columns assigned later
# do not land on a slice of the previous frame (SettingWithCopyWarning).
df_small = df_small[df_small['is_valid'] == True].copy()

In [None]:
len(df_small)

In [None]:
df_small.reset_index(drop=True, inplace=True)

In [None]:
df_small.head()

In [None]:
# Try to download every remaining video and remember which downloads succeeded
valid_video = []
for video_id in df_small['video id']:
  output = download_video(video_id)
  valid_video.append(output != "Some Error!")
  print(output)

In [None]:
df_small['valid_video'] = valid_video

In [None]:
# Keep only rows whose download succeeded; copy so that columns assigned later
# do not land on a slice of the previous frame (SettingWithCopyWarning).
df_small = df_small[df_small['valid_video'] == True].copy()

In [None]:
df_small.reset_index(drop=True, inplace=True)

Get the file names of the videos in the videos folder

In [None]:
from os import listdir
from os.path import isfile, join
mypath = "/content/videos/"
# listdir returns entries in arbitrary order; sorting makes the file list
# (and everything computed from it) reproducible between runs.
# NOTE(review): this order is still unrelated to the row order of df_small.
onlyfiles = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))

In [None]:
onlyfiles

# Get the model and all required preprocessing functions

In [None]:
def get_model_info(model_ID, device):
  """Load the CLIP model, processor and tokenizer for one checkpoint id.

  Args:
    model_ID: checkpoint identifier passed to each `from_pretrained` call.
    device: device the model is moved to.

  Returns:
    Tuple `(model, processor, tokenizer)`.
  """
  clip_model = CLIPModel.from_pretrained(model_ID)
  clip_model = clip_model.to(device)
  # Tuple elements are evaluated left to right: processor first, then tokenizer
  return (
      clip_model,
      CLIPProcessor.from_pretrained(model_ID),
      CLIPTokenizer.from_pretrained(model_ID),
  )

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [None]:
model_ID = 'openai/clip-vit-base-patch32'
model, processor, tokenizer = get_model_info(model_ID, device)

# Phase 1: Visual search

## Extract embeddings from videos

In [None]:
def extract_embedding_from_video(video_path):
  """Return one CLIP image embedding for a video: the mean over about one frame per second.

  Frames are sampled every `int(fps)` frames, resized to 224x224, converted
  from OpenCV's BGR channel order to RGB, and embedded one by one with the
  notebook-level `processor` and `model`.

  Args:
    video_path: path of a video file readable by OpenCV.

  Returns:
    numpy array holding the element-wise mean of the per-frame embeddings.

  Raises:
    ValueError: if the video cannot be opened or no frame could be decoded.
  """
  frames = []
  cap = cv2.VideoCapture(video_path)
  try:
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if not cap.isOpened() or frame_rate <= 0:
      # A zero FPS would otherwise surface as an opaque ZeroDivisionError below
      raise ValueError(f"Cannot read video: {video_path}")

    frame_step = max(int(frame_rate), 1)
    # One sample per full second, but at least one so sub-second clips still yield a frame
    target_frame_count = max(int(total_frames / frame_rate), 1)
    for i in range(target_frame_count):
      cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_step)
      ret, frame = cap.read()
      # Check the read result before touching the frame: it is None on failure
      if not ret:
        break
      frame = cv2.resize(frame, (224, 224))
      # OpenCV decodes to BGR, while the CLIP processor expects RGB images
      frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
  finally:
    # Release the capture even when decoding raises
    cap.release()

  if not frames:
    raise ValueError(f"No frames could be decoded from video: {video_path}")

  embeddings = []
  # Inference only: skip building the autograd graph for every frame
  with torch.no_grad():
    for frame in frames:
      pixel_values = processor(
        text=None,
        images=frame,
        return_tensors='pt',
      )['pixel_values'].to(device)
      embedding = model.get_image_features(pixel_values)
      embeddings.append(np.squeeze(embedding.cpu().detach().numpy()))

  return np.mean(np.asarray(embeddings), axis=0)

In [None]:
from tqdm.notebook import tqdm
def extract_all_embedding(video_names, video_dir="/content/videos/"):
  """Compute one embedding per video file name.

  Args:
    video_names: file names to process; they are resolved inside `video_dir`.
    video_dir: directory that contains the video files.

  Returns:
    List with the result of `extract_embedding_from_video` for each name,
    in the same order as `video_names`.
  """
  embeddings = []
  # Iterate over the argument itself, not the notebook-level `onlyfiles` list
  for file_name in tqdm(video_names):
    video_path = os.path.join(video_dir, file_name)
    embeddings.append(extract_embedding_from_video(video_path))
  return embeddings

In [None]:
# extract_all_embedding already reports progress itself; wrapping its finished
# result in tqdm only turned the list into a progress-bar object.
embeddings = extract_all_embedding(onlyfiles)

## Extract embedding from text

In [None]:
# Materialise the video embeddings as a plain list
embeddings_list = list(embeddings)

In [None]:
def get_single_text_embedding(text):
  """Embed one text with the notebook-level CLIP `tokenizer` and `model`.

  Args:
    text: the string to embed.

  Returns:
    numpy array with the text features, squeezed of singleton dimensions.
  """
  # Truncate so that texts longer than the tokenizer's maximum length do not fail in the model
  inputs = tokenizer(text, return_tensors='pt', truncation=True).to(device)
  # Inference only: no autograd graph needed
  with torch.no_grad():
    text_embeddings = model.get_text_features(**inputs)
  # Convert the embeddings to numpy array
  embedding_as_np = text_embeddings.cpu().detach().numpy()

  return np.squeeze(embedding_as_np)

In [None]:
def get_all_text_embeddings(df, text_col):
  """Add a 'text_embeddings' column with one embedding per row and return the frame.

  Args:
    df: DataFrame to extend; it is modified in place.
    text_col: name of the column holding the texts (converted with `str`).

  Returns:
    The same DataFrame object that was passed in.
  """
  source_column = str(text_col)
  per_row_embeddings = df[source_column].apply(get_single_text_embedding)
  df['text_embeddings'] = per_row_embeddings
  return df

In [None]:
df_small = get_all_text_embeddings(df_small, 'name')

In [None]:
df_small.head()

## Dimension reduction using PCA

In [None]:
# n_components = min(random_number, num_images, len(features))
n_components = min(100, len(df_small), len(df_small['text_embeddings'][0]))

In [None]:
# Check trade-off between num-dimensions and variance
pca = PCA(n_components)
pca.fit(list(df_small['text_embeddings']))
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was removed later, so pick whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
matplotlib.style.use(seaborn_style)
plt.plot(range(1, n_components + 1), pca.explained_variance_ratio_, 'o--', markersize=4)
plt.title('Variance for each PCA dimension')
plt.xlabel('PCA Dimension')
plt.ylabel('Variance')
plt.grid(True)
plt.show()

In [None]:
# Number of features for tex-image search after dimension reduction
num_feature_dimensions = 30

In [None]:
df_small['text_embeddings'].shape

In [None]:
# Fit PCA on the text embeddings, then project those same embeddings
text_embedding_rows = list(df_small['text_embeddings'])
pca = PCA(n_components=num_feature_dimensions)
pca.fit(text_embedding_rows)
text_embedding_compressed = pca.transform(text_embedding_rows).tolist()

In [None]:
len(text_embedding_compressed), len(text_embedding_compressed[0])

In [None]:
video_embeddings_compressed = pca.transform(embeddings_list)
video_embeddings_compressed = video_embeddings_compressed.tolist()

In [None]:
len(video_embeddings_compressed), len(video_embeddings_compressed[0])

In [None]:
# Attach the compressed video vectors to df_small by position.
# NOTE(review): these vectors follow the order of the files listed from the videos
# folder, while df_small follows the dataset's row order — nothing visible here ties
# file i to row i (or guarantees equal lengths). Confirm the alignment before relying on it.
df_small['video_embeddings_compressed'] = video_embeddings_compressed

In [None]:
df_small['text_embeddings_compressed'] = text_embedding_compressed

## Pinecone

In [None]:
from pinecone import Pinecone, ServerlessSpec

In [None]:
import getpass

# Secrets must not live in the notebook: read the key from the environment,
# falling back to an interactive prompt when the variable is not set.
API_KEY = os.environ.get('PINECONE_API_KEY') or getpass.getpass('Pinecone API key: ')

In [None]:
pc = Pinecone(api_key=API_KEY)

In [None]:
import os
CLOUD = os.environ.get('PINECONE_CLOUD') or 'aws'
REGION = os.environ.get('PINECONE_REGION') or 'us-east-1'

SPEC = ServerlessSpec(cloud=CLOUD, region=REGION)

In [None]:
# Create a serverless index (only if it does not exist yet) and connect to it
import time

INDEX_NAME = "tex-video"

# Names of the indexes that already exist for this client
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if INDEX_NAME not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        INDEX_NAME,
        dimension=num_feature_dimensions,  # vector size of the index, taken from num_feature_dimensions
        metric='cosine',
        spec=SPEC
    )
    # wait for index to be initialized
    # NOTE(review): this polls once per second with no upper bound
    while not pc.describe_index(INDEX_NAME).status['ready']:
        time.sleep(1)

# connect to index
my_index = pc.Index(INDEX_NAME)
time.sleep(1)
# view index stats
my_index.describe_index_stats()

In [None]:
# Use each row's index label, rendered as a string, as its vector id
df_small['vector_id'] = [str(label) for label in df_small.index]

In [None]:
# One metadata record per row of df_small: position, title and video id
final_metadata = [
    {'ID': position, 'caption': caption, 'image': video_id}
    for position, (caption, video_id) in enumerate(zip(df_small['name'], df_small['video id']))
]

# Vector ids and compressed video vectors, both in row order
video_embeddings_compressed_IDs = list(df_small.vector_id)
video_embeddings = list(df_small.video_embeddings_compressed)

In [None]:
# Assemble the records to upsert: id, vector values and metadata per video.
# The ids come from video_embeddings_compressed_IDs; the previously referenced
# name `image_IDs` is never defined in this notebook.
vectors = [
    {'id': vector_id,
    'values': values,
    'metadata': metadata}
    for vector_id, values, metadata in zip(video_embeddings_compressed_IDs, video_embeddings, final_metadata)
]

In [None]:
my_index.upsert(vectors=vectors)

In [None]:
# Check index size for each namespace
my_index.describe_index_stats()

In [None]:
def pinecone_quey(text):
  """Embed `text`, reduce it with the fitted `pca`, and query the Pinecone index.

  Args:
    text: the search text.

  Returns:
    The result of `my_index.query` for the 4 nearest vectors, metadata included.
  """
  query_embedding = get_single_text_embedding(text)
  # pca.transform expects a 2-D input, so add a leading batch axis
  compressed_text = pca.transform(np.expand_dims(query_embedding, axis=0))
  squeezed_text = np.squeeze(compressed_text).tolist()
  # Hand the matches back to the caller; previously the result was discarded
  return my_index.query(vector=squeezed_text, top_k=4, include_metadata=True)


In [None]:
# Get the query text
text_query = df_small.iloc[10]['name']

# Get the caption embedding
query_embedding = get_single_text_embedding(text_query)


In [None]:
np.expand_dims(query_embedding, axis=0).shape

In [None]:
# Get the query text
text_query = df_small.iloc[10]['name']

# Get the caption embedding
query_embedding = get_single_text_embedding(text_query)

# Reduce the dimension
compressed_text = pca.transform(np.expand_dims(query_embedding, axis=0))

# Squeeze embedding
squeezed_text = np.squeeze(compressed_text).tolist()

In [None]:
squeezed_text

In [None]:
matches = my_index.query(vector=squeezed_text, top_k=4, include_metadata=True)

In [None]:
len(matches['matches'])

In [None]:
matches

# Phase 2: Text search

In [None]:
!pip install elasticsearch

In [None]:
from elasticsearch import Elasticsearch
from datetime import datetime
import getpass

# Credentials must not be hard-coded in the notebook: read them from the
# environment, falling back to an interactive prompt when a variable is not set.
cloud_id = os.environ.get('ELASTIC_CLOUD_ID') or getpass.getpass('Elastic Cloud ID: ')
api_key = os.environ.get('ELASTIC_API_KEY') or getpass.getpass('Elastic API key: ')

client = Elasticsearch(cloud_id=cloud_id, api_key=api_key)

In [None]:
print(client.info())

In [None]:
from elasticsearch import helpers

def gendata():
  """Yield one bulk-index action per row of `df_small` for the "videos" index.

  Each action carries the video id, name, channel and category of the row.
  The video id is also used as the document `_id`, so running the bulk load
  again overwrites the existing documents instead of adding duplicates.
  """
  rows = zip(
      df_small["video id"],
      df_small["name"],
      df_small["channel"],
      df_small["category"],
  )
  for video_id, name, channel, category in rows:
    yield {
        "_index": "videos",
        "_id": video_id,
        "video_id": video_id,
        "name": name,
        "channel": channel,
        "category": category
    }

helpers.bulk(client, gendata())

# Phase 3: Serving

In [None]:
def pretty_elastic(response):
  """Turn an Elasticsearch search response into a list of printable strings.

  Args:
    response: mapping with the hits stored under response["hits"]["hits"].

  Returns:
    One formatted string per hit; an empty list (after printing a notice)
    when there are no hits.
  """
  hits = response["hits"]["hits"]
  if len(hits) == 0:
    print("Your search returned no results")
    return []

  outputs = []
  for hit in hits:
    source = hit["_source"]
    labelled_values = [
        ("ID", hit["_id"]),
        ("Video ID", source["video_id"]),
        ("Name", source["name"]),
        ("Channel", source["channel"]),
        ("Category", source["category"]),
        ("Score", hit["_score"]),
    ]
    outputs.append("".join(f"\n{label}: {value}" for label, value in labelled_values))
  return outputs

In [None]:
def elastic_query(text):
  """Run a fuzzy search for `text` on the "videos" index and format the hits.

  The name field must match; channel and category matches are optional
  ("should") clauses of the same bool query.

  Args:
    text: the search text.

  Returns:
    The list produced by `pretty_elastic` for the search response.
  """
  def fuzzy_match(field):
    # Fuzzy match clause for one field
    return {"match": {field: {"query": text, "fuzziness": "auto"}}}

  bool_query = {
      "bool": {
          "must": [fuzzy_match("name")],
          "should": [fuzzy_match("channel"), fuzzy_match("category")],
      }
  }
  response = client.search(index="videos", query=bool_query)
  return pretty_elastic(response)

In [None]:
elastic_output = elastic_query("actors are in studio")

In [None]:
for output in elastic_output:
  print(output)

In [None]:
def pretty_pinecone(matches):
  """Turn a Pinecone query result into a list of printable strings.

  Args:
    matches: mapping with the result items stored under matches['matches'].

  Returns:
    One formatted string per item; an empty list (after printing a notice)
    when there are no items.
  """
  items = matches['matches']
  if len(items) == 0:
    print("Your search returned no result!")
    return []

  output = []
  for item in items:
    metadata = item['metadata']
    labelled_values = [
        ("ID", metadata['ID']),
        ("Video_ID", metadata['image']),
        ("Caption", metadata['caption']),
        ("Score", item['score']),
    ]
    output.append("".join(f"\n{label}: {value}" for label, value in labelled_values))
  return output

In [None]:
def pinecone_query(text):
  """Search the Pinecone index for `text` and return the formatted matches.

  Args:
    text: the search text.

  Returns:
    The list produced by `pretty_pinecone` for the 10 nearest vectors.
  """
  # Embed the text and add a batch axis, since pca.transform works on 2-D input
  embedding_row = np.expand_dims(get_single_text_embedding(text), axis=0)
  reduced_vector = np.squeeze(pca.transform(embedding_row)).tolist()
  return pretty_pinecone(
      my_index.query(vector=reduced_vector, top_k=10, include_metadata=True)
  )


In [None]:
pinecone_output = pinecone_query("actors are in studio")

In [None]:
for output in pinecone_output:
  print(output)

In [None]:
def fusing_answers(text):
  """Print the Pinecone and Elasticsearch results for `text` side by side.

  Results are paired by rank. When one engine returns fewer results than the
  other, the missing side is shown as "No match" instead of dropping the
  remaining results of the longer list.

  Args:
    text: the search text passed to both `elastic_query` and `pinecone_query`.
  """
  from itertools import zip_longest

  elastic_output = elastic_query(text)
  pinecone_output = pinecone_query(text)
  # Plain zip would stop at the shorter list and hide the extra results
  paired = zip_longest(pinecone_output, elastic_output, fillvalue="\nNo match")
  for i, (pinecone_item, elastic_item) in enumerate(paired):
    print(
        f"Number {i} Pinecone match: {pinecone_item}\n\nNumber {i} Elastic match: {elastic_item}"
    )

In [None]:
fusing_answers("crisis and cratus studio")