In [1]:
# !pip install opencv-python-headless scipy
# !pip install ImageHash

In [2]:
# import cv2
# import os
# import numpy as np

# def extract_frames(video_path, output_dir, fps=2):
#     """
#     비디오에서 초당 프레임을 추출합니다.
#     """
#     frames=[]
#     if not os.path.exists(output_dir):
#         os.makedirs(output_dir)

#     cap = cv2.VideoCapture(video_path)
#     length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
#     fps_video = cap.get(cv2.CAP_PROP_FPS)
#     frame_rate_ratio = int(fps_video / fps)

#     for i in range(length):
#         ret, frame = cap.read()
#         if not ret:
#             break
#         if i % frame_rate_ratio == 0:
#             frames.append(frame)
#             frame_path = os.path.join(output_dir, f"frame_{i}.jpg")
#             cv2.imwrite(frame_path, frame)
#     cap.release()
#     return frames





In [21]:
import os
import cv2
import numpy as np
import json
from datetime import timedelta
from transformers import ViTImageProcessor, ViTModel
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

In [11]:
def calculate_batch_sharpness(batch_frames):
    """
    Compute a sharpness score for every frame in a batch.

    Each frame is converted from BGR to grayscale and scored with the
    variance of its Laplacian (evaluated at cv2.CV_64F depth).

    Returns:
        A list holding one score per input frame, in input order.
    """
    def _laplacian_variance(bgr_frame):
        # Grayscale first: the Laplacian is taken on a single channel.
        grayscale = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2GRAY)
        return cv2.Laplacian(grayscale, cv2.CV_64F).var()

    return [_laplacian_variance(bgr_frame) for bgr_frame in batch_frames]

In [12]:
def select_sharpest_frames(frames, timestamps, batch_size=4):
    """
    Keep the single sharpest frame out of every consecutive batch.

    `frames` and `timestamps` are walked in lock-step windows of
    `batch_size` items; within each window the frame with the highest
    score from `calculate_batch_sharpness` wins (the first one on ties,
    as decided by `np.argmax`).

    Returns:
        (selected_frames, selected_timestamps): two lists with one entry
        per window.
    """
    picked_frames = []
    picked_times = []
    for start in range(0, len(frames), batch_size):
        window = slice(start, start + batch_size)
        candidates = frames[window]
        best = np.argmax(calculate_batch_sharpness(candidates))
        picked_frames.append(candidates[best])
        picked_times.append(timestamps[window][best])
    return picked_frames, picked_times

In [30]:
def extract_features(frames, processor, model, batch_size=32):
    """
    Extract one feature vector per frame.

    Every frame is converted from BGR to RGB, wrapped as a PIL image and
    passed through `processor` and `model`; the first token of the model's
    `last_hidden_state` is kept as that frame's feature vector.

    Args:
        frames: Sequence of BGR frames (as produced by cv2).
        processor: Callable invoked as `processor(images=..., return_tensors="pt")`
            whose result is unpacked into `model(**inputs)`.
        model: Callable returning an object with a `last_hidden_state` tensor.
        batch_size: Maximum number of frames sent through the model at once,
            so long videos do not need to fit in a single forward pass.

    Returns:
        A numpy array with one row per input frame, in input order.

    Raises:
        ValueError: If `frames` is empty or `batch_size` is smaller than 1.
    """
    # Local import keeps this cell self-contained; the "pt" tensors already
    # require torch to be installed.
    import torch

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(frames) == 0:
        raise ValueError("extract_features needs at least one frame")

    feature_chunks = []
    # Inference only: disabling autograd avoids keeping activations alive for
    # a backward pass that never happens.
    with torch.no_grad():
        for start in range(0, len(frames), batch_size):
            images = [
                Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                for frame in frames[start:start + batch_size]
            ]
            inputs = processor(images=images, return_tensors="pt")
            outputs = model(**inputs)
            first_token = outputs.last_hidden_state[:, 0, :]
            # .cpu() makes the numpy conversion work even if the model's
            # output tensor lives on an accelerator.
            feature_chunks.append(first_token.detach().cpu().numpy())
    return np.concatenate(feature_chunks, axis=0)

In [32]:
def cluster_frames(features, distance_threshold=0.5):
    """
    Group frames by the cosine distance between their feature vectors.

    Args:
        features: One feature vector per frame (array-like with one row
            per frame).
        distance_threshold: Average-linkage cosine distance at or above
            which clusters are no longer merged. Defaults to 0.5.

    Returns:
        A list of integer cluster labels, one per input row.
    """
    n_samples = len(features)
    if n_samples < 2:
        # Nothing to compare against: AgglomerativeClustering is assumed to
        # reject fewer than two samples, so a lone frame becomes its own group.
        return [0] * n_samples

    similarity_matrix = cosine_similarity(features)
    # Floating-point error can push a similarity slightly above 1, which would
    # turn into a negative "distance"; clamp to the valid cosine-distance range.
    distance_matrix = np.clip(1 - similarity_matrix, 0.0, 2.0)
    clustering = AgglomerativeClustering(
        n_clusters=None,
        metric='precomputed',
        linkage='average',
        distance_threshold=distance_threshold,
    )
    clustering.fit(distance_matrix)
    return clustering.labels_.tolist()

In [15]:
def process_video(video_path, output_dir, fps=1, batch_size=4):
    """
    Run the full frame-selection pipeline on one video.

    Frames are sampled from the video, the sharpest frame of every batch is
    kept, features are extracted from the kept frames, the frames are
    clustered by feature similarity, and the result is written to
    `output_dir` by `save_frames`.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory that receives the output; created if missing.
        fps: Sampling rate forwarded to `extract_frames`.
        batch_size: Number of consecutive sampled frames competing for
            "sharpest" in `select_sharpest_frames`.

    Raises:
        ValueError: If no frame could be read from `video_path`.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    # Read the video before loading the model so an unreadable file fails
    # fast instead of after the model has been loaded.
    frames, timestamps = extract_frames(video_path, fps)
    if len(frames) == 0:
        raise ValueError(f"No frames could be read from video: {video_path!r}")

    processor, model = load_model_and_processor()
    selected_frames, selected_timestamps = select_sharpest_frames(frames, timestamps, batch_size)
    features = extract_features(selected_frames, processor, model)
    labels = cluster_frames(features)
    save_frames(selected_frames, selected_timestamps, labels, output_dir)

In [16]:
def extract_frames(video_path, fps):
    """
    Sample frames and their timestamps from a video.

    Every `int(video_fps / fps)`-th frame (at least every frame) is kept,
    together with its timestamp in seconds computed as
    `frame_index / video_fps`.

    Args:
        video_path: Path of the video file to open with cv2.VideoCapture.
        fps: Desired number of sampled frames per second; must be positive.

    Returns:
        (frames, timestamps): two parallel lists.

    Raises:
        ValueError: If `fps` is not positive, or the video reports a frame
            rate that is not a positive finite number.
        OSError: If the video cannot be opened.
    """
    if not fps > 0:
        raise ValueError(f"fps must be positive, got {fps!r}")

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Without this check a wrong path silently yields empty lists.
            raise OSError(f"Could not open video: {video_path!r}")

        fps_video = cap.get(cv2.CAP_PROP_FPS)
        # Rejects 0, negatives, NaN and infinity, all of which would break
        # the ratio or the timestamp division below.
        if not 0 < fps_video < float("inf"):
            raise ValueError(
                f"Video reports an unusable frame rate ({fps_video!r}): {video_path!r}"
            )
        frame_rate_ratio = max(int(fps_video / fps), 1)

        frames = []
        timestamps = []
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_rate_ratio == 0:
                frames.append(frame)
                timestamps.append(frame_count / fps_video)
            frame_count += 1
    finally:
        # Release the capture even when validation or decoding raises.
        cap.release()
    return frames, timestamps

In [29]:
def save_frames(frames, timestamps, labels, output_dir):
    """
    Write the selected frames as JPEG files and record them in a JSON index.

    Each frame is saved as `frame_{i}_group_{label}.jpg` inside `output_dir`,
    and `frame_info.json` in the same directory lists, per frame, its file
    name, its timestamp formatted via `timedelta`, and its group label.

    Args:
        frames: Frames to write with cv2.imwrite.
        timestamps: Timestamp in seconds for each frame.
        labels: Group label for each frame.
        output_dir: Destination directory; created if it does not exist.

    Raises:
        OSError: If cv2.imwrite reports that a frame could not be written.
    """
    if output_dir:
        # cv2.imwrite does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)

    frame_info = []
    for i, (frame, timestamp, label) in enumerate(zip(frames, timestamps, labels)):
        file_name = f"frame_{i}_group_{label}.jpg"
        frame_path = os.path.join(output_dir, file_name)
        # imwrite signals failure through its return value rather than an
        # exception; without this check the index would list missing files.
        if not cv2.imwrite(frame_path, frame):
            raise OSError(f"Could not write frame image: {frame_path!r}")
        frame_info.append({
            "file_name": file_name,
            "timestamp": str(timedelta(seconds=float(timestamp))),
            "group": int(label)  # Ensure label is in Python int type
        })

    json_path = os.path.join(output_dir, "frame_info.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(frame_info, f, indent=4)

In [22]:
def load_model_and_processor(model_name='google/vit-base-patch16-224-in21k'):
    """
    Load the ViT image processor and model for one pretrained checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both `from_pretrained`
            calls, so processor and model always come from the same
            checkpoint. Defaults to 'google/vit-base-patch16-224-in21k'.

    Returns:
        (processor, model): the `ViTImageProcessor` and `ViTModel` instances.
    """
    processor = ViTImageProcessor.from_pretrained(model_name)
    model = ViTModel.from_pretrained(model_name)
    return processor, model

In [33]:
# Run the whole pipeline on one video: process_video samples frames, keeps the
# sharpest frame of each batch, clusters them, and writes the images plus
# frame_info.json into output_dir.
video_path='videos/와 진짜 말도 안되는 미쳐버린 상상력으로 만들어낸 띵작 영화 [결말포함].mp4'
output_dir = 'frames'  # output directory path
FPS=1  # passed as process_video's fps argument
BATCH_SIZE = 4  # batch size
process_video(video_path, output_dir, FPS, BATCH_SIZE)

In [46]:
from geoclip import GeoCLIP
import torch

# Pick the best available device instead of hard-coding 'mps', which fails on
# machines without Apple Metal support.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    GEOCLIP_DEVICE = 'mps'
elif torch.cuda.is_available():
    GEOCLIP_DEVICE = 'cuda'
else:
    GEOCLIP_DEVICE = 'cpu'

# `model` is the notebook-global GeoCLIP instance used by the location cells below.
model = GeoCLIP()
model.to(GEOCLIP_DEVICE)

GeoCLIP(
  (image_encoder): ImageEncoder(
    (CLIP): CLIPModel(
      (text_model): CLIPTextTransformer(
        (embeddings): CLIPTextEmbeddings(
          (token_embedding): Embedding(49408, 768)
          (position_embedding): Embedding(77, 768)
        )
        (encoder): CLIPEncoder(
          (layers): ModuleList(
            (0-11): 12 x CLIPEncoderLayer(
              (self_attn): CLIPAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): Linear(in_features=768, out_features=768, bias=True)
                (q_proj): Linear(in_features=768, out_features=768, bias=True)
                (out_proj): Linear(in_features=768, out_features=768, bias=True)
              )
              (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): CLIPMLP(
                (activation_fn): QuickGELUActivation()
                (fc1): Linear(in_features=768, out_features=3072, bias=True)
            

In [39]:
# Reload the frame index from disk. The path matches output_dir = 'frames'
# plus the "frame_info.json" name that save_frames writes.
frame_info=None
with open('frames/frame_info.json','r') as f:
    frame_info = json.load(f)

In [63]:
def frames_to_location(frame_info, model, frames_dir='frames', min_prob=0.08,
                       output_path='predict_locations.json'):
    """
    Predict a GPS location for every frame and keep the confident ones.

    For each entry of `frame_info`, `model.predict(image_path, top_k=1)` is
    called on the frame's image file. Entries whose top probability is
    greater than `min_prob` are copied and extended with "latitude",
    "longitude" (both rounded to six decimals) and "probability". The kept
    entries are written to `output_path` as JSON and returned.

    Args:
        frame_info: Iterable of dicts, each with at least a 'file_name' key.
        model: Object whose `predict(image_path, top_k=1)` returns
            `(top_pred_gps, top_pred_prob)`, indexable so that
            `top_pred_gps[0][0]`, `top_pred_gps[0][1]` and `top_pred_prob[0]`
            each expose `.item()`.
            # NOTE(review): the exact predict API depends on the model used — confirm.
        frames_dir: Directory containing the frame image files.
        min_prob: Probability a prediction must exceed to be kept.
        output_path: Path of the JSON file to write.

    Returns:
        The list of kept frame dicts with location fields added.
    """
    predict_locations = []
    for frame in frame_info:
        image_path = os.path.join(frames_dir, frame['file_name'])
        top_pred_gps, top_pred_prob = model.predict(image_path, top_k=1)
        prob = top_pred_prob[0].item()
        # `not prob > min_prob` (rather than `prob <= min_prob`) also drops
        # NaN probabilities, exactly like the `if prob > ...` guard would.
        if not prob > min_prob:
            continue

        # Copy the existing frame info and attach the predicted location.
        predict_locations.append({
            **frame,
            "latitude": round(top_pred_gps[0][0].item(), 6),
            "longitude": round(top_pred_gps[0][1].item(), 6),
            "probability": prob
        })

    # Persist the kept predictions as JSON.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(predict_locations, f, indent=4)

    return predict_locations

In [64]:
# Predict locations for every saved frame; also writes predict_locations.json.
predict_locations = frames_to_location(frame_info, model)

In [59]:
# Number of frames whose top prediction passed the probability threshold.
# NOTE(review): this cell's execution count (59) is lower than the In [64] cell
# that computes predict_locations, so the displayed value may come from an
# earlier run — re-run top to bottom to confirm.
len(predict_locations)

7

In [60]:
# Total number of frames listed in frame_info.json, for comparison with the
# number of confident location predictions.
len(frame_info)

214