In [3]:
import sqlite3
import json

# Open the SQLite database that stores the annotations and create a cursor.
# Both names are notebook-level globals: get_relevant_frames() below reads
# `cursor`, and the last line of this cell closes `conn`.
# NOTE(review): the path is absolute and machine-specific (drive H:), so this
# cell only runs where that exact file exists. sqlite3.connect() does not fail
# on a missing file (it creates an empty database), so a wrong path would show
# up later as a missing `annotations` table rather than here.
conn = sqlite3.connect('H:\\xai-unibuc\\xAI_deepfake\\my_database3.db')
cursor = conn.cursor()

# Define a function to get the relevant frames from the 'click_locations' column
def get_relevant_frames(limit=20, train_size=10, db_cursor=None):
    """Fetch annotated videos and split them into train and test frame lists.

    Reads up to ``limit`` rows of ``(video_name, click_locations)`` from the
    ``annotations`` table. ``click_locations`` is parsed as JSON and the keys
    of the resulting object are used as that video's relevant frames (kept
    exactly as stored, i.e. as strings).

    Args:
        limit: Maximum number of rows to read from the table.
        train_size: Rows whose position in the result is below this value go
            to the training split; all later rows go to the test split. The
            default of 10 reproduces the previous fixed behaviour.
        db_cursor: Cursor to run the query on. When omitted, the notebook-level
            ``cursor`` is used, as before.

    Returns:
        A ``(train_frames, test_frames)`` tuple of dicts, each mapping a video
        name to the list of frame keys found for it.

    Raises:
        sqlite3.Error: If the query itself fails (not handled here).
    """
    active_cursor = cursor if db_cursor is None else db_cursor

    # NOTE(review): the query has no ORDER BY, so which rows end up in which
    # split depends on the order in which SQLite happens to return them.
    active_cursor.execute("SELECT video_name, click_locations FROM annotations LIMIT ?", (limit,))
    rows = active_cursor.fetchall()

    train_frames = {}
    test_frames = {}

    for idx, (video_name, click_locations) in enumerate(rows):
        # A NULL or malformed cell should not abort the whole split; skip the
        # row and say so, matching what the later cell of this notebook does.
        try:
            frame_data = json.loads(click_locations)
        except (TypeError, ValueError) as e:
            print(f"JSON decoding error for video {video_name}: {e}")
            continue
        if not isinstance(frame_data, dict):
            print(f"Invalid data format for video {video_name}. Expected a JSON object.")
            continue

        # The split is positional: a skipped row still occupies its index.
        target = train_frames if idx < train_size else test_frames
        target[video_name] = list(frame_data.keys())

    return train_frames, test_frames

# Query the annotations and split them into the train / test dictionaries.
train_frames, test_frames = get_relevant_frames()

# Print each split under its own heading, one line per video.
for heading, frames_by_video in (("Train Frames:", train_frames), ("\nTest Frames:", test_frames)):
    print(heading)
    for video, frames in frames_by_video.items():
        print(f"Video: {video}, Frames: {frames}")

# Nothing below uses the cursor any more, so release the connection.
conn.close()


Train Frames:
Video: bxfwiuvafo.mp4, Frames: ['273', '286', '298']
Video: hipzzheqlg.mp4, Frames: ['133', '141', '148', '154']
Video: mibbivalty.mp4, Frames: ['45', '50', '55', '72', '78', '99', '110', '118', '131', '143', '156', '166', '182', '225', '235', '246', '277', '289', '300']
Video: azseubmxrc.mp4, Frames: ['57', '63', '70', '78', '86', '93', '109', '118', '147']
Video: brvsnraikz.mp4, Frames: ['110', '115', '121', '131', '143', '189', '259']
Video: bzqkplrsnt.mp4, Frames: ['151', '164', '175', '189']
Video: fkkmxwjkxb.mp4, Frames: ['34', '41', '49', '54', '62', '73', '84', '137', '146', '154', '162']
Video: ekmwbuaedp.mp4, Frames: ['27', '32', '39', '45', '54', '61', '73', '81', '89', '97', '105', '112', '124', '132']
Video: airktntzqp.mp4, Frames: ['200', '213', '228', '236', '248']
Video: jgijuusdev.mp4, Frames: ['53', '59', '66', '74', '87', '97', '104', '112', '123', '130', '142']

Test Frames:
Video: dsnerafcqv.mp4, Frames: ['61', '75', '88', '98', '111', '119']
Video: g

In [None]:
import sqlite3
import json
import torch
from PIL import Image
import numpy as np
import os
import cv2
from torch.nn.functional import cosine_similarity
import torchvision.models as models
import torchvision.transforms as transforms
from pathlib import Path

# All locations are derived from the kernel's current working directory: the
# database and the video folder are expected one level above it.
base_dir = Path.cwd()
database_path = base_dir.parent / 'my_database3.db'
video_dir = base_dir.parent / 'deepfake_dataset_challenge' / 'dfdc_train_part_47'

# Check for the file first: sqlite3.connect() would otherwise create an empty
# database at a mistyped path instead of failing.
if not database_path.exists():
    raise FileNotFoundError(f"Database file not found at {database_path}")

try:
    conn = sqlite3.connect(str(database_path))
except sqlite3.Error as e:
    raise sqlite3.Error(f"An error occurred while connecting to the database: {e}") from e

try:
    cursor = conn.cursor()
    # Check if the 'annotations' table exists
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='annotations';")
    if not cursor.fetchone():
        raise sqlite3.OperationalError("Table 'annotations' does not exist in the database.")
    print(f"Connected to database at {database_path}")
except sqlite3.Error as e:
    # Do not leave a half-initialised connection open when validation fails;
    # re-running the cell would otherwise stack up open handles on the file.
    conn.close()
    raise sqlite3.Error(f"An error occurred while connecting to the database: {e}") from e

# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Build a pretrained ResNet-50, switch it to inference mode and move it to the
# chosen device. Any failure is re-raised as a RuntimeError.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` - confirm against the installed version before changing it.
try:
    model = models.resnet50(pretrained=True)
    model = model.eval().to(device)
except Exception as e:
    raise RuntimeError(f"Failed to load the ResNet model: {e}")

# Frame preprocessing applied before the forward pass: resize, centre crop to
# 224 pixels, convert to a tensor, then normalise each channel.
preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
preprocess = transforms.Compose(preprocess_steps)

# Which convolutional layer the features are taken from.
layer_identifier = 'last'  # Options: 'first', 'last', integer index

# Every Conv2d module of the model as (name, module), in named_modules() order.
conv_layers = [
    (module_name, submodule)
    for module_name, submodule in model.named_modules()
    if isinstance(submodule, torch.nn.Conv2d)
]

def get_layer_by_identifier(identifier):
    """Resolve a layer identifier to an entry of the global ``conv_layers``.

    Args:
        identifier: ``'first'``, ``'last'``, or an integer position in
            ``conv_layers`` (0 up to ``len(conv_layers) - 1``).

    Returns:
        ``(module, name)`` for the selected layer, or ``(None, None)`` after
        printing a message when the identifier is not recognised or the
        integer is out of range.
    """
    if identifier == 'first':
        position = 0
    elif identifier == 'last':
        position = -1
    elif isinstance(identifier, int):
        if not (0 <= identifier < len(conv_layers)):
            print(f"Invalid layer index: {identifier}. Must be between 0 and {len(conv_layers)-1}.")
            return None, None
        position = identifier
    else:
        print(f"Invalid layer identifier: {identifier}")
        return None, None

    # Entries are stored as (name, module); callers expect (module, name).
    layer_name, layer_module = conv_layers[position]
    return layer_module, layer_name

# Define a function to get the relevant frames from 'click_locations' 
def get_relevant_frames(limit=20, train_size=10, db_cursor=None):
    """Fetch annotated videos and split them into train and test frame lists.

    Reads up to ``limit`` rows of ``(video_name, click_locations)`` from the
    ``annotations`` table. Each ``click_locations`` value is parsed as JSON;
    when it is a JSON object its keys become the video's relevant frames
    (kept as stored, i.e. as strings). Rows that cannot be parsed or are not
    JSON objects are reported and skipped.

    Args:
        limit: Maximum number of rows to read from the table.
        train_size: Rows whose position in the result is below this value go
            to the training split; all later rows go to the test split. The
            default of 10 reproduces the previous fixed behaviour.
        db_cursor: Cursor to run the query on. When omitted, the notebook-level
            ``cursor`` is used, as before.

    Returns:
        A ``(train_frames, test_frames)`` tuple of dicts mapping video name to
        a list of frame keys. Both dicts are empty when the query fails or
        returns no rows.
    """
    print("Fetching relevant frames from the database...")
    active_cursor = cursor if db_cursor is None else db_cursor
    try:
        # NOTE(review): there is no ORDER BY, so the split depends on the
        # order in which SQLite happens to return the rows.
        active_cursor.execute("SELECT video_name, click_locations FROM annotations LIMIT ?", (limit,))
        rows = active_cursor.fetchall()
    except sqlite3.Error as e:
        print(f"An error occurred while fetching data from the database: {e}")
        return {}, {}
    if not rows:
        print("No data fetched from the database.")
        return {}, {}

    train_frames = {}
    test_frames = {}

    for idx, (video_name, click_locations) in enumerate(rows):
        try:
            frame_data = json.loads(click_locations)
            if not isinstance(frame_data, dict):
                print(f"Invalid data format for video {video_name}. Expected a JSON object.")
                continue
            relevant_frames = list(frame_data.keys())
        except json.JSONDecodeError as e:
            print(f"JSON decoding error for video {video_name}: {e}")
            continue
        except Exception as e:
            # e.g. a NULL cell, which json.loads rejects with a TypeError.
            print(f"Unexpected error processing video {video_name}: {e}")
            continue

        # The split is positional: a skipped row still occupies its index.
        if idx < train_size:
            train_frames[video_name] = relevant_frames
        else:
            test_frames[video_name] = relevant_frames

    if not train_frames:
        print("No training frames found.")
    if not test_frames:
        print("No testing frames found.")

    print(f"Train frames: {train_frames}")
    print(f"Test frames: {test_frames}")
    return train_frames, test_frames

# Function to extract frames from video using frame indices and encode them using ResNet
def extract_and_encode_frame(video_path, frame_number, layer_identifier):
    """Read one frame from a video and return its normalised conv features.

    The frame is decoded with OpenCV, converted from BGR to RGB, passed
    through the notebook-level ``preprocess`` pipeline and fed to the
    notebook-level ``model``. A forward hook on the layer selected by
    ``get_layer_by_identifier(layer_identifier)`` captures that layer's
    output, which is flattened per sample and divided by its L2 norm.

    Args:
        video_path: Path of the video file (converted with ``str``).
        frame_number: Frame index to read; anything ``int()`` accepts.
        layer_identifier: Passed to ``get_layer_by_identifier``.

    Returns:
        A tensor of shape ``(1, N)`` on ``device`` holding the normalised
        features, or ``None`` (after printing the reason) when the video
        cannot be opened, the frame number is invalid, the frame cannot be
        read, the layer cannot be resolved, the hook captured nothing, or the
        feature norm is zero.

    Raises:
        Whatever preprocessing or the model forward pass raises. The video
        capture and the forward hook are cleaned up before it propagates.
    """
    print(f"Extracting frame {frame_number} from video: {video_path}")
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            print(f"Failed to open video file: {video_path}")
            return None

        try:
            frame_number = int(frame_number)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects.
            print(f"Invalid frame number: {frame_number}")
            return None

        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
    finally:
        # The capture is only needed to decode the frame; release it on every
        # path, including unexpected exceptions.
        cap.release()

    if not ret:
        print(f"Failed to read frame {frame_number} from {video_path}")
        return None
    print(f"Frame {frame_number} successfully read from {video_path}")

    # Resolve the target layer before doing any tensor work.
    layer, layer_name = get_layer_by_identifier(layer_identifier)
    if layer is None:
        return None

    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    image = preprocess(image).unsqueeze(0).to(device)

    intermediate_output = []

    def hook_fn(module, input, output):
        intermediate_output.append(output)

    handle = layer.register_forward_hook(hook_fn)
    try:
        with torch.no_grad():
            model(image)
    finally:
        # Always detach the hook: a hook left behind after a failed forward
        # pass would keep firing on every later call that uses this layer.
        handle.remove()

    if not intermediate_output:
        print(f"Failed to get features from layer '{layer_name}'")
        return None

    # Flatten everything except the batch dimension, then L2-normalise.
    image_features = intermediate_output[0]
    image_features = image_features.view(image_features.size(0), -1)
    norm = image_features.norm(dim=-1, keepdim=True)
    if torch.any(norm == 0):
        print(f"Zero norm encountered in features for frame {frame_number} in video {video_path}")
        return None
    return image_features / norm

def display_frames(video_path1, frame_num1, video_path2, frame_num2, window_title):
    """Show one frame from each of two videos side by side in an OpenCV window.

    Both frames are resized to 640x360, stacked horizontally and shown with
    ``cv2.imshow``. The call blocks in ``cv2.waitKey(0)`` until a key is
    pressed and then closes all OpenCV windows.

    Args:
        video_path1: Path of the first video (shown on the left).
        frame_num1: Frame index in the first video; anything ``int()`` accepts.
        video_path2: Path of the second video (shown on the right).
        frame_num2: Frame index in the second video.
        window_title: Title of the display window.

    Returns:
        None. If a video cannot be opened, a frame number is invalid or a
        frame cannot be read, a message is printed and nothing is shown.
    """
    cap1 = cv2.VideoCapture(str(video_path1))
    cap2 = cv2.VideoCapture(str(video_path2))
    try:
        if not cap1.isOpened():
            print(f"Failed to open video file: {video_path1}")
            return
        if not cap2.isOpened():
            print(f"Failed to open video file: {video_path2}")
            return

        try:
            frame_num1 = int(frame_num1)
            frame_num2 = int(frame_num2)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects.
            print(f"Invalid frame numbers: {frame_num1}, {frame_num2}")
            return

        # Seek to and read the requested frame of the first video.
        cap1.set(cv2.CAP_PROP_POS_FRAMES, frame_num1)
        ret1, frame1 = cap1.read()
        if not ret1:
            print(f"Failed to read frame {frame_num1} from {video_path1}")
            return

        # Seek to and read the requested frame of the second video.
        cap2.set(cv2.CAP_PROP_POS_FRAMES, frame_num2)
        ret2, frame2 = cap2.read()
        if not ret2:
            print(f"Failed to read frame {frame_num2} from {video_path2}")
            return
    finally:
        # Both captures are released on every exit path, including the early
        # returns taken when only one of the two videos could be opened.
        cap1.release()
        cap2.release()

    # Bring both frames to a common size so they can be stacked horizontally.
    frame1 = cv2.resize(frame1, (640, 360))
    frame2 = cv2.resize(frame2, (640, 360))
    concatenated_frame = np.hstack((frame1, frame2))
    cv2.imshow(window_title, concatenated_frame)
    try:
        cv2.waitKey(0)
    finally:
        cv2.destroyAllWindows()

train_frames, test_frames = get_relevant_frames()

# Embeddings of the training split, keyed by (video name, frame index).
train_embeddings = {}

for video, frames in train_frames.items():
    video_path = video_dir / video
    if not video_path.exists():
        print(f"Video file not found: {video_path}")
        continue

    print(f"Processing video: {video_path}")
    for frame in frames:
        # Frame ids come from JSON keys, so they arrive as strings.
        try:
            frame_number = int(frame)
        except ValueError:
            print(f"Invalid frame number: {frame}")
            continue

        embedding = extract_and_encode_frame(video_path, frame_number, layer_identifier)
        if embedding is None:
            print(f"Skipped frame {frame_number} for video {video}")
            continue

        train_embeddings[(video, frame_number)] = embedding
        print(f"Encoded frame {frame_number} for video {video}")

# Running extremes over every (test frame, train frame) pair. The sentinels
# are +/- infinity so that any real similarity replaces them; a finite start
# value such as -1 is itself a possible cosine similarity and would stop a
# pair at exactly that value from ever being recorded as the maximum.
highest_similarity = float('-inf')
lowest_similarity = float('inf')
highest_pair = None
lowest_pair = None

for test_video, test_frames_list in test_frames.items():
    test_video_path = video_dir / test_video
    if not test_video_path.exists():
        print(f"Test video file not found: {test_video_path}")
        continue

    print(f"Processing test video: {test_video_path}")
    for test_frame in test_frames_list:
        # Frame ids come from JSON keys, so they arrive as strings.
        try:
            test_frame_number = int(test_frame)
        except ValueError:
            print(f"Invalid test frame number: {test_frame}")
            continue

        test_embedding = extract_and_encode_frame(test_video_path, test_frame_number, layer_identifier)
        if test_embedding is None:
            print(f"Skipped test frame {test_frame_number} for video {test_video}")
            continue

        # The test embedding is the same for every training frame, so move it
        # to the device once instead of once per comparison.
        test_embedding = test_embedding.to(device)

        for (train_video, train_frame), train_embedding in train_embeddings.items():
            train_embedding = train_embedding.to(device)
            if test_embedding.shape != train_embedding.shape:
                print("Shape mismatch between test embedding and train embedding.")
                continue
            try:
                similarity = cosine_similarity(test_embedding, train_embedding).item()
            except Exception as e:
                print(f"Error computing similarity: {e}")
                continue

            # Each pair is stored as ((test video, frame), (train video, frame)).
            if similarity > highest_similarity:
                highest_similarity = similarity
                highest_pair = ((test_video, test_frame_number), (train_video, train_frame))
            if similarity < lowest_similarity:
                lowest_similarity = similarity
                lowest_pair = ((test_video, test_frame_number), (train_video, train_frame))

# Report the most and least similar (test, train) frame pairs, then show each
# pair side by side. Both pairs must exist for anything to be reported.
if not (highest_pair and lowest_pair):
    print("Could not find most and least similar pairs.")
else:
    # Each pair is ((test video, test frame), (train video, train frame)).
    (best_test_video, best_test_frame), (best_train_video, best_train_frame) = highest_pair
    (worst_test_video, worst_test_frame), (worst_train_video, worst_train_frame) = lowest_pair

    print("Most Similar Pair:")
    print(f"Test Video: {best_test_video} at frame {best_test_frame}")
    print(f"Train Video: {best_train_video} at frame {best_train_frame}")
    print(f"Cosine Similarity (Highest): {highest_similarity:.4f}\n")

    print("Least Similar Pair:")
    print(f"Test Video: {worst_test_video} at frame {worst_test_frame}")
    print(f"Train Video: {worst_train_video} at frame {worst_train_frame}")
    print(f"Cosine Similarity (Lowest): {lowest_similarity:.4f}\n")

    test_video_path_high = video_dir / best_test_video
    train_video_path_high = video_dir / best_train_video
    test_video_path_low = video_dir / worst_test_video
    train_video_path_low = video_dir / worst_train_video

    # Each call blocks until a key is pressed in the OpenCV window.
    display_frames(
        test_video_path_high, best_test_frame,
        train_video_path_high, best_train_frame,
        'Most Similar Frames',
    )
    display_frames(
        test_video_path_low, worst_test_frame,
        train_video_path_low, worst_train_frame,
        'Least Similar Frames',
    )

# The database is no longer needed once the report has been produced.
conn.close()


Connected to database at h:\xai-unibuc\xAI_deepfake\my_database3.db
Fetching relevant frames from the database...
Train frames: {'bxfwiuvafo.mp4': ['273', '286', '298'], 'hipzzheqlg.mp4': ['133', '141', '148', '154'], 'mibbivalty.mp4': ['45', '50', '55', '72', '78', '99', '110', '118', '131', '143', '156', '166', '182', '225', '235', '246', '277', '289', '300'], 'azseubmxrc.mp4': ['57', '63', '70', '78', '86', '93', '109', '118', '147'], 'brvsnraikz.mp4': ['110', '115', '121', '131', '143', '189', '259'], 'bzqkplrsnt.mp4': ['151', '164', '175', '189'], 'fkkmxwjkxb.mp4': ['34', '41', '49', '54', '62', '73', '84', '137', '146', '154', '162'], 'ekmwbuaedp.mp4': ['27', '32', '39', '45', '54', '61', '73', '81', '89', '97', '105', '112', '124', '132'], 'airktntzqp.mp4': ['200', '213', '228', '236', '248'], 'jgijuusdev.mp4': ['53', '59', '66', '74', '87', '97', '104', '112', '123', '130', '142']}
Test frames: {'dsnerafcqv.mp4': ['61', '75', '88', '98', '111', '119'], 'gwnaxtndii.mp4': ['288',

In [2]:
from PIL import Image
import requests
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor
import numpy as np

# Hugging Face model id of the vision-language model loaded below.
model_id = "microsoft/Phi-3-vision-128k-instruct"

# Load the processor and the model for `model_id`.
# NOTE(review): trust_remote_code=True executes Python code shipped with the
# model repository - only use it with a repository you trust.
# NOTE(review): the output recorded for this cell is an AttributeError about
# `Phi3VImageProcessor`; presumably the installed transformers version does
# not match what the model's remote code expects - TODO confirm.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# `.cuda()` moves the model to the GPU, so this line requires a CUDA device.
# NOTE(review): this rebinds the global name `model`, which the ResNet cell's
# extract_and_encode_frame() also calls; running both cells in one kernel makes
# the result depend on execution order.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto").cuda()

# Chat-template fragments used to build prompts for this model.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"


AttributeError: module transformers has no attribute Phi3VImageProcessor

In [None]:
!pip install transformers -U
!pip install datasets -U
!pip install torch -U

In [None]:
!pip install flash-attn --no-build-isolation