In [None]:
import os
import shutil
import cv2  
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd

### Select only one clip from each videoID
遍历数据集中的视频片段，并在每个VideoID类别下只挑选一个片段，存入指定路径用于下一步查重

In [None]:
# Folder whose entries are scanned for candidate clips.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
src_path = r"E:\A - 暴力事件检测\Datasets\Youtube-crime\A1 - 正常"
# Destination that receives one copied clip per VideoID.
dst_path = "Video_Clips"

In [None]:
def extract_VideoID(video_name):
    """Return the VideoID portion of a clip file name.

    The VideoID is everything that precedes the last underscore of
    ``video_name``. A name that contains no underscore yields an empty
    string.
    """
    # rpartition splits on the LAST underscore; the head is the VideoID
    video_id, _separator, _clip_suffix = video_name.rpartition('_')
    return video_id

In [None]:
# Make sure the destination is a real directory: if it were missing,
# shutil.copy would treat dst_path as a *file name* and every copied clip
# would silently overwrite the previous one.
os.makedirs(dst_path, exist_ok=True)
# VideoIDs that already have a selected clip (kept as a list for later cells)
saved_video = []
# Set mirror of saved_video so the membership test is O(1) instead of O(n)
seen_video_ids = set()
# Sorted listing makes the clip chosen for each VideoID reproducible across runs
for video_name in tqdm(sorted(os.listdir(src_path))):
    video_path = os.path.join(src_path, video_name)
    # shutil.copy only handles regular files; ignore sub-directories
    if not os.path.isfile(video_path):
        continue
    VideoID = extract_VideoID(video_name)
    # Only the first clip encountered for a VideoID is kept
    if VideoID in seen_video_ids:
        continue
    shutil.copy(video_path, dst_path)
    seen_video_ids.add(VideoID)
    saved_video.append(VideoID)

In [None]:
# Report how many VideoIDs were recorded in saved_video by the selection step
print("Number of source videos: ", len(saved_video))

### Calculate the fingerprint of each video by histogram
将每个视频的帧转换为灰度图并计算其灰度直方图（帧间平均），归一化后展开为一维向量作为视频指纹

- 自定义函数

In [None]:
def get_all_frames(video_path):
    """Load every decodable frame of a video as a list of grayscale images.

    Parameters
    ----------
    video_path : str
        Path of the video file opened with ``cv2.VideoCapture``.

    Returns
    -------
    list
        Grayscale frames in reading order. The list is empty when the video
        cannot be opened or no frame can be decoded.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        # Read until the decoder reports failure rather than trusting the
        # reported frame count: that count can be larger than the number of
        # frames actually decodable, in which case read() returns no frame
        # and cvtColor would crash on it.
        while True:
            ok, frame = cap.read()
            if not ok or frame is None:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    finally:
        # Always free the capture handle, even if decoding/conversion raised
        cap.release()

    return frames


def Normalize(data):
    """Min-max scale ``data`` so that its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    data : array-like
        Non-empty numeric values.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``data``. When all values are equal there
        is no range to scale by, so an all-zero array is returned instead of
        the NaNs a 0/0 division would produce.
    """
    data = np.asarray(data)
    mn = np.min(data)
    span = np.max(data) - mn
    # Constant input: the numerator is already all zeros, so dividing by 1
    # yields zeros while keeping the usual true-division result dtype.
    if span == 0:
        span = 1

    return (data - mn) / span


def calculate_finger_print(frames):
    """Build a video fingerprint from the gray-level histogram of its frames.

    Parameters
    ----------
    frames : list
        Non-empty list of single-channel (grayscale) images.

    Returns
    -------
    numpy.ndarray
        1-D vector of 256 values: the histogram accumulated over all frames,
        min-max normalized with ``Normalize``.

    Raises
    ------
    ValueError
        If ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError("calculate_finger_print needs at least one frame")

    # float64 accumulator so large per-bin pixel counts stay exact
    hist = np.zeros((256, 1), dtype=np.float64)
    for frame in frames:
        # One call per frame: when several images are passed together,
        # cv2.calcHist treats them as additional channels, so channels=[0]
        # would only ever look at the first frame.
        # The upper range bound is exclusive, hence 256 (not 255) so that
        # pixels of value 255 are counted.
        hist += cv2.calcHist([frame], [0], None, [256], [0, 256])
    hist = Normalize(hist)
    finger_print = np.reshape(hist, [-1])

    return finger_print

- 配置文件路径

In [None]:
# Folder that is listed below and joined with each clip name when loading frames.
# NOTE(review): this rebinds the src_path used by the selection step earlier in
# the notebook; re-running that earlier loop afterwards would read from this
# folder instead — confirm this reuse is intended.
src_path ="Video_Clips"
# Entry names found in src_path, in whatever order os.listdir returns them
video_list = os.listdir(src_path)

- 计算视频指纹

In [None]:
# Mapping VideoID -> fingerprint vector of the corresponding clip
Video_FPs = {}
for video_name in tqdm(video_list):
    # Locate the clip inside the source folder
    video_path = os.path.join(src_path, video_name)
    # Decode the clip and turn its frames into a fingerprint
    finger_print = calculate_finger_print(get_all_frames(video_path))
    # Store the fingerprint under the clip's VideoID
    Video_FPs[extract_VideoID(video_name)] = finger_print

### Calculate the similarity of videos
计算所有的VideoID中，两两之间的余弦相似度

- 定义余弦相似度计算

In [None]:
def cosine_similarity(vector_1, vector_2):
    """Calculate the cosine similarity between two vectors.

    Parameters
    ----------
    vector_1, vector_2 : array-like
        1-D vectors of equal length.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the cosine is undefined; 0.0 is returned in that case instead
        of NaN so downstream sorting is not polluted.
    """
    norm_product = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
    # Guard against 0/0 for all-zero vectors
    if norm_product == 0:
        return 0.0
    return np.dot(vector_1, vector_2) / norm_product

- 载入待比较的VideoID

In [None]:
# VideoID of every entry in video_list, in the same order
VIDs = list(map(extract_VideoID, video_list))

- 计算两两之间的相似度

In [None]:
# Parallel result columns: first VideoID, second VideoID, their similarity
VID_1, VID_2, VSs = [], [], []
# Visit every unordered pair once: each video against the ones listed after it
for i, video_1 in enumerate(VIDs):
    for video_2 in VIDs[i + 1:]:
        # Cosine similarity between the two stored fingerprints
        similarity = cosine_similarity(Video_FPs[video_1], Video_FPs[video_2])
        # Record the pair together with its score
        VID_1.append(video_1)
        VID_2.append(video_2)
        VSs.append(similarity)

- 转换为DF格式并存入CSV文件

In [None]:
# Build the pair table, most similar pairs first, with a fresh 0..n-1 index.
# The second column was previously misspelled 'VIdeo_2'; it is now 'Video_2'
# so it is consistent with 'Video_1'.
df = (
    pd.DataFrame({'Video_1': VID_1, 'Video_2': VID_2, 'Video_Similarity': VSs})
    .sort_values(by='Video_Similarity', ascending=False)
    .reset_index(drop=True)
)
# Persist the ranked pairs for manual duplicate inspection
df.to_csv('Similarity.csv', index=False, encoding='utf-8')