## Summary
* No classification error found.
* Some videos are duplicated in the 'noFights' folder; we remove the duplicates from the dataset.
* Some properties (unrelated to the presence of a fight) differ between the fight videos and the others: image height, frames per second, frame count, RGB means and variances, and encoding algorithm. We will need to be careful to avoid a _[Clever Hans effect](https://en.wikipedia.org/wiki/Clever_Hans#The_Clever_Hans_effect)_.
* Some videos are very similar (e.g. consecutive clips of a fight). If a model is trained on some clips from a fight and evaluated on other clips of the same fight, it might output correct classification based on the people's clothes. This would not generalize to other videos. We will design train/val splits avoiding this.

## Todo

* Import project_root instead of hardcoding it
* Clean up the "properties" part

In [None]:
# List the videos

import os
from pathlib import Path

from fight_classifier import DATASET_DIR
def _names_and_paths(directory):
    """Return the entry names found in `directory` and their full paths."""
    entry_names = os.listdir(directory)
    return entry_names, [directory / entry_name for entry_name in entry_names]


# One sub-folder per class under the dataset root.
fights_dir = DATASET_DIR / "fights"
no_fights_dir = DATASET_DIR / "noFights"

fights_names, fights_paths = _names_and_paths(fights_dir)
no_fights_names, no_fights_paths = _names_and_paths(no_fights_dir)

print(f'{len(fights_paths)} fight videos, {len(no_fights_paths)} no-fight videos')

In [None]:
# Detect duplicates

from collections import defaultdict
from typing import Any, Dict, List, Mapping

import skvideo.io
import tqdm

# Hash of the decoded frames -> every path whose content produced that hash.
# The keys come from the builtin `hash()` on bytes, which is salted per
# interpreter process: they are only meaningful within the current session.
hash_to_paths: Dict[int, List[str]] = defaultdict(list)


def _register_videos(paths, seen: Dict[int, List[str]]) -> list:
    """Record every video of `paths` in `seen` and return the new ones.

    Args:
        paths: Paths of the videos to read, in the order they should be
            considered.
        seen: Mapping from content hash to the paths already recorded for
            that content. It is updated in place with every path of `paths`.

    Returns:
        The paths whose content hash was not yet a key of `seen` when they
        were read, i.e. the first occurrence of each distinct video.
    """
    first_occurrences = []
    for path in tqdm.tqdm(paths):
        path_str = str(path)
        # The hash is taken over the decoded frame array, not the file bytes.
        video_hash = hash(skvideo.io.vread(path_str).tobytes())
        if video_hash not in seen:
            first_occurrences.append(path)
        seen[video_hash].append(path_str)
    return first_occurrences


# The fight folder is processed first, so a no-fight video whose content was
# already seen among the fights is not kept as a unique no-fight video.
unique_fights_paths = _register_videos(fights_paths, hash_to_paths)
unique_no_fights_paths = _register_videos(no_fights_paths, hash_to_paths)

# Content hashes shared by more than one file.
duplicates = {
    video_hash: paths
    for video_hash, paths in hash_to_paths.items()
    if len(paths) > 1
}

print(f'Duplicates, {duplicates}\n')
print(
    f'{len(unique_fights_paths)} unique fight videos, '
    f'{len(unique_no_fights_paths)} unique no-fight videos')

In [None]:
import re

import pandas as pd
# File names carry a numeric index ("newfi<N>" for fights, "<N>" otherwise).
fights_re = re.compile('.*/newfi([0-9]+)')
no_fights_re = re.compile('.*/([0-9]+)')


def _numeric_index(video_path, pattern):
    """Return the integer captured by `pattern` in the path of a video."""
    return int(pattern.match(str(video_path)).group(1))


# Ordering the videos by index makes it easier to fill in the
# "coarse_category" and "fine_category" columns by hand afterwards.
sorted_fights_paths = sorted(
    unique_fights_paths, key=lambda p: _numeric_index(p, fights_re))
sorted_no_fights_paths = sorted(
    unique_no_fights_paths, key=lambda p: _numeric_index(p, no_fights_re))

# One row per unique video: fights first, then the others. The category
# columns are left empty on purpose.
videos_dicts = [
    {
        'video_path': str(video_path.relative_to(DATASET_DIR)),
        'is_fight': is_fight,
        'coarse_category': None,
        'fine_category': None,
    }
    for ordered_paths, is_fight in (
        (sorted_fights_paths, True),
        (sorted_no_fights_paths, False),
    )
    for video_path in ordered_paths
]

videos_df = pd.DataFrame(videos_dicts)
videos_df.to_csv(str(DATASET_DIR / 'empty_videos.csv'))
videos_df

In [None]:
# Get a pandas.DataFrame of the videos with some properties


import cv2
import numpy as np
import skvideo.io
import tqdm

def video_path_to_np_old(video_path) -> Dict[str, Any]:
    """Return the basic properties OpenCV reports for a video file.

    Despite its name, this function returns a plain dictionary, not a numpy
    array; the name is kept so existing callers keep working.

    Args:
        video_path: Location of the video; converted with `str()` before
            being handed to `cv2.VideoCapture`.

    Returns:
        A dictionary mapping a property label to the value returned by
        `cv2.VideoCapture.get` for that property.
    """
    # Output label -> OpenCV property identifier. The first two labels carry
    # a "CV_" prefix that the others lack; they are kept as-is because they
    # end up as column names downstream.
    property_ids = {
        "CV_CAP_PROP_FRAME_WIDTH": cv2.CAP_PROP_FRAME_WIDTH,
        "CV_CAP_PROP_FRAME_HEIGHT": cv2.CAP_PROP_FRAME_HEIGHT,
        "CAP_PROP_FPS": cv2.CAP_PROP_FPS,
        "CAP_PROP_POS_MSEC": cv2.CAP_PROP_POS_MSEC,
        "CAP_PROP_FRAME_COUNT": cv2.CAP_PROP_FRAME_COUNT,
        "CAP_PROP_BRIGHTNESS": cv2.CAP_PROP_BRIGHTNESS,
        "CAP_PROP_CONTRAST": cv2.CAP_PROP_CONTRAST,
        "CAP_PROP_SATURATION": cv2.CAP_PROP_SATURATION,
        "CAP_PROP_HUE": cv2.CAP_PROP_HUE,
        "CAP_PROP_GAIN": cv2.CAP_PROP_GAIN,
        "CAP_PROP_CONVERT_RGB": cv2.CAP_PROP_CONVERT_RGB,
    }
    capture = cv2.VideoCapture(str(video_path))
    try:
        return {
            label: capture.get(property_id)
            for label, property_id in property_ids.items()
        }
    finally:
        # Release the capture even if one of the queries raises.
        capture.release()

# Properties of each unique video
prop_dicts: List[Dict[str, Any]] = []

# Only the de-duplicated path lists are scanned, so a video that exists in
# several copies contributes exactly once to the statistics.
paths_with_gt = (
    [(path, True) for path in unique_fights_paths]
    + [(path, False) for path in unique_no_fights_paths]
)
for path, is_fight in tqdm.tqdm(paths_with_gt):
    prop_dict = video_path_to_np_old(path)
    prop_dict['is_fight'] = is_fight

    video = skvideo.io.vread(str(path))

    # Basic statistics on the video RGB values: everything but the last axis
    # is reduced, and the unpacking requires exactly three channels.
    # assumes vread returns (frames, height, width, RGB) -- TODO confirm
    r_mean, g_mean, b_mean = np.mean(video, axis=(0, 1, 2))
    r_std, g_std, b_std = np.std(video, axis=(0, 1, 2))
    prop_dict['r_mean'] = r_mean
    prop_dict['g_mean'] = g_mean
    prop_dict['b_mean'] = b_mean
    prop_dict['r_std'] = r_std
    prop_dict['g_std'] = g_std
    prop_dict['b_std'] = b_std
    # Brightness is the plain average of the three channel means.
    prop_dict['brightness'] = np.mean((r_mean, g_mean, b_mean))

    prop_dicts.append(prop_dict)

In [None]:
# Number of distinct videos: one key per content hash in `hash_to_paths`.
len(hash_to_paths)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

props_df = pd.DataFrame(prop_dicts)

# A column is plotted unless it is the label itself or holds a single
# repeated value; in both cases a per-class histogram would show nothing.
informative_columns = []
for column in props_df.columns:
    is_constant = len(props_df[column].unique()) == 1
    if column != 'is_fight' and not is_constant:
        informative_columns.append(column)
        print("\n", column)
        # One histogram per value of the label.
        props_df[column].hist(by=props_df['is_fight'])
        plt.show()
print("informative_columns ", informative_columns)

In [None]:
from pathlib import Path
# Scratch path whose file name contains a "{}" placeholder.
p = Path("/try") / "this_{}.png"

In [None]:
# Path objects have no `format` method: fill the placeholder on the string.
str(p).format('hey')

In [None]:
# List the attribute and method names available on the Path object.
dir(p)

In [None]:
# `Path.replace` renames a file on disk and takes a single target, so the
# textual substitution has to be done on the string form of the path.
str(p).replace('{}', '0')