In [1]:
import pickle, glob, os, cv2
import face_recognition
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

In [4]:
with open('PL_1_0_250_tracks.pkl', 'rb') as pickle_file:
    tracks = pickle.load(pickle_file)
with open('PL_1_0_250_scores.pkl', 'rb') as pickle_file:
    scores = pickle.load(pickle_file)

In [48]:
tracks

[{'track': {'frame': array([ 94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
          107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
          120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
          133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
          146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
          159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
          172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
          185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
          198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210,
          211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
          224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236,
          237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
          250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262,
       

In [2]:
flist = glob.glob(os.path.join("../TalkNet-ASD/Videos/PL_1_0_250/pyframes/", "*.jpg"))
flist.sort()

In [5]:
faces = {}
for tidx, track in enumerate(tracks):
    score = scores[tidx]
    for fidx, frame in enumerate(track["track"]["frame"].tolist()):
        s = score[max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)]  # average smoothing
        s = float(np.mean(s))
        if s > 0:
            track_data = {
                "score": s,
                "s": track["proc_track"]["s"][fidx],
                "x": track["proc_track"]["x"][fidx],
                "y": track["proc_track"]["y"][fidx],
                "frame": frame,
            }
            if frame in faces.keys():
                faces[frame][tidx] = track_data
            else:
                faces[frame] = {tidx: track_data}

In [6]:
len(faces.keys())

3780

In [7]:
total = len(faces.keys())
intervals = [int(i*total/100) for i in range(100)]

In [7]:
start_time = datetime.now()
print(f"Start time - {start_time}")
for fidx, frame_id in enumerate(faces.keys()):
    image = face_recognition.load_image_file(flist[frame_id])
    for track_id in faces[frame_id].keys():
        faces[frame_id][track_id]["encoding"] = face_recognition.face_encodings(
            image,
            [(
                int(faces[frame_id][track_id]["y"] - faces[frame_id][track_id]["s"]),
                int(faces[frame_id][track_id]["x"] + faces[frame_id][track_id]["s"]),
                int(faces[frame_id][track_id]["y"] + faces[frame_id][track_id]["s"]),
                int(faces[frame_id][track_id]["x"] - faces[frame_id][track_id]["s"]),
            )], model="small"
        )[0]
    if fidx in intervals:
        print("|", end="")
print()
end_time = datetime.now()
print(f"End time - {end_time}")
print(f"The model took - {end_time - start_time}")

Start time - 2023-12-05 23:15:57.751586
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
End time - 2023-12-05 23:34:30.807187
The model took - 0:18:33.055601


In [8]:
with open("faces.pkl", "wb") as pickle_file:
    pickle.dump(faces, pickle_file)

In [6]:
with open('faces.pkl', 'rb') as pickle_file:
    faces = pickle.load(pickle_file)

In [7]:
faces[98]

{2: {'score': 2.800000031789144,
  's': 24.195465087890625,
  'x': 653.0314331054688,
  'y': 376.98760986328125,
  'frame': 98,
  'encoding': array([-0.00475069,  0.06300543,  0.05752417, -0.01600654,  0.00112483,
          0.02959497, -0.07662967, -0.03952973,  0.11318361, -0.06335047,
          0.24787217,  0.00775525, -0.23524754, -0.04807144, -0.09182795,
          0.09417541, -0.1535296 , -0.06460626, -0.14140023, -0.02221822,
          0.02367331,  0.09808709,  0.04474756, -0.05241382, -0.06778108,
         -0.31833011, -0.13321105, -0.04271945,  0.08582131, -0.08552647,
          0.03879549,  0.05678439, -0.15780798, -0.02301202,  0.03761561,
         -0.04114474, -0.0052411 , -0.0063472 ,  0.23006317,  0.14159028,
         -0.11504854,  0.06032252,  0.02227838,  0.30301929,  0.20661645,
          0.11795092,  0.00972322, -0.07645765,  0.0715341 , -0.32195994,
          0.07482724,  0.15759143,  0.11869399,  0.06721074,  0.13830516,
         -0.1388669 ,  0.00538068,  0.0484978 

In [8]:
rows = []
for frame, tracks in faces.items():
    for track_id, track_data in tracks.items():
        rows.append({
            "Frame": frame,
            "Track": track_id,
            "Score": track_data["score"],
            "S": track_data["s"],
            "X": track_data["x"],
            "Y": track_data["y"],
            "Encoding": track_data["encoding"]
        })

In [9]:
df = pd.DataFrame(rows)

In [10]:
df

Unnamed: 0,Frame,Track,Score,S,X,Y,Encoding
0,98,2,2.800,24.195465,653.031433,376.987610,"[-0.004750688560307026, 0.06300543248653412, 0..."
1,99,2,2.875,24.682358,653.031433,377.456467,"[0.023352961987257004, 0.047207802534103394, 0..."
2,100,2,2.980,24.775803,653.031433,377.595596,"[0.012972662225365639, 0.06011754646897316, 0...."
3,101,2,3.220,24.846634,653.031433,377.647034,"[0.007410488091409206, 0.06599914282560349, 0...."
4,102,2,3.460,24.846634,653.031433,377.647034,"[0.012271895073354244, 0.0649895966053009, 0.0..."
...,...,...,...,...,...,...,...
3775,6245,22,2.940,131.120850,731.363129,260.601124,"[0.02583138458430767, 0.03261333703994751, 0.0..."
3776,6246,22,3.000,130.983749,730.938843,260.601124,"[0.008587155491113663, 0.03213134780526161, 0...."
3777,6247,22,3.000,130.960190,730.640869,260.601124,"[-0.001326502300798893, 0.03469701111316681, 0..."
3778,6248,22,2.950,130.960190,730.497894,260.601124,"[-0.011287746950984001, 0.043798815459012985, ..."


In [21]:
from sklearn.cluster import DBSCAN

# Choose DBSCAN parameters
eps = 0.5
min_samples = 200

# Apply DBSCAN clustering
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
clusters = dbscan.fit_predict(df["Encoding"].to_list())

# Print the cluster assignments
print("Cluster Assignments:", clusters)

Cluster Assignments: [0 0 0 ... 1 1 1]


In [22]:
unique, counts = np.unique(clusters, return_counts=True)
print(np.asarray((unique, counts)).T)

[[  -1   15]
 [   0  248]
 [   1 2369]
 [   2 1148]]


In [15]:
df["Clusters"] = clusters

In [37]:
df[df["Track"] == 2]["Clusters"].value_counts().idxmax()

0

In [38]:
final_ids = {}
for track_id in df["Track"].unique():
    final_cluster = df[df["Track"] == track_id]["Clusters"].value_counts().idxmax()
    final_ids[track_id] = final_cluster
final_ids

{2: 0, 3: 0, 6: 1, 7: 2, 9: 2, 12: 2, 14: 2, 17: 0, 18: 1, 19: 1, 22: 1}

In [39]:
df["Final IDs"] = df["Track"].apply(lambda x: final_ids[x])
df

Unnamed: 0,Frame,Track,Score,S,X,Y,Encoding,Clusters,Final IDs
0,98,2,2.800,24.195465,653.031433,376.987610,"[-0.004750688560307026, 0.06300543248653412, 0...",0,0
1,99,2,2.875,24.682358,653.031433,377.456467,"[0.023352961987257004, 0.047207802534103394, 0...",0,0
2,100,2,2.980,24.775803,653.031433,377.595596,"[0.012972662225365639, 0.06011754646897316, 0....",0,0
3,101,2,3.220,24.846634,653.031433,377.647034,"[0.007410488091409206, 0.06599914282560349, 0....",0,0
4,102,2,3.460,24.846634,653.031433,377.647034,"[0.012271895073354244, 0.0649895966053009, 0.0...",0,0
...,...,...,...,...,...,...,...,...,...
3775,6245,22,2.940,131.120850,731.363129,260.601124,"[0.02583138458430767, 0.03261333703994751, 0.0...",1,1
3776,6246,22,3.000,130.983749,730.938843,260.601124,"[0.008587155491113663, 0.03213134780526161, 0....",1,1
3777,6247,22,3.000,130.960190,730.640869,260.601124,"[-0.001326502300798893, 0.03469701111316681, 0...",1,1
3778,6248,22,2.950,130.960190,730.497894,260.601124,"[-0.011287746950984001, 0.043798815459012985, ...",1,1


In [53]:
def convert_to_ranges(lst, frame_rate):
    ranges = []
    start = lst[0]

    threshold = 2

    for i in range(1, len(lst)):
        if lst[i] - lst[i - 1] > threshold:
            ranges.append((start/frame_rate, lst[i - 1]/frame_rate))
            start = lst[i]

    # Add the last range
    ranges.append((start/frame_rate, lst[-1]/frame_rate))

    return ranges

In [54]:
final_tracks = {}
for idx in df["Final IDs"].unique():
    speaker_key = "SPEAKER_{:02d}".format(idx)
    final_tracks[speaker_key] = df[df["Final IDs"] == idx]["Frame"].to_list()
final_tracks

{'SPEAKER_00': [98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  160,
  161,
  162,
  163,
  164,
  183,
  184,
  185,
  186,
  187,
  188,
  189,
  190,
  191,
  192,
  193,
  194,
  195,
  196,
  197,
  198,
  199,
  200,
  201,
  202,
  203,
  204,
  205,
  206,
  207,
  208,
  209,
  210,
  211,
  212,
  213,
  214,
  215,
  216,
  217,
  218,
  219,
  220,
  221,
  222,
  223,
  224,
  225,
  226,
  227,
  228,
  229,
  230,
  231,
  232,
  233,
  234,
  235,
  236,
  237,
  238,
  239,
  240,
  241,
  242,
  243,
  244,
  245,
  246,
  247,
  248,
  249,
  250,
  251,
  252,
  253,
  254,
  255,
  256,
  257,
  258,
  259,
  260,
  261,
  262,
  263,
  264,
  265,
  266,
  267,
  268,
  269,
  270,
  271,
  272,
  273,
  274,
 

In [55]:
for key in final_tracks.keys():
    final_tracks[key] = convert_to_ranges(final_tracks[key], 25)

final_tracks

{'SPEAKER_00': [(3.92, 5.64), (6.4, 6.56), (7.32, 12.68), (180.72, 183.44)],
 'SPEAKER_01': [(32.76, 45.48),
  (45.84, 81.76),
  (81.92, 82.24),
  (82.68, 87.64),
  (208.0, 238.12),
  (238.76, 243.04),
  (243.36, 249.96)],
 'SPEAKER_02': [(87.72, 89.24),
  (90.24, 91.48),
  (91.6, 105.52),
  (107.24, 115.24),
  (157.0, 175.84),
  (177.56, 179.8)]}

In [None]:
track_face_encodings

In [None]:
face_recognition.compare_faces([track_face_encodings[7]["face_encoding"]], track_face_encodings[12]["face_encoding"], tolerance=0.6)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
def calculate_similarity(embedding1, embedding2):
    return euclidean_distances([embedding1], [embedding2])[0][0]

In [None]:
calculate_similarity(track_face_encodings[7]["face_encoding"], track_face_encodings[12]["face_encoding"])