In [1]:
import pickle
import pandas as pd
import sys
import time
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

In [2]:
# Load `encoding_df` from the pickle file in this video's `pywork` directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at files from a trusted source.
with open("../../output/video_temp/2qQs3Y9OJX0_c_01/pywork/encoding_df.pckl", "rb") as fil:
    encoding_df = pickle.load(fil)

In [4]:
marked_df = pd.read_excel("../../../del_later/1j20_marked.xlsx")

In [8]:
encoding_df.to_excel("../../../del_later/1j20.xlsx", index=False)

In [92]:
# Attach the "actual_cluters" column from marked_df to encoding_df by joining
# on (Frame, Track). DataFrame.merge defaults to an inner join, so rows
# without a matching (Frame, Track) pair in marked_df are dropped.
final_df = encoding_df.merge(marked_df[["Frame", "Track", "actual_cluters"]], on=["Frame", "Track"])
final_df

Unnamed: 0,Frame,Track,Score,S,X,Y,Encoding,Clusters,actual_cluters
0,0,0,1.600000,87.066998,331.198029,137.840912,"[-0.05140741169452667, 0.03528446704149246, 0....",0,1
1,1,0,1.700000,87.066998,331.198029,138.238453,"[-0.06638793647289276, 0.02433614432811737, 0....",0,1
2,2,0,1.780000,87.474350,331.374649,138.238453,"[-0.06108415871858597, 0.026007167994976044, 0...",0,1
3,3,0,1.920000,88.022346,331.374649,140.397591,"[-0.07366905361413956, 0.03151387721300125, 0....",0,1
4,4,0,1.980000,88.158028,331.374649,142.936060,"[-0.07083795964717865, 0.04400164633989334, 0....",0,1
...,...,...,...,...,...,...,...,...,...
2488,6795,99,0.020000,47.806647,362.171973,177.739293,"[-0.15960538387298584, 0.07936713099479675, 0....",4,5
2489,6802,100,0.033333,77.966949,320.476021,97.524651,"[-0.14060121774673462, 0.04611499235033989, 0....",4,5
2490,6803,100,0.025000,80.491959,343.058182,101.714498,"[-0.0818537101149559, 0.06873033195734024, 0.0...",4,5
2491,6804,100,0.060000,83.150391,371.819168,106.853055,"[-0.08707184344530106, 0.0573924258351326, 0.0...",4,5


In [3]:
# NOTE(review): this binds final_df to encoding_df itself (no copy), replacing
# any earlier value of final_df such as the merged frame built in the cell above.
final_df = encoding_df

In [4]:
def perform_clustering(encoding_list, min_clusters=2, max_clusters=6, input_clusters=-1, silhouette_threshold = 0.25):
    """Cluster encodings with KMeans, choosing k by average silhouette score.

    Args:
        encoding_list: Sequence of equal-length numeric vectors, one per sample.
        min_clusters: Smallest k tried during the automatic search.
        max_clusters: Largest k tried during the automatic search (inclusive).
        input_clusters: When >= 0, skip the search and cluster into exactly
            this many groups. A negative value (the default) means "search".
        silhouette_threshold: Minimum average silhouette score the best k must
            reach; below it every sample is put in one cluster.

    Returns:
        The KMeans label array when a clustering is accepted, otherwise a
        plain list of zeros with one entry per sample.
    """
    num_points = len(encoding_list)
    single_cluster = [0] * num_points

    # An explicit cluster count bypasses the silhouette search entirely.
    # Previously it was compared against a silhouette of 0 and therefore
    # always fell through to the all-zeros result.
    if input_clusters >= 0:
        if 1 <= input_clusters <= num_points:
            kmeans = KMeans(n_clusters=input_clusters, random_state=42, n_init=10)
            return kmeans.fit_predict(encoding_list)
        # A request KMeans cannot satisfy keeps the old single-cluster result.
        return single_cluster

    best_labels = None
    max_silhouette = 0

    for num_clusters in range(min_clusters, max_clusters + 1):
        # The silhouette score needs fewer clusters than samples.
        if num_clusters >= num_points:
            break
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(encoding_list)

        # silhouette_score raises when only one distinct label is present
        # (e.g. k=1, or duplicate points collapsing into one cluster).
        if len(set(cluster_labels)) < 2:
            continue

        silhouette_avg = silhouette_score(encoding_list, cluster_labels)
        if silhouette_avg > max_silhouette:
            max_silhouette = silhouette_avg
            # Keep the labels of the best k; the fit is seeded, so refitting
            # with the same k would reproduce them anyway.
            best_labels = cluster_labels

    if best_labels is not None and max_silhouette >= silhouette_threshold:
        return best_labels

    return single_cluster



def find_centroid(points):
    """Return the element-wise mean of ``points``, averaged over axis 0."""
    return np.mean(points, axis=0)

def mini_track_group(track_id, temp_df):
    """Build one centroid record per "Track_Cluster_ID" value in ``temp_df``.

    Each record is a dict with the keys "Track" (the given ``track_id``),
    "Track_Cluster_ID" and "Track_Cluster_Centroid" (the centroid of that
    cluster's "Encoding" values). Records follow the order in which the
    cluster ids first appear in ``temp_df``.
    """
    cluster_ids = temp_df["Track_Cluster_ID"]
    return [
        {
            "Track": track_id,
            "Track_Cluster_ID": cluster_num,
            "Track_Cluster_Centroid": find_centroid(
                list(temp_df[cluster_ids == cluster_num]["Encoding"])
            ),
        }
        for cluster_num in cluster_ids.unique()
    ]

In [6]:
def get_subset(df, skip_rows=2):
    """Cluster the encodings of every track separately.

    The first ``skip_rows`` rows of each track are discarded, the remaining
    rows are ordered by "Track", and each track's "Encoding" values are
    clustered with ``perform_clustering``.

    Args:
        df: DataFrame with at least the columns "Track" and "Encoding".
        skip_rows: Number of leading rows dropped from every track.

    Returns:
        ``None`` (after printing "Invalid df") when a required column is
        missing. Otherwise a tuple ``(rows, centroids)``: ``rows`` is the
        filtered frame with an added "Track_Cluster_ID" column, and
        ``centroids`` has one row per (Track, Track_Cluster_ID) with the
        columns "Track", "Track_Cluster_ID" and "Track_Cluster_Centroid".
    """
    if "Track" not in df.columns or "Encoding" not in df.columns:
        print("Invalid df")
        return None

    # Filter the frame that was passed in (the previous code read the global
    # `final_df`). cumcount numbers the rows inside each track, which avoids
    # groupby.apply and keeps the "Track" column intact on every pandas version.
    keep_mask = df.groupby("Track").cumcount() >= skip_rows
    df_filtered = (
        df[keep_mask]
        .sort_values("Track", kind="mergesort")  # stable: keeps row order within a track
        .reset_index(drop=True)
    )

    track_centroid_list = []
    df_list = []

    for track_id in list(df_filtered["Track"].unique()):
        temp = df_filtered[df_filtered["Track"] == track_id].copy()
        temp["Track_Cluster_ID"] = perform_clustering(list(temp["Encoding"]))

        track_centroid_list.extend(mini_track_group(track_id, temp))
        df_list.append(temp)

    if not df_list:
        # pd.concat raises on an empty list; return empty frames with the
        # expected columns when no track has rows left after filtering.
        empty_centroids = pd.DataFrame(
            columns=["Track", "Track_Cluster_ID", "Track_Cluster_Centroid"]
        )
        return df_filtered.assign(Track_Cluster_ID=[]), empty_centroids

    return pd.concat(df_list), pd.DataFrame(track_centroid_list)

In [7]:
# NOTE(review): get_subset returns a (DataFrame, DataFrame) tuple on success,
# so subset_df is a tuple here; the next cell repeats the same call and
# unpacks the two frames.
subset_df = get_subset(final_df)

In [8]:
# Per-track clustering: final_df_cents holds the filtered rows with a
# "Track_Cluster_ID" column, track_centroids_df holds one centroid row per
# (Track, Track_Cluster_ID).
final_df_cents, track_centroids_df = get_subset(final_df)

In [9]:
final_df_cents

Unnamed: 0,Frame,Track,Score,S,X,Y,Encoding,Clusters,Track_Cluster_ID
0,2,0,1.12,36.499027,297.335709,90.877142,"[-0.12249568104743958, 0.10854216665029526, 0....",4,0
1,3,0,1.24,36.707996,297.335709,90.909981,"[-0.11383970081806183, 0.10473571717739105, 0....",4,0
2,4,0,1.30,36.707996,297.581314,90.928844,"[-0.12663236260414124, 0.10784599930047989, 0....",4,0
3,5,0,1.30,36.708214,297.638245,91.085066,"[-0.1095271110534668, 0.11937547475099564, 0.0...",4,0
4,6,0,1.30,36.861284,297.707550,91.327406,"[-0.09684795886278152, 0.0959509015083313, 0.0...",4,0
...,...,...,...,...,...,...,...,...,...
1605,7419,68,0.12,24.705498,326.812332,113.277267,"[-0.09212890267372131, 0.07119353860616684, 0....",4,1
1606,7420,68,0.18,24.665962,328.202698,112.748276,"[-0.09382065385580063, 0.05741628259420395, 0....",4,1
1607,7421,68,0.18,24.665962,328.752304,112.709496,"[-0.09585478156805038, 0.06786153465509415, 0....",4,1
1608,7422,68,0.12,24.665962,330.117081,112.589043,"[-0.11552995443344116, 0.06896164268255234, 0....",4,1


In [10]:
# Number of rows per track, ordered by track id.
# NOTE(review): the rename assumes value_counts() labels its result column
# "count" — confirm for the installed pandas version.
pd.DataFrame(final_df_cents[["Track"]].value_counts()).sort_values("Track").reset_index().rename(columns={"count": "Track_Count"})

Unnamed: 0,Track,Track_Count
0,0,81
1,1,144
2,2,3
3,5,83
4,6,3
5,10,63
6,12,15
7,15,25
8,16,41
9,20,45


In [11]:
# Number of rows per (Track, Track_Cluster_Cluster_ID) pair.
# NOTE(review): the recorded output of this cell is a KeyError.
# "Track_Cluster_Cluster_ID" is only merged into final_df_cents by a cell
# further down the notebook, so this cell has to run after that merge.
pd.DataFrame(final_df_cents[["Track", "Track_Cluster_Cluster_ID"]].value_counts()).reset_index().rename(
    columns={"count": "Track_Cluster_Cluster_Count"}
).sort_values(["Track", "Track_Cluster_Cluster_ID"]).reset_index(drop=True)

KeyError: "['Track_Cluster_Cluster_ID'] not in index"

In [211]:
final_df_cents[["Track", "Track_Cluster_Cluster_ID"]].value_counts()

{(14, 0): 190,
 (9, 0): 143,
 (30, 5): 138,
 (25, 5): 123,
 (18, 0): 108,
 (43, 4): 107,
 (21, 5): 102,
 (46, 5): 100,
 (38, 5): 88,
 (48, 5): 71,
 (27, 6): 69,
 (31, 4): 62,
 (42, 4): 58,
 (25, 6): 56,
 (3, 0): 55,
 (60, 0): 51,
 (62, 0): 45,
 (71, 3): 42,
 (68, 0): 40,
 (35, 5): 38,
 (56, 0): 37,
 (19, 6): 35,
 (65, 2): 35,
 (21, 6): 33,
 (34, 4): 31,
 (1, 4): 28,
 (57, 2): 27,
 (72, 0): 26,
 (88, 0): 26,
 (67, 2): 25,
 (86, 7): 24,
 (70, 0): 23,
 (90, 1): 23,
 (87, 1): 21,
 (10, 2): 19,
 (81, 7): 19,
 (52, 6): 17,
 (58, 0): 17,
 (63, 2): 17,
 (83, 7): 16,
 (95, 0): 15,
 (73, 2): 14,
 (63, 3): 14,
 (98, 6): 13,
 (0, 0): 12,
 (82, 7): 11,
 (55, 2): 10,
 (77, 6): 10,
 (44, 5): 10,
 (50, 6): 9,
 (40, 3): 8,
 (82, 6): 7,
 (55, 3): 7,
 (39, 3): 7,
 (77, 3): 6,
 (53, 3): 6,
 (75, 4): 5,
 (99, 1): 5,
 (99, 3): 5,
 (10, 3): 4,
 (79, 7): 3,
 (83, 3): 3,
 (79, 6): 2,
 (73, 3): 2,
 (24, 6): 2,
 (3, 6): 2,
 (100, 3): 2,
 (53, 2): 1,
 (23, 3): 1}

In [289]:
final_df_cents.to_excel("../../../del_later/final_df_cents.xlsx", index = False)

In [12]:
class TrackCluster:
    """Statistics for one cluster id inside a single track.

    Instances compare and sort by ``track_cluster_cluster_count`` only; the
    cluster id and the percentage take no part in comparisons. Comparing with
    an object that is not a ``TrackCluster`` returns ``NotImplemented`` so
    Python applies its normal fallback (``==`` is False, ``!=`` is True,
    ordering raises TypeError) instead of failing with AttributeError.

    Attributes:
        track_cluster_cluster_id: Identifier of the cluster.
        track_cluster_cluster_count: Number of rows of the track in this cluster.
        track_cluster_percent: The count as a percentage of the track's rows.
    """

    # NOTE: __eq__ is overridden and __hash__ is not, so Python makes
    # instances unhashable (they cannot be set members or dict keys).

    def __init__(self, track_cluster_cluster_id, track_cluster_cluster_count, track_cluster_percent):
        self.track_cluster_cluster_id = track_cluster_cluster_id
        self.track_cluster_cluster_count = track_cluster_cluster_count
        self.track_cluster_percent = track_cluster_percent

    def __lt__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count < other.track_cluster_cluster_count

    def __le__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count <= other.track_cluster_cluster_count

    def __eq__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count == other.track_cluster_cluster_count

    def __ne__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count != other.track_cluster_cluster_count

    def __gt__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count > other.track_cluster_cluster_count

    def __ge__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count >= other.track_cluster_cluster_count

    def __repr__(self) -> str:
        return f"TrackCluster(track_cluster_cluster_id={self.track_cluster_cluster_id}, track_cluster_cluster_count={self.track_cluster_cluster_count}, track_cluster_percent={self.track_cluster_percent})"


class Track:
    """A track together with its per-cluster statistics.

    Attributes:
        track_id: Identifier of the track.
        track_count: Total number of rows belonging to the track.
        track_cluster_list: Cluster statistics (objects exposing
            ``track_cluster_cluster_id``, ``track_cluster_cluster_count`` and
            ``track_cluster_percent``), ordered from largest to smallest count.
        final_cluster: Initialised to -1; nothing in this class updates it.
    """

    # Tracks with at most this many rows simply take their largest cluster.
    SMALL_TRACK_MAX_COUNT = 20
    # Share of a track's rows (in percent) needed to settle on its clusters.
    MAJORITY_PERCENT = 80

    def __init__(self, track_id, track_count, track_cluster_list):
        self.track_id = track_id
        self.track_count = track_count
        # Sort a copy so the caller's list is not reordered as a side effect.
        # The sort is stable, so equal counts keep their incoming order.
        self.track_cluster_list = sorted(
            track_cluster_list,
            key=lambda track_cluster: track_cluster.track_cluster_cluster_count,
            reverse=True,
        )
        self.final_cluster = -1

    def get_cluster_set(self):
        """Return the set of cluster ids that occur in this track."""
        # The previous code added the set to itself, which raises TypeError.
        return {
            track_cluster.track_cluster_cluster_id
            for track_cluster in self.track_cluster_list
        }

    def get_majority_forming_cluster_id(self):
        """Pick the cluster id(s) that account for most of the track.

        Returns:
            A single cluster id when the track has at most
            ``SMALL_TRACK_MAX_COUNT`` rows (its largest cluster, or -1 when
            no cluster has a positive count), or when its largest cluster
            holds at least ``MAJORITY_PERCENT`` percent of the rows.
            Otherwise a list of cluster ids, largest first, accumulated until
            their combined share reaches ``MAJORITY_PERCENT``.
        """
        if self.track_count <= self.SMALL_TRACK_MAX_COUNT:
            max_cluster_count = 0
            max_cluster_id = -1
            for track_cluster in self.track_cluster_list:
                if track_cluster.track_cluster_cluster_count > max_cluster_count:
                    max_cluster_count = track_cluster.track_cluster_cluster_count
                    max_cluster_id = track_cluster.track_cluster_cluster_id
            return max_cluster_id

        # The list is sorted by count, so only the first entry can dominate.
        if self.track_cluster_list:
            largest = self.track_cluster_list[0]
            if largest.track_cluster_percent >= self.MAJORITY_PERCENT:
                return largest.track_cluster_cluster_id

        combined_percent = 0
        cluster_list = []
        for track_cluster in self.track_cluster_list:
            combined_percent += track_cluster.track_cluster_percent
            cluster_list.append(track_cluster.track_cluster_cluster_id)
            if combined_percent >= self.MAJORITY_PERCENT:
                break

        return cluster_list

    def __repr__(self) -> str:
        return f"Track(track_id={self.track_id}, track_count={self.track_count}, track_cluster_list={self.track_cluster_list})"


class TrackList:
    """Collection of ``Track`` objects built from a per-(track, cluster) table."""

    def __init__(self, df):
        self.track_list = self.get_track_list(df)

    def get_final_clusters(self):
        """Resolve one final cluster id per track.

        Every track reports either a single cluster id or a list of ids (see
        ``Track.get_majority_forming_cluster_id``). Ids that appear together
        in a list are treated as the same cluster, transitively across
        tracks, and every such group is represented by its smallest id.

        Returns:
            dict mapping track id to final cluster id. A track that reported
            an empty list maps to -1.
        """
        final_cluster_dict = {}
        combine_dict = {}

        for track in self.track_list:
            final_cluster = track.get_majority_forming_cluster_id()
            final_cluster_dict[track.track_id] = final_cluster
            if isinstance(final_cluster, list):
                combine_dict[track.track_id] = final_cluster

        # Build disjoint groups of ids that must be merged. Each new id set
        # absorbs every existing group it overlaps, so the groups stay
        # disjoint and chains such as {5, 6}, {4, 7}, {6, 4} end up as one
        # group. The previous loop appended to the list while iterating it,
        # which produced duplicate and unmerged groups.
        merged_groups = []
        for cluster_list in combine_dict.values():
            if not cluster_list:
                continue
            current = set(cluster_list)
            untouched = []
            for group in merged_groups:
                if group & current:
                    current |= group
                else:
                    untouched.append(group)
            untouched.append(current)
            merged_groups = untouched

        # Use the smallest id of each group as its deterministic representative.
        final_combo_clusters = {}
        for group in merged_groups:
            representative = min(group)
            for cluster in group:
                final_combo_clusters[cluster] = representative

        for track_id, cluster in final_cluster_dict.items():
            if isinstance(cluster, list):
                final_cluster_dict[track_id] = final_combo_clusters[cluster[0]] if cluster else -1
            elif cluster in final_combo_clusters:
                final_cluster_dict[track_id] = final_combo_clusters[cluster]

        return final_cluster_dict

    def get_track(self, track_id):
        """Return the track with the given id, or ``None`` when there is none."""
        for track in self.track_list:
            if track.track_id == track_id:
                return track
        return None

    @staticmethod
    def get_track_list(df):
        """Build the ``Track`` objects described by ``df``.

        ``df`` needs the columns "Track", "Track_Count",
        "Track_Cluster_Cluster_ID", "Track_Cluster_Cluster_Count" and
        "Track_Cluster_Percent", one row per (track, cluster) pair.

        Side effect: a "TrackClusterClass" column holding the per-row
        ``TrackCluster`` objects is added to ``df`` in place.
        """
        # Zip the columns instead of using a row-wise apply: a row of an
        # all-numeric frame is upcast to float, which turned integer ids
        # into floats.
        df["TrackClusterClass"] = [
            TrackCluster(
                track_cluster_cluster_id=cluster_id,
                track_cluster_cluster_count=cluster_count,
                track_cluster_percent=cluster_percent,
            )
            for cluster_id, cluster_count, cluster_percent in zip(
                df["Track_Cluster_Cluster_ID"],
                df["Track_Cluster_Cluster_Count"],
                df["Track_Cluster_Percent"],
            )
        ]

        per_track = (
            df.groupby("Track")
            .agg(
                {
                    "Track_Count": "first",
                    "TrackClusterClass": lambda x: list(x),
                }
            )
            .reset_index()
        )

        return [
            Track(track_id=track_id, track_count=track_count, track_cluster_list=clusters)
            for track_id, track_count, clusters in zip(
                per_track["Track"], per_track["Track_Count"], per_track["TrackClusterClass"]
            )
        ]

    def __repr__(self) -> str:
        return f"TrackList(track_list_count={len(self.track_list)})"


def get_final_cluster_ids(final_df):
    """Summarise clusters per track and assign each track a final cluster.

    Args:
        final_df: DataFrame with one row per frame and at least the columns
            "Track" and "Track_Cluster_Cluster_ID".

    Returns:
        DataFrame with one row per (Track, Track_Cluster_Cluster_ID) pair, in
        order of first appearance, with the columns "Track",
        "Track_Cluster_Cluster_ID", "Track_Count" (rows in the track),
        "Track_Cluster_Cluster_Count" (rows in the pair),
        "Track_Cluster_Percent" and "Final_Cluster" (-1 when the track has no
        resolved cluster).
    """
    pair_columns = ["Track", "Track_Cluster_Cluster_ID"]
    frames = final_df[pair_columns]

    # Rows per track. groupby().size() does not depend on the column label
    # value_counts() happens to use in a given pandas version.
    track_sizes = frames.groupby("Track").size()

    # Rows per (track, cluster) pair; sort=False keeps first-appearance order.
    temp_df = (
        frames.groupby(pair_columns, sort=False)
        .size()
        .reset_index(name="Track_Cluster_Cluster_Count")
    )
    temp_df.insert(2, "Track_Count", temp_df["Track"].map(track_sizes))

    # Share of the track's rows that fall into each cluster, in percent.
    temp_df["Track_Cluster_Percent"] = (
        temp_df["Track_Cluster_Cluster_Count"] / temp_df["Track_Count"]
    ) * 100

    # TrackList adds a helper column ("TrackClusterClass") to temp_df while
    # building its tracks; it is removed again before returning.
    track_list = TrackList(temp_df)
    final_cluster_dict = track_list.get_final_clusters()

    temp_df["Final_Cluster"] = temp_df["Track"].map(
        lambda track_id: int(final_cluster_dict.get(track_id, -1))
    )

    return temp_df.drop(columns=["TrackClusterClass"], errors="ignore")

In [13]:
# NOTE(review): the output recorded for this cell is a KeyError for
# "Track_Cluster_Cluster_ID". That column is only merged into final_df_cents
# by a cell further down the notebook, so this cell has to run after it.
_temp_df = get_final_cluster_ids(final_df_cents)
_temp_df

KeyError: "['Track_Cluster_Cluster_ID'] not in index"

In [305]:
_temp_df.to_excel("../../../del_later/final_df_cents.xlsx", index=False)

In [285]:
class TrackCluster:
    """Statistics for one cluster id inside a single track.

    Instances compare and sort by ``track_cluster_cluster_count`` only; the
    cluster id and the percentage take no part in comparisons. Comparing with
    an object that is not a ``TrackCluster`` returns ``NotImplemented`` so
    Python applies its normal fallback (``==`` is False, ``!=`` is True,
    ordering raises TypeError) instead of failing with AttributeError.

    Attributes:
        track_cluster_cluster_id: Identifier of the cluster.
        track_cluster_cluster_count: Number of rows of the track in this cluster.
        track_cluster_percent: The count as a percentage of the track's rows.
    """

    # NOTE: __eq__ is overridden and __hash__ is not, so Python makes
    # instances unhashable (they cannot be set members or dict keys).

    def __init__(self, track_cluster_cluster_id, track_cluster_cluster_count, track_cluster_percent):
        self.track_cluster_cluster_id = track_cluster_cluster_id
        self.track_cluster_cluster_count = track_cluster_cluster_count
        self.track_cluster_percent = track_cluster_percent

    def __lt__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count < other.track_cluster_cluster_count

    def __le__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count <= other.track_cluster_cluster_count

    def __eq__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count == other.track_cluster_cluster_count

    def __ne__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count != other.track_cluster_cluster_count

    def __gt__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count > other.track_cluster_cluster_count

    def __ge__(self, other):
        if not isinstance(other, TrackCluster):
            return NotImplemented
        return self.track_cluster_cluster_count >= other.track_cluster_cluster_count

    def __repr__(self) -> str:
        return f"TrackCluster(track_cluster_cluster_id={self.track_cluster_cluster_id}, track_cluster_cluster_count={self.track_cluster_cluster_count}, track_cluster_percent={self.track_cluster_percent})"


class Track:
    """A track together with its per-cluster statistics.

    Attributes:
        track_id: Identifier of the track.
        track_count: Total number of rows belonging to the track.
        track_cluster_list: Cluster statistics (objects exposing
            ``track_cluster_cluster_id``, ``track_cluster_cluster_count`` and
            ``track_cluster_percent``), ordered from largest to smallest count.
        final_cluster: Initialised to -1; nothing in this class updates it.
    """

    # Tracks with at most this many rows simply take their largest cluster.
    SMALL_TRACK_MAX_COUNT = 20
    # Share of a track's rows (in percent) needed to settle on its clusters.
    MAJORITY_PERCENT = 80

    def __init__(self, track_id, track_count, track_cluster_list):
        self.track_id = track_id
        self.track_count = track_count
        # Largest cluster first: the majority logic below inspects the head
        # of the list, so an ascending sort would test the smallest cluster.
        # A copy is sorted so the caller's list is left untouched.
        self.track_cluster_list = sorted(
            track_cluster_list,
            key=lambda track_cluster: track_cluster.track_cluster_cluster_count,
            reverse=True,
        )
        self.final_cluster = -1

    def get_cluster_set(self):
        """Return the set of cluster ids that occur in this track."""
        # The previous code added the set to itself, which raises TypeError.
        return {
            track_cluster.track_cluster_cluster_id
            for track_cluster in self.track_cluster_list
        }

    def get_majority_forming_cluster_id(self):
        """Pick the cluster id(s) that account for most of the track.

        Returns:
            A single cluster id when the track has at most
            ``SMALL_TRACK_MAX_COUNT`` rows (its largest cluster, or -1 when
            no cluster has a positive count), or when its largest cluster
            holds at least ``MAJORITY_PERCENT`` percent of the rows.
            Otherwise a list of cluster ids, largest first, accumulated until
            their combined share reaches ``MAJORITY_PERCENT``.
        """
        if self.track_count <= self.SMALL_TRACK_MAX_COUNT:
            max_cluster_count = 0
            max_cluster_id = -1
            for track_cluster in self.track_cluster_list:
                if track_cluster.track_cluster_cluster_count > max_cluster_count:
                    max_cluster_count = track_cluster.track_cluster_cluster_count
                    max_cluster_id = track_cluster.track_cluster_cluster_id
            return max_cluster_id

        # The list is sorted by count, so only the first entry can dominate.
        if self.track_cluster_list:
            largest = self.track_cluster_list[0]
            if largest.track_cluster_percent >= self.MAJORITY_PERCENT:
                return largest.track_cluster_cluster_id

        combined_percent = 0
        cluster_list = []
        for track_cluster in self.track_cluster_list:
            combined_percent += track_cluster.track_cluster_percent
            cluster_list.append(track_cluster.track_cluster_cluster_id)
            if combined_percent >= self.MAJORITY_PERCENT:
                break

        return cluster_list

    def __repr__(self) -> str:
        return f"Track(track_id={self.track_id}, track_count={self.track_count}, track_cluster_list={self.track_cluster_list})"


class TrackList:
    """Collection of ``Track`` objects built from a per-(track, cluster) table."""

    def __init__(self, df):
        self.track_list = self.get_track_list(df)

    def get_final_clusters(self):
        """Resolve one final cluster id per track.

        Every track reports either a single cluster id or a list of ids (see
        ``Track.get_majority_forming_cluster_id``). Ids that appear together
        in a list are treated as the same cluster, transitively across
        tracks, and every such group is represented by its smallest id.

        Returns:
            dict mapping track id to final cluster id. A track that reported
            an empty list maps to -1.
        """
        final_cluster_dict = {}
        combine_dict = {}

        for track in self.track_list:
            final_cluster = track.get_majority_forming_cluster_id()
            final_cluster_dict[track.track_id] = final_cluster
            if isinstance(final_cluster, list):
                combine_dict[track.track_id] = final_cluster

        # Build disjoint groups of ids that must be merged. Each new id set
        # absorbs every existing group it overlaps. The previous loop only
        # iterated the (initially empty) group list, so no group was ever
        # created and every list result ended in a KeyError.
        merged_groups = []
        for cluster_list in combine_dict.values():
            if not cluster_list:
                continue
            current = set(cluster_list)
            untouched = []
            for group in merged_groups:
                if group & current:
                    current |= group
                else:
                    untouched.append(group)
            untouched.append(current)
            merged_groups = untouched

        # Map every id of a group, including the representative itself, to
        # the group's smallest id.
        final_combo_clusters = {}
        for group in merged_groups:
            representative = min(group)
            for cluster in group:
                final_combo_clusters[cluster] = representative

        for track_id, cluster in final_cluster_dict.items():
            if isinstance(cluster, list):
                final_cluster_dict[track_id] = final_combo_clusters[cluster[0]] if cluster else -1
            elif cluster in final_combo_clusters:
                final_cluster_dict[track_id] = final_combo_clusters[cluster]

        return final_cluster_dict

    def get_track(self, track_id):
        """Return the track with the given id, or ``None`` when there is none."""
        for track in self.track_list:
            if track.track_id == track_id:
                return track
        return None

    @staticmethod
    def get_track_list(df):
        """Build the ``Track`` objects described by ``df``.

        ``df`` needs the columns "Track", "Track_Count",
        "Track_Cluster_Cluster_ID", "Track_Cluster_Cluster_Count" and
        "Track_Cluster_Percent", one row per (track, cluster) pair.

        Side effect: a "TrackClusterClass" column holding the per-row
        ``TrackCluster`` objects is added to ``df`` in place.
        """
        # Zip the columns instead of using a row-wise apply: a row of an
        # all-numeric frame is upcast to float, which turned integer ids
        # into floats.
        df["TrackClusterClass"] = [
            TrackCluster(
                track_cluster_cluster_id=cluster_id,
                track_cluster_cluster_count=cluster_count,
                track_cluster_percent=cluster_percent,
            )
            for cluster_id, cluster_count, cluster_percent in zip(
                df["Track_Cluster_Cluster_ID"],
                df["Track_Cluster_Cluster_Count"],
                df["Track_Cluster_Percent"],
            )
        ]

        per_track = (
            df.groupby("Track")
            .agg(
                {
                    "Track_Count": "first",
                    "TrackClusterClass": lambda x: list(x),
                }
            )
            .reset_index()
        )

        return [
            Track(track_id=track_id, track_count=track_count, track_cluster_list=clusters)
            for track_id, track_count, clusters in zip(
                per_track["Track"], per_track["Track_Count"], per_track["TrackClusterClass"]
            )
        ]

    def __repr__(self) -> str:
        return f"TrackList(track_list_count={len(self.track_list)})"


In [174]:
# Second-stage clustering: cluster the per-(track, cluster) centroids
# themselves, letting the search try up to 12 clusters.
new_classes = perform_clustering(list(track_centroids_df["Track_Cluster_Centroid"]), max_clusters=12)

In [175]:
# Store the second-stage label of every centroid next to it.
track_centroids_df["Track_Cluster_Cluster_ID"] = new_classes
track_centroids_df

Unnamed: 0,Track,Track_Cluster_ID,Track_Cluster_Centroid,Track_Cluster_Cluster_ID
0,0,1,"[-0.06636062562465668, 0.034249312430620196, 0...",0
1,0,0,"[-0.05568748606102807, 0.02765345573425293, 0....",0
2,1,4,"[-0.20266373828053474, 0.070628821849823, 0.10...",4
3,1,0,"[-0.18243570625782013, 0.06605116544025284, 0....",4
4,1,3,"[-0.18316734209656715, 0.08877378143370152, 0....",4
...,...,...,...,...
134,95,1,"[-0.15782350301742554, 0.16183091327548027, 0....",0
135,98,0,"[-0.08458438458350989, 0.12279241990584594, 0....",6
136,99,1,"[-0.0994673378765583, 0.12794823795557023, 0.0...",3
137,99,0,"[-0.1397691249847412, 0.07358804643154145, 0.0...",1


In [178]:
# Copy each centroid's second-stage label onto the rows of its
# (Track, Track_Cluster_ID) group.
# NOTE(review): this rebinds final_df_cents, so the cell is not idempotent —
# running it a second time merges in another "Track_Cluster_Cluster_ID"
# column, which pandas would presumably disambiguate with _x/_y suffixes.
final_df_cents = final_df_cents.merge(
    track_centroids_df[["Track", "Track_Cluster_ID", "Track_Cluster_Cluster_ID"]],
    on=["Track", "Track_Cluster_ID"])

In [181]:
final_df_cents["Track_Cluster_Cluster_ID"].value_counts()

Track_Cluster_Cluster_ID
0    788
5    670
4    291
6    255
2    148
3    107
7     73
1     49
Name: count, dtype: int64

In [191]:
final_df_cents[final_df_cents["Track_Cluster_Cluster_ID"] == 0]["Track"].value_counts()

Track
14    190
9     143
18    108
3      55
60     51
62     45
68     40
56     37
72     26
88     26
70     23
58     17
95     15
0      12
Name: count, dtype: int64

In [204]:
# Count the non-null "Frame" values for every
# (Track, Track_Cluster_Cluster_ID) pair; the count ends up in the "Frame" column.
temp_df = final_df_cents[["Track", "Track_Cluster_Cluster_ID", "Frame"]].groupby(
    ["Track","Track_Cluster_Cluster_ID"]).count().reset_index()

In [205]:
temp_df.to_excel("../../../del_later/final_df_cents.xlsx", index=False)

In [164]:
new_classes

array([0, 0, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 6, 0, 2, 3, 0, 0, 0, 6, 6, 5,
       3, 6, 6, 5, 6, 5, 4, 4, 5, 5, 5, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5,
       6, 6, 6, 6, 6, 2, 3, 3, 3, 3, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
       0, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 2, 2, 4, 4, 6, 3, 6, 7,
       7, 6, 6, 7, 3, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 6, 3, 1, 3], dtype=int32)

In [122]:
# NOTE(review): the output recorded for this cell shows subset_df as a frame
# with track_id / centroid columns, whereas the `subset_df = get_subset(final_df)`
# cell earlier in the notebook binds it to get_subset's return value —
# presumably this cell ran against an older definition; confirm before re-running.
subset_df["new_classes"] = new_classes
subset_df

Unnamed: 0,track_id,centroid,new_classes
0,0,"[-0.06636062562465668, 0.034249312430620196, 0...",0
1,0,"[-0.05568748606102807, 0.02765345573425293, 0....",0
2,1,"[-0.20266373828053474, 0.070628821849823, 0.10...",4
3,1,"[-0.18243570625782013, 0.06605116544025284, 0....",4
4,1,"[-0.18316734209656715, 0.08877378143370152, 0....",4
...,...,...,...
134,95,"[-0.15782350301742554, 0.16183091327548027, 0....",0
135,98,"[-0.08458438458350989, 0.12279241990584594, 0....",6
136,99,"[-0.0994673378765583, 0.12794823795557023, 0.0...",3
137,99,"[-0.1397691249847412, 0.07358804643154145, 0.0...",1


In [135]:
subset_df[subset_df["new_classes"] == 6]["track_id"].unique()

array([ 3, 19, 21, 24, 25, 27, 50, 52, 77, 79, 82, 98])

In [138]:
def group_tracks(subset_df):
    
    for track_id in subset_df["track_id"].unique():
        track_classes = subset_df[subset_df[t"track_id"] == track_id]["new_classes"].unique():
            for track_class in

Unnamed: 0,track_id,centroid,new_classes
0,0,"[-0.06636062562465668, 0.034249312430620196, 0...",0
1,0,"[-0.05568748606102807, 0.02765345573425293, 0....",0
2,1,"[-0.20266373828053474, 0.070628821849823, 0.10...",4
3,1,"[-0.18243570625782013, 0.06605116544025284, 0....",4
4,1,"[-0.18316734209656715, 0.08877378143370152, 0....",4
...,...,...,...
134,95,"[-0.15782350301742554, 0.16183091327548027, 0....",0
135,98,"[-0.08458438458350989, 0.12279241990584594, 0....",6
136,99,"[-0.0994673378765583, 0.12794823795557023, 0.0...",3
137,99,"[-0.1397691249847412, 0.07358804643154145, 0.0...",1


    Track_id  clusters  new_clusters
0          0         0             6
1          0         0             6
2          0         0             6
3          0         0             6
4          1         2             3
5          1         2             3
6          1         2             3
7          1         2             3
8          2         1             2
9          2         1             2
10         2         1             2
11         2         1             2
12         2         2             3
13         2         2             3
14         3         3             4
15         3         3             4
16         3         3             4
17         4         4             5
18         4         4             5
19         4         4             5
20         4         0             6
21         4         0             6
22         4         0             6
23         4         0             6


In [45]:
df_filtered

Unnamed: 0,Frame,Track,Score,S,X,Y,Encoding,Clusters,actual_cluters
0,2,0,1.78,87.474350,331.374649,138.238453,"[-0.06108415871858597, 0.026007167994976044, 0...",0,1
1,3,0,1.92,88.022346,331.374649,140.397591,"[-0.07366905361413956, 0.03151387721300125, 0....",0,1
2,4,0,1.98,88.158028,331.374649,142.936060,"[-0.07083795964717865, 0.04400164633989334, 0....",0,1
3,5,0,1.96,88.313486,331.374649,143.661291,"[-0.05682859569787979, 0.036323778331279755, 0...",0,1
4,6,0,1.88,88.313486,331.374649,143.661291,"[-0.06938336044549942, 0.03340009227395058, 0....",0,1
...,...,...,...,...,...,...,...,...,...
2376,6793,99,0.10,45.543660,362.171973,174.193830,"[-0.08686831593513489, 0.10544657707214355, 0....",4,5
2377,6794,99,0.06,46.675153,362.171973,175.966561,"[-0.16220229864120483, 0.06623321771621704, 0....",4,5
2378,6795,99,0.02,47.806647,362.171973,177.739293,"[-0.15960538387298584, 0.07936713099479675, 0....",4,5
2379,6804,100,0.06,83.150391,371.819168,106.853055,"[-0.08707184344530106, 0.0573924258351326, 0.0...",4,5


In [79]:
df_filtered["Track"].value_counts()

Track
14     190
25     179
9      143
30     138
21     135
18     108
43     107
46     100
38      88
48      71
27      69
31      62
42      58
3       57
60      51
62      45
71      42
68      40
35      38
56      37
19      35
65      35
63      31
34      31
1       28
57      27
88      26
72      26
67      25
86      24
70      23
90      23
10      23
87      21
83      19
81      19
82      18
55      17
58      17
52      17
73      16
77      16
95      15
98      13
0       12
44      10
99      10
50       9
40       8
53       7
39       7
79       5
75       5
24       2
100      2
23       1
Name: count, dtype: int64

In [58]:
# One row per distinct (Track, actual_cluters) combination found in df_filtered.
# NOTE(review): df_filtered is not assigned by any top-level cell shown in this
# notebook — presumably it was left in the kernel by an earlier session.
centroid_df = df_filtered[["Track", "actual_cluters"]].drop_duplicates().reset_index(drop=True)

In [84]:
perform_clustering(list(df_filtered[df_filtered["Track"] == 25]["Encoding"]))

2024-04-02 12:06:00 Silhouette score for k = 2: 0.263476 
2024-04-02 12:06:01 Silhouette score for k = 3: 0.260980 
2024-04-02 12:06:01 Silhouette score for k = 4: 0.191554 
2024-04-02 12:06:02 Silhouette score for k = 5: 0.179236 
2024-04-02 12:06:02 Silhouette score for k = 6: 0.180849 
2024-04-02 12:06:03 Optimal number of clusters: 2 


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1], dtype=int32)

In [59]:
# One centroid per track. Iterating over the distinct track ids (instead of
# every row of df_filtered) computes each centroid once rather than once per
# frame; the resulting dict has the same keys in the same order.
centroids = {
    track_id: find_centroid(list(df_filtered[df_filtered["Track"] == track_id]["Encoding"]))
    for track_id in df_filtered["Track"].unique().tolist()
}

In [60]:
# Look up each track's centroid in the `centroids` dict built in the cell above.
centroid_df["centroid"] = centroid_df["Track"].apply(lambda x: centroids[x])

In [61]:
centroid_df

Unnamed: 0,Track,actual_cluters,centroid
0,0,1,"[-0.06013462754587332, 0.03040172935773929, 0...."
1,1,7,"[-0.18646200160895074, 0.07389921322464943, 0...."
2,3,1,"[-0.10225323947113857, 0.09906484441537607, 0...."
3,9,1,"[-0.14558422299740198, 0.09890822704125951, 0...."
4,10,8,"[-0.0958845563556837, 0.044831989414018135, 0...."
5,14,1,"[-0.14156822821027354, 0.10037661171273181, 0...."
6,18,1,"[-0.15014308553051064, 0.09342120302392652, 0...."
7,19,0,"[-0.11109535467943975, 0.10282816759177617, 0...."
8,21,0,"[-0.11591486396888892, 0.13274720106963758, 0...."
9,23,10,"[-0.06383392959833145, 0.06218215450644493, 0...."


In [72]:
# Cluster the per-track centroids, letting the silhouette search try up to
# 20 clusters, and store the resulting label of every track.
centroid_df["optimal_clustering"] = perform_clustering(list(centroid_df["centroid"]), max_clusters=20)

2024-04-02 11:47:51 Silhouette score for k = 2: 0.205231 
2024-04-02 11:47:51 Silhouette score for k = 3: 0.236059 
2024-04-02 11:47:51 Silhouette score for k = 4: 0.265282 
2024-04-02 11:47:51 Silhouette score for k = 5: 0.273550 
2024-04-02 11:47:51 Silhouette score for k = 6: 0.279839 
2024-04-02 11:47:51 Silhouette score for k = 7: 0.258021 
2024-04-02 11:47:51 Silhouette score for k = 8: 0.249696 
2024-04-02 11:47:51 Silhouette score for k = 9: 0.236378 
2024-04-02 11:47:51 Silhouette score for k = 10: 0.203415 
2024-04-02 11:47:51 Silhouette score for k = 11: 0.260133 
2024-04-02 11:47:51 Silhouette score for k = 12: 0.209998 
2024-04-02 11:47:51 Silhouette score for k = 13: 0.213460 
2024-04-02 11:47:51 Silhouette score for k = 14: 0.215232 
2024-04-02 11:47:51 Silhouette score for k = 15: 0.215156 
2024-04-02 11:47:51 Silhouette score for k = 16: 0.205498 
2024-04-02 11:47:51 Silhouette score for k = 17: 0.219110 
2024-04-02 11:47:52 Silhouette score for k = 18: 0.219432 
2024-

In [74]:
centroid_df["actual_cluters"].value_counts()

actual_cluters
1     16
0     12
4      6
5      6
3      6
2      4
6      2
7      1
8      1
10     1
9      1
Name: count, dtype: int64

In [75]:
centroid_df["optimal_clustering"].value_counts()

optimal_clustering
2    15
4    13
1    10
5     7
3     7
0     4
Name: count, dtype: int64

In [52]:
list(centroids.values())

[array([-0.06013463,  0.03040173,  0.14088206,  0.02119882,  0.05065428,
        -0.10089886,  0.03706199, -0.04794211,  0.12203306, -0.09159597,
         0.24053926,  0.00208954, -0.13659149, -0.05549773,  0.034036  ,
         0.04693156, -0.16591268, -0.07115627, -0.09181619, -0.06711394,
         0.0196701 ,  0.0555539 ,  0.06054448,  0.01386129, -0.13413001,
        -0.2512342 , -0.09957895, -0.15919246,  0.14910211, -0.05358325,
        -0.00195126,  0.01388312, -0.2570928 , -0.06269796, -0.05196904,
        -0.03478903,  0.05301379, -0.05796527,  0.1803456 ,  0.0994247 ,
        -0.1168328 , -0.02917274,  0.00244494,  0.22815734,  0.15016208,
        -0.04329354,  0.04014992,  0.00733304,  0.05316318, -0.19238127,
         0.06106708,  0.04400279,  0.23678577,  0.1201506 ,  0.00205568,
        -0.17737272,  0.05550496,  0.02182858, -0.17811502,  0.17376377,
         0.06659342, -0.06334658, -0.03384392,  0.09223738,  0.22027257,
         0.06508631, -0.04899423, -0.1876592 ,  0.1