# Load data

In [132]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import pickle

from fontTools.misc.psOperators import ps_integer
from networkx.classes import neighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, silhouette_score, adjusted_rand_score, rand_score, adjusted_mutual_info_score
from sklearn.metrics import mutual_info_score
from sklearn.cluster import DBSCAN

In [3]:
# Load the data
def load_features_train(path=r"C:\Code\UTEC\ML\Project3\Project\data\pickle\features.pkl"):
    """Load the pickled training features and their video ids.

    The file is expected to hold two consecutive pickle records: first the
    feature vectors, then the matching video ids.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the pickle file. Defaults to the original project path so
        existing calls without arguments keep working.

    Returns
    -------
    tuple
        ``(feature_vectors, video_ids)`` exactly as stored in the file.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open files you trust.
    with open(path, "rb") as archivo:
        feature_vectors_f = pickle.load(archivo)
        video_ids_f = pickle.load(archivo)
    return feature_vectors_f, video_ids_f

In [4]:
# Read the pickled training feature vectors and their video ids into memory
feature_vectors, video_ids = load_features_train()

In [5]:
# Simplify the data using the mean
# Collapse every video's feature matrix into one vector by averaging over axis 0.
X = []
for sample_index, feature_vector in enumerate(feature_vectors):
    try:
        X.append(np.mean(feature_vector, axis=0))
    except Exception as exc:
        # Skipping the sample would shift every later vector relative to
        # video_ids. Append a NaN placeholder instead so both lists stay
        # aligned; the NaN-removal step then drops the vector and its id together.
        print(f"Error computing the mean of sample {sample_index}: {exc!r}")
        X.append(np.full(1, np.nan))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [7]:
# Remove nan values
# Popping while iterating over a precomputed range skips the element after
# each removal and can index past the shrunken list. Compute the keep-mask
# first, then filter both lists with it so they stay aligned.
assert len(X) == len(video_ids), "X and video_ids must be aligned before filtering"
keep_mask = [not np.isnan(vector).any() for vector in X]
# Slice assignment keeps the in-place semantics of the original pop() calls.
X[:] = [vector for vector, keep in zip(X, keep_mask) if keep]
video_ids[:] = [video_id for video_id, keep in zip(video_ids, keep_mask) if keep]

In [8]:
# Stack the per-video mean vectors into one array, then print its shape
# next to the number of video ids so the two can be compared.
X = np.array(X)
print(X.shape)
print(len(video_ids))

(10694, 512)
10694


# Get labels

In [103]:
# train_subset.csv holds one row per video with its youtube_id and label
train_subset_labels = pd.read_csv(r"C:\Code\UTEC\ML\Project3\Project\data\csv\train_subset.csv")
# keep only the rows whose youtube_id is among the loaded video_ids
has_features = train_subset_labels["youtube_id"].isin(video_ids)
train_subset_labels = train_subset_labels.loc[has_features]

train_subset_labels

Unnamed: 0,youtube_id,label
0,GcTww0NOCk0,hopscotch
1,zeIkGEHK46I,riding camel
2,-Fwy8NwefTk,shot put
3,YYgESo5eQD8,situp
4,38Ml6v4vPzY,playing clarinet
...,...,...
10712,M0O8lByHQ-k,hopscotch
10713,XhZTEPRW5lk,shot put
10714,egfzXatr0jc,playing clarinet
10715,mQ0-F4Y24xo,playing ice hockey


In [104]:
# sort the labels to have the same order as in the feature vectors and video_ids
# Build the id -> label lookup once instead of scanning the whole DataFrame
# with a boolean mask for every video (quadratic in the number of videos).
label_by_id = {}
for youtube_id, label in zip(train_subset_labels["youtube_id"], train_subset_labels["label"]):
    # setdefault keeps the first row of a repeated id, like the previous first-match lookup
    label_by_id.setdefault(youtube_id, label)
# A video id without a label row raises KeyError here.
Y_train_no_encode = [label_by_id[video_id] for video_id in video_ids]
    

In [105]:
# encode the labels to numbers
# the fitted encoder is kept in labelEncoder after this cell
labelEncoder = LabelEncoder()
Y_train = labelEncoder.fit_transform(Y_train_no_encode)

In [106]:
# Shape of the encoded label array
print(Y_train.shape)

(10694,)


In [112]:
# First five label strings, in the same order as video_ids
print(Y_train_no_encode[0:5])

['balloon blowing', 'balloon blowing', 'cooking chicken', 'baking cookies', 'golf driving']


In [123]:
# Distinct encoded class ids present in Y_train
print(np.unique(Y_train))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


# UMAP

In [13]:
# UMAP is stochastic: fix the seed so re-running the notebook reproduces the
# same 30-dimensional embedding (and therefore the same clusters downstream).
umap_red = umap.UMAP(n_components=30, random_state=42)
X_umap = umap_red.fit_transform(X)

In [14]:
# Shape of the UMAP embedding
print(X_umap.shape)

(10694, 30)


In [40]:
# save the UMAP embedding (the transformed data X_umap, not the fitted umap_red model)
with open(r"C:\Code\UTEC\ML\Project3\Project\data\pickle\x_umap.pkl", "wb") as archivo:
        pickle.dump(X_umap, archivo)

In [41]:
# reload the saved UMAP embedding into a separate variable and print its shape
with open(r"C:\Code\UTEC\ML\Project3\Project\data\pickle\x_umap.pkl", "rb") as archivo:
        X_umap_temp = pickle.load(archivo)
print(X_umap_temp.shape)

(10694, 30)


# DBScan

In [31]:
from sklearn.preprocessing import StandardScaler

# Standardize the UMAP embedding; X_umap is overwritten with the scaled values,
# so the unscaled embedding is no longer available under this name afterwards.
X_umap = StandardScaler().fit_transform(X_umap)


In [32]:
# Cluster the scaled embedding with scikit-learn's DBSCAN and keep the per-sample labels
dbscan = DBSCAN(eps=1.5, min_samples=4).fit(X_umap)
labels_dbscan_umap_train = dbscan.labels_

In [34]:
# Number of cluster labels next to the number of embedded samples
print(len(labels_dbscan_umap_train), len(X_umap))

10694 10694


In [35]:
# Distinct cluster labels produced by DBSCAN
print(np.unique(labels_dbscan_umap_train))

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]


In [36]:
# Silhouette is computed from the embedding and the predicted labels only;
# the other two scores compare the predicted labels against Y_train.
silhouette_scr = silhouette_score(X_umap, labels_dbscan_umap_train)
adjusted_rand_scr = adjusted_rand_score(Y_train, labels_dbscan_umap_train)
# mutual_info_score is not normalized, so its value can exceed 1
mutual_info_scr = mutual_info_score(Y_train, labels_dbscan_umap_train)
print(silhouette_scr, adjusted_rand_scr, mutual_info_scr)

0.48215133 0.7471182007735846 2.531974034583729


In [37]:
# Side-by-side preview of the first rows: video id, encoded true label, cluster label
n_preview = 10
prueba = dict(
    ids_train=video_ids[:n_preview],
    Y_train=Y_train[:n_preview],
    Y_pred=labels_dbscan_umap_train[:n_preview],
)
df_prueba = pd.DataFrame(prueba)
df_prueba

Unnamed: 0,ids_train,Y_train,Y_pred
0,--gx7yb1-x0,1,0
1,--Ntf6n-j9Q,1,0
2,-07Ke73N4zI,2,1
3,-1MXpPymXFU,0,1
4,-2KvnLMnrA0,6,2
5,-36efvC2K54,0,1
6,-3tVVBhz8-o,1,0
7,-5oULXqj45c,13,3
8,-5s-IR39XaY,5,4
9,-5Vx7UtZpzk,5,4


# DBScan implementation

In [102]:
from sklearn.neighbors import KDTree
from collections import deque
class DBScan_custom:
    """Minimal DBSCAN clustering that uses a KD-tree for the radius search.

    A point is treated as a core point when its ``eps``-radius neighbour list
    (as returned by ``KDTree.query_radius`` on the training data) has at least
    ``min_samples`` entries. Cluster ids start at 1; points that never join a
    cluster keep the label -1.

    Parameters
    ----------
    eps : float, default 0.5
        Radius used for the neighbour search.
    min_samples : int, default 5
        Minimum neighbour-list size for a point to be a core point.
    """

    def __init__(self, eps=0.5, min_samples=5):
        self.eps = eps
        self.min_samples = min_samples
        self.X = None
        self.labels = None

    def fit(self, X):
        """Cluster ``X`` and store one label per row in ``self.labels``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training points.

        Returns
        -------
        DBScan_custom
            ``self``, so calls can be chained.
        """
        self.X = X
        n_points = len(X)
        # load in KDtree for efficient search of neighbours
        kdtree = KDTree(X)
        # every point starts as noise
        self.labels = np.full(n_points, -1)
        # neighbour indices within the eps radius, one array per point
        nns = kdtree.query_radius(X, r=self.eps)
        cluster_number = 0
        visited_points = np.zeros(n_points, dtype=bool)
        # iterate through all the points to find the clusters
        for i in range(n_points):
            # skip if the point is already assigned to a cluster
            if self.labels[i] != -1:
                continue
            visited_points[i] = True
            # only core points can seed a new cluster
            if len(nns[i]) < self.min_samples:
                continue
            cluster_number += 1
            self.labels[i] = cluster_number
            # expand the cluster depth-first from the seed's neighbours
            nns_p = deque(nns[i])
            while nns_p:
                j = nns_p.pop()
                if not visited_points[j]:
                    visited_points[j] = True
                    nns_nn_p = nns[j]
                    # only core points propagate the cluster to their neighbours
                    if nns_nn_p.size >= self.min_samples:
                        nns_p.extend(nns_nn_p)
                # unlabelled points (core or border) join the current cluster;
                # points already claimed by another cluster are left untouched
                if self.labels[j] == -1:
                    self.labels[j] = cluster_number
        return self

    def labels_(self):
        """Return the label array computed by ``fit`` (``None`` before fitting)."""
        return self.labels

    def predict(self, X_test=None):
        """Label points with the cluster of their nearest training point.

        Parameters
        ----------
        X_test : array-like of shape (n_queries, n_features), optional
            Points to label. When omitted, a copy of the training labels is
            returned.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            One label per query point; -1 when the nearest training point is noise.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if self.X is None or self.labels is None:
            raise ValueError("DBScan_custom must be fitted before calling predict")
        if X_test is None:
            return self.labels.copy()
        kdtree = KDTree(self.X)
        _, nns_1 = kdtree.query(X_test, k=1)
        # query() returns indices of shape (n_queries, 1); take the single
        # column so the result is a flat label vector instead of an (n, 1) array
        return self.labels[nns_1[:, 0]]

        

In [98]:
# Fit the custom DBSCAN on the embedding and read back the training labels.
# predict() requires the points to label; the labels of the training data
# itself are the ones stored by fit(), exposed through labels_().
dbscan = DBScan_custom(eps=1.5, min_samples=4).fit(X_umap)
labels_dbscan_umap_train = dbscan.labels_()

In [99]:
# Number of labels returned by the custom DBSCAN
print(len(labels_dbscan_umap_train))

10694


In [100]:
# Distinct cluster labels produced by the custom DBSCAN
print(np.unique(labels_dbscan_umap_train))

[-1  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]


In [133]:
def metrics_clustering(X, Y_true, Y_pred):
    """Score a clustering against the data and against reference labels.

    Parameters
    ----------
    X : array-like
        Samples the clustering was computed on (used by the silhouette score).
    Y_true : array-like
        Reference labels.
    Y_pred : array-like
        Cluster labels to evaluate.

    Returns
    -------
    tuple
        ``(silhouette, rand, mutual_info, adjusted_mutual_info)`` in that order.
    """
    return (
        silhouette_score(X, Y_pred),
        rand_score(Y_true, Y_pred),
        mutual_info_score(Y_true, Y_pred),
        adjusted_mutual_info_score(Y_true, Y_pred),
    )

In [134]:
# Score the current cluster labels; printed order is
# silhouette, Rand index, mutual information, adjusted mutual information.
silhouette_scr, rand_scr, mutual_info_scr, adjusted_mutual_info_scr = metrics_clustering(X_umap, Y_train, labels_dbscan_umap_train)
print(silhouette_scr, rand_scr, mutual_info_scr, adjusted_mutual_info_scr)

0.49123225 0.9676384126332777 2.5308477014338058 0.9116324925219547


In [None]:
# Find the best parameters for DBScan
# TODO: Implement a grid search, apply the model to the test data, submit the results

# Assign actions to the labels based on majority voting (Train)

In [120]:
import statistics
def assign_actions(labels, Y):
    """Give every sample the most common true action of its cluster.

    Parameters
    ----------
    labels : array-like of int
        Cluster label per sample. Lists and column vectors of shape (n, 1)
        are accepted and flattened.
    Y : sequence
        True action per sample, aligned with ``labels``.

    Returns
    -------
    list
        For each sample, the majority action among all samples sharing its
        cluster label. Every distinct label (including -1) is treated as its
        own group.

    Raises
    ------
    ValueError
        If ``labels`` and ``Y`` differ in length.
    """
    # Coerce to a flat array: with a plain list, `labels == label` is a single
    # bool, which selects nothing and makes statistics.mode fail on empty data.
    labels = np.asarray(labels).ravel()
    if len(labels) != len(Y):
        raise ValueError("labels and Y must have the same length")
    actions = [""] * len(labels)
    for label in np.unique(labels):
        index_labels = np.flatnonzero(labels == label)
        action_majority = statistics.mode([Y[i] for i in index_labels])
        for index in index_labels:
            actions[index] = action_majority
    return actions
        
        

In [124]:
# Replace each cluster label with the majority true action of that cluster
Y_pred_train_string = assign_actions(labels_dbscan_umap_train, Y_train_no_encode)

In [122]:
# First ten majority-vote predictions
print(Y_pred_train_string[0:10])

['balloon blowing', 'balloon blowing', 'cooking chicken', 'cooking chicken', 'golf driving', 'cooking chicken', 'balloon blowing', 'shot put', 'flipping pancake', 'flipping pancake']


In [125]:
from sklearn.metrics import accuracy_score
# Fraction of training videos whose cluster-majority action equals their true label
accuracy_score(Y_train_no_encode, Y_pred_train_string)

0.7610809799887788

# Load Data test

In [69]:
def load_features_test(path=r"C:\Code\UTEC\ML\Project3\Project\data\pickle\features_test.pkl"):
    """Load the pickled test features and their video ids.

    The file is expected to hold two consecutive pickle records: first the
    feature vectors, then the matching video ids.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the pickle file. Defaults to the original project path so
        existing calls without arguments keep working.

    Returns
    -------
    tuple
        ``(feature_vectors, video_ids)`` exactly as stored in the file.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open files you trust.
    with open(path, "rb") as archivo:
        feature_vectors_test_f = pickle.load(archivo)
        video_ids_test_f = pickle.load(archivo)
    return feature_vectors_test_f, video_ids_test_f

In [70]:
# Read the pickled test feature vectors and their video ids into memory
feature_vectors_test, video_ids_test = load_features_test()

In [71]:
# Collapse every test video's feature matrix into one vector by averaging over axis 0.
X_test = []
for sample_index, feature_vector in enumerate(feature_vectors_test):
    try:
        X_test.append(np.mean(feature_vector, axis=0))
    except Exception as exc:
        # Skipping the sample would shift every later vector relative to
        # video_ids_test. Append a NaN placeholder instead so both lists stay
        # aligned; the NaN-removal step then drops the vector and its id together.
        print(f"Error computing the mean of sample {sample_index}: {exc!r}")
        X_test.append(np.full(1, np.nan))

In [72]:
# Popping while iterating over a precomputed range skips the element after
# each removal and can index past the shrunken list. Compute the keep-mask
# first, then filter both lists with it so they stay aligned.
assert len(X_test) == len(video_ids_test), "X_test and video_ids_test must be aligned before filtering"
keep_mask_test = [not np.isnan(vector).any() for vector in X_test]
# Slice assignment keeps the in-place semantics of the original pop() calls.
X_test[:] = [vector for vector, keep in zip(X_test, keep_mask_test) if keep]
video_ids_test[:] = [video_id for video_id, keep in zip(video_ids_test, keep_mask_test) if keep]

In [73]:
# Stack the per-video test vectors into one array, then print its shape
# next to the number of test video ids so the two can be compared.
X_test = np.array(X_test)
print(X_test.shape)
print(len(video_ids_test))

(1626, 512)
1626


# Get labels test