# Load data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, silhouette_score, adjusted_rand_score
from sklearn.metrics import mutual_info_score
from sklearn.cluster import DBSCAN

In [2]:
# Load the data
def load_features_train():
    with open(r"C:\Code\UTEC\ML\Project3\Project\data\pickle\features.pkl", "rb") as archivo:
        feature_vectors_f = pickle.load(archivo)
        video_ids_f = pickle.load(archivo)
    return feature_vectors_f, video_ids_f

In [3]:
feature_vectors, video_ids = load_features_train()

In [5]:
# Simplify the data using the mean
X = []
for feature_vector in feature_vectors:
    try:
        X.append(np.mean(feature_vector, axis=0))
    except:
        print("Error")

In [7]:
# Remove nan values
for i in range(len(X)):
    if np.isnan(X[i]).any():
        X.pop(i)
        video_ids.pop(i)

In [8]:
X = np.array(X)
print(X.shape)
print(len(video_ids))

(10694, 512)
10694


# Get labels

In [19]:
# get labels from train_subset.csv which contains the id and label of the videos
train_subset_labels = pd.read_csv(r"C:\Code\UTEC\ML\Project3\Project\data\csv\train_subset.csv")
# select the labels that are in video_ids
train_subset_labels = train_subset_labels[train_subset_labels["youtube_id"].isin(video_ids)]

train_subset_labels

Unnamed: 0,youtube_id,label
0,GcTww0NOCk0,hopscotch
1,zeIkGEHK46I,riding camel
2,-Fwy8NwefTk,shot put
3,YYgESo5eQD8,situp
4,38Ml6v4vPzY,playing clarinet
...,...,...
10712,M0O8lByHQ-k,hopscotch
10713,XhZTEPRW5lk,shot put
10714,egfzXatr0jc,playing clarinet
10715,mQ0-F4Y24xo,playing ice hockey


In [25]:
Y_train = []
# sort the labels to have the same order as in the feature vectors and video_ids
for video_id in video_ids:
    sort_label = train_subset_labels[train_subset_labels["youtube_id"] == video_id]["label"].values[0]
    Y_train.append(sort_label)
    

In [30]:
# encode the lables to numbers
labelEncoder = LabelEncoder()
Y_train = labelEncoder.fit_transform(Y_train)

In [31]:
print(Y_train.shape)

(10694,)


# UMAP

In [32]:
umap_red = umap.UMAP(n_components=30)
X_umap = umap_red.fit_transform(X)

In [33]:
print(X_umap.shape)

(10694, 30)


# DBScan

In [34]:
from sklearn.preprocessing import StandardScaler

X_umap = StandardScaler().fit_transform(X_umap)


In [35]:
dbscan = DBSCAN(eps=1.5, min_samples=4).fit(X_umap)
labels_dbscan_umap_train = dbscan.labels_

In [36]:
silhouette_scr = silhouette_score(X_umap, labels_dbscan_umap_train)
adjusted_rand_scr = adjusted_rand_score(Y_train, labels_dbscan_umap_train)
mutual_info_scr = mutual_info_score(Y_train, labels_dbscan_umap_train)
print(silhouette_scr, adjusted_rand_scr, mutual_info_scr)

0.48215133 0.7471182007735846 2.531974034583729


In [37]:
prueba = {
    'ids_train': video_ids[0:10],
    'Y_train': Y_train[0:10],
    'Y_pred' : labels_dbscan_umap_train[0:10]
}
df_prueba = pd.DataFrame(prueba)
df_prueba

Unnamed: 0,ids_train,Y_train,Y_pred
0,--gx7yb1-x0,1,0
1,--Ntf6n-j9Q,1,0
2,-07Ke73N4zI,2,1
3,-1MXpPymXFU,0,1
4,-2KvnLMnrA0,6,2
5,-36efvC2K54,0,1
6,-3tVVBhz8-o,1,0
7,-5oULXqj45c,13,3
8,-5s-IR39XaY,5,4
9,-5Vx7UtZpzk,5,4
