## LESCO Distance-Based Evaluation

In [1]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
import keras
import tensorflow as tf
from scipy import spatial # cosine similarity
import pandas as pd
from scipy.spatial import distance

In [2]:
# Load all pre-processed videos.
# The reader splits on tabs, so each comma-separated row arrives as a single
# field (row[0]) that is then split on commas by hand.
videos = []
with open("videos.csv", "r") as f:
    for row in csv.reader(f, delimiter="\t"):
        if not row:  # blank line: nothing to parse
            continue
        videos.append([float(x) for x in row[0].split(',')])

# Load the labels, stripping any single or double quotes around each one.
# Labels are accumulated across rows so that a file with more than one row
# does not silently keep only its last row.
labels = []
with open("labels.csv", "r") as f:
    for row in csv.reader(f, delimiter="\t"):
        if not row:  # blank line: nothing to parse
            continue
        labels.extend(x.replace("'", "").replace('"', "") for x in row[0].split(','))


In [3]:
# Train/test split: hold out 20% of the videos; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(videos, labels, test_size=0.20, random_state=43)

print("Training", len(X_train))
print("Testing", len(X_test))  # was mislabelled "Training"

Training 523
Training 131


## Similarity Functions

In [4]:
def _pad_vector(vector, padding):
    """Return a zero-padded float copy of ``vector`` with at least ``padding`` items.

    A copy is always made, so the caller's sequence is never modified.
    Vectors already longer than ``padding`` keep their full length.
    """
    values = list(vector)
    values.extend([0] * (padding - len(values)))  # non-positive count adds nothing
    return np.asarray(values, dtype=float)


def _rank_labels(train_data, test_item, similarity_fn, padding, train_labels):
    """Return the training labels ordered from most to least similar to ``test_item``.

    ``similarity_fn(a, b)`` receives two padded float arrays and returns a
    score where larger means more similar. The result is a pandas Series named
    ``"y_train"`` that keeps the original row positions as its index.
    """
    if train_labels is None:
        # Backward-compatible default: the labels of the notebook's current split.
        train_labels = y_train

    query = _pad_vector(test_item, padding)
    similarity = [similarity_fn(_pad_vector(row, padding), query) for row in train_data]

    df = pd.DataFrame({
        "y_train": list(train_labels),
        "similarity": similarity,
    })
    # A stable sort keeps tied rows in training order, so results are deterministic.
    df = df.sort_values(by=["similarity"], ascending=False, kind="mergesort")
    return df["y_train"]


def get_cosine(train_data, test_item, padding = 50, train_labels = None):
    """Classify ``test_item`` with the label of its most cosine-similar training row.

    Args:
        train_data: sequence of numeric feature sequences.
        test_item: numeric feature sequence to classify.
        padding: length every vector is zero-padded to before comparison.
        train_labels: labels aligned with ``train_data``; when omitted, the
            notebook-level ``y_train`` is used.

    Returns:
        The label of the training row with the highest similarity.

    Raises:
        IndexError: if ``train_data`` is empty.
        ValueError: if padded vectors differ in length or the labels do not
            match ``train_data`` in length.
    """
    ranked = _rank_labels(
        train_data, test_item, lambda a, b: 1 - distance.cosine(a, b), padding, train_labels
    )
    return ranked.iloc[0]  # most similar class


def get_cosine_knn(train_data, test_item, k = 2, padding = 50, train_labels = None):
    """Return all training labels ranked by cosine similarity to ``test_item``.

    The full ranking is returned as a pandas Series; callers slice off as many
    neighbours as they need. ``k`` is not used and is kept only so existing
    calls remain valid. Other arguments are as for ``get_cosine``.
    """
    return _rank_labels(
        train_data, test_item, lambda a, b: 1 - distance.cosine(a, b), padding, train_labels
    )


def get_euclidean(train_data, test_item, padding = 50, train_labels = None):
    """Classify ``test_item`` with the label of its nearest training row (Euclidean).

    Similarity is ``1 - ||a - b||``. Arguments, return value and errors are as
    for ``get_cosine``.
    """
    ranked = _rank_labels(
        train_data, test_item, lambda a, b: 1 - np.linalg.norm(a - b), padding, train_labels
    )
    return ranked.iloc[0]  # most similar class


def get_manhattan(train_data, test_item, padding = 50, train_labels = None):
    """Classify ``test_item`` with the label of its nearest training row (Manhattan).

    Similarity is ``1 - sum(|a - b|)``. Arguments, return value and errors are
    as for ``get_cosine``.
    """
    ranked = _rank_labels(
        train_data, test_item, lambda a, b: 1 - np.abs(a - b).sum(), padding, train_labels
    )
    return ranked.iloc[0]  # most similar class


def get_dotproduct(train_data, test_item, padding = 50, train_labels = None):
    """Classify ``test_item`` with the label of the training row with the largest dot product.

    Arguments, return value and errors are as for ``get_cosine``.
    """
    ranked = _rank_labels(
        train_data, test_item, lambda a, b: np.dot(a, b), padding, train_labels
    )
    return ranked.iloc[0]  # most similar class


def get_chebyshev(train_data, test_item, padding = 50, train_labels = None):
    """Classify ``test_item`` with the label of its nearest training row (Chebyshev).

    Similarity is ``1 - max(|a - b|)``. Arguments, return value and errors are
    as for ``get_cosine``.
    """
    ranked = _rank_labels(
        train_data, test_item, lambda a, b: 1 - distance.chebyshev(a, b), padding, train_labels
    )
    return ranked.iloc[0]  # most similar class


def get_minkowski(train_data, test_item, padding = 50, train_labels = None):
    """Classify ``test_item`` with the label of its nearest training row (Minkowski).

    Uses SciPy's default order for ``distance.minkowski``. Arguments, return
    value and errors are as for ``get_cosine``.
    """
    ranked = _rank_labels(
        train_data, test_item, lambda a, b: 1 - distance.minkowski(a, b), padding, train_labels
    )
    return ranked.iloc[0]  # most similar class


# Smoke test: classify the first held-out video with the cosine classifier.
get_cosine(train_data=X_train, test_item=X_test[0])
    

'tio'

50

## Test Set Validation

In [11]:
# Test-set accuracy of the 1-nearest-neighbour cosine classifier.
count = 0
for test, label in zip(X_test, y_test):
    y_pred = get_cosine(X_train, test)
    count += int(y_pred == label)  # add 1 for a correct prediction, 0 otherwise
total = len(X_test)

print(f"Cosine Normal Fn Test Set Accuracy: {round(count / total, 2)}")
    

Cosine Normal Fn Test Set Accuracy: 0.6


In [12]:
# cosine knn mix

def most_common(lst):
    """Return the value that occurs most often in ``lst``.

    Counting is done in a single pass. When several values share the highest
    count, the one that appears first in ``lst`` is returned, so the result
    does not depend on hash ordering.

    Raises:
        ValueError: if ``lst`` is empty.
        TypeError: if an item is unhashable.
    """
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1
    # dicts keep insertion order and max() returns the first maximum it meets.
    return max(counts, key=counts.get)

# Test-set accuracy when predicting the majority label among the 10 training
# videos ranked most similar by cosine similarity.
count = 0
for test, label in zip(X_test, y_test):
    nearest_labels = get_cosine_knn(X_train, test).iloc[:10].to_list()
    y_pred = most_common(nearest_labels)
    count += int(y_pred == label)  # add 1 for a correct prediction, 0 otherwise
total = len(X_test)

print(f"Cosine KNN Fn Test Set Accuracy: {round(count / total, 2)}")


Cosine KNN Fn Test Set Accuracy: 0.27


In [13]:
# Test-set accuracy of the 1-nearest-neighbour Euclidean classifier.
count = 0
for test, label in zip(X_test, y_test):
    y_pred = get_euclidean(X_train, test)
    count += int(y_pred == label)  # add 1 for a correct prediction, 0 otherwise
total = len(X_test)

print(f"Euclidean Normal Fn Test Set Accuracy: {round(count / total, 2)}")

Euclidean Normal Fn Test Set Accuracy: 0.59


In [14]:
# Test-set accuracy of the 1-nearest-neighbour Manhattan classifier.
count = 0
for test, label in zip(X_test, y_test):
    y_pred = get_manhattan(X_train, test)
    count += int(y_pred == label)  # add 1 for a correct prediction, 0 otherwise
total = len(X_test)

print(f"Manhattan Normal Fn Test Set Accuracy: {round(count / total, 2)}")

Manhattan Normal Fn Test Set Accuracy: 0.74


In [15]:
# Test-set accuracy of the largest-dot-product classifier.
count = 0
for test, label in zip(X_test, y_test):
    y_pred = get_dotproduct(X_train, test)
    count += int(y_pred == label)  # add 1 for a correct prediction, 0 otherwise
total = len(X_test)

print(f"Dot Product Fn Test Set Accuracy: {round(count / total, 2)}")

Dot Product Fn Test Set Accuracy: 0.21


In [22]:
# Test-set accuracy of the 1-nearest-neighbour Chebyshev classifier.
count = 0
for test, label in zip(X_test, y_test):
    y_pred = get_chebyshev(X_train, test)
    count += int(y_pred == label)  # add 1 for a correct prediction, 0 otherwise
total = len(X_test)

print(f"Chebyshev Fn Test Set Accuracy: {round(count / total, 2)}")

Chebyshev Fn Test Set Accuracy: 0.29


In [30]:
# Test-set accuracy of the 1-nearest-neighbour Minkowski classifier.
count = 0
for test, label in zip(X_test, y_test):
    y_pred = get_minkowski(X_train, test)
    count += int(y_pred == label)  # add 1 for a correct prediction, 0 otherwise
total = len(X_test)

print(f"Minkowski Fn Test Set Accuracy: {round(count / total, 2)}")

Minkowski Fn Test Set Accuracy: 0.59


## Cross Validation Manhattan
Cross-validation (five random 80/20 train/test splits) shows that Manhattan is the best similarity measure so far.

In [33]:
# Repeated random hold-out for the Manhattan classifier: five 80/20 splits.
# Each split is seeded with its loop index so the scores can be reproduced.
scores = []
for i in range(5):
    # The notebook-level split variables are rebound on purpose: get_manhattan
    # takes its labels from the notebook-level y_train by default.
    X_train, X_test, y_train, y_test = train_test_split(
        videos, labels, test_size=0.20, random_state=i
    )
    total = len(X_test)
    count = 0
    for test, label in zip(X_test, y_test):
        y_pred = get_manhattan(X_train, test)
        if y_pred == label:
            count += 1
    scores.append(round(count / total, 2))
print("Scores:", scores)
print("Manhattan Cross Validation", np.mean(scores))
    

Scores: [0.75, 0.79, 0.79, 0.72, 0.79]
Manhattan Cross Validation 0.768
