In [4]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import jaccard_score

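TODOs:
    
    - improve the similarity metric: cosine similarity currently suffers from a "sameness" issue, because it compares only the relative tag weights, so entities whose tags have similar proportions get a high score regardless of the absolute relevance values
    - pull tag names from the browser and translate them to ids 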

In [5]:
def get_conn():
    """Open a fresh sqlite3 connection to the movie database.

    The database path is relative to the current working directory.
    The caller is responsible for closing the returned connection.
    """
    return sqlite3.connect('./movie_sqlite.db')

In [6]:
def get_top_similar(tag_ids, entity_type=['movies','directors'][0], top_n=10,
                    metric=['euclidean', 'cosine'][0]):
    '''
    Rank entities by how close their tag-relevance vector is to an "ideal"
    vector in which every requested tag has relevance 1.

    tag_ids: list of tag ids to consider (in ascending order)
    entity_type: 'movies' selects ids starting with 'tt'; any other value
        selects director ids, which start with 'nm'
    top_n: maximum number of entities to return
    metric: 'euclidean' (smaller value = closer, sorted ascending) or
        'cosine' (larger value = more similar, sorted descending)

    return:
        list of tuples [(entity_id, similarity value), ...],
        list of tag ids

    An empty `tag_ids`, or a query that matches no entity, yields an empty
    list. An unknown `metric` raises KeyError.
    '''
    # Nothing to compare against; also avoids generating malformed SQL.
    if len(tag_ids) == 0:
        return [], tag_ids

    # NOTE(review): jaccard_score is a classification metric with a different
    # calling convention than the pairwise functions; it most likely does not
    # work with the call below. Kept so existing metric names still resolve.
    metric_function = {
        'euclidean': euclidean_distances,
        'cosine'   : cosine_similarity,
        'jaccard'   : jaccard_score,
    }[metric]

    # Director ids use the 'nm' prefix (the same one get_entity_name expects).
    prefix = 'tt' if entity_type == 'movies' else 'nm'

    # Cast to plain ints: they are embedded in column aliases, and numpy
    # integers are not accepted as sqlite3 bind values.
    ids = [int(tg) for tg in tag_ids]
    select_cols = ',\n'.join(
        f'sum(case when tag_id = ? then relevance end) tag_id_{tg}' for tg in ids
    )
    # Explicit placeholders instead of str(tuple(...)), which renders a single
    # id as "(1,)" -- invalid SQL.
    in_list = ', '.join('?' for _ in ids)
    sql = f"""
        select fk_id,
            {select_cols}
        from tag_relevance
        where tag_id in ({in_list})
        and fk_id like ?
        group by fk_id;
    """
    # Bind order follows placeholder order: select list, IN list, LIKE pattern.
    params = ids + ids + [f'{prefix}%']

    conn = get_conn()
    try:
        df = pd.read_sql(sql, conn, params=params).set_index('fk_id')
    finally:
        # Close the connection even when the query fails.
        conn.close()

    if df.empty:
        return [], tag_ids

    # An entity with no row for a tag gets NULL -> NaN from the sum(); the
    # sklearn pairwise functions reject NaN, so treat it as zero relevance.
    features = df.fillna(0.0)

    reference = np.ones((1, len(ids)))
    scores = metric_function(reference, features.to_numpy()).ravel()
    ranked = pd.Series(scores, index=features.index, name=f'{metric}_similarity')
    # Cosine is a similarity (higher is better); the others are sorted ascending.
    ranked = ranked.sort_values(ascending=(metric != 'cosine'))
    best = ranked.iloc[:top_n]
    return list(zip(best.index, best)), tag_ids

In [7]:
# TODO: return tag relevance scores for each movie as well

In [8]:
# Demo: the five movies (default entity_type) closest to tags 1, 3, 8 and 10
# by Euclidean distance; results are sorted ascending, so smaller = closer.
# NOTE: `top_n` below holds the returned (entity_id, score) list, not a count.
tags = [1, 3, 8, 10]
top_n, tag_ids = get_top_similar(tags, top_n=5, metric='euclidean')
# print(top_n)
# print(tag_ids)

In [9]:
# Demo: the same tag set ranked by cosine similarity; results are sorted
# descending, so larger = more similar.
# NOTE: `top_n` below holds the returned (entity_id, score) list, not a count.
tags = [1, 3, 8, 10]
top_n, tag_ids = get_top_similar(tags, top_n=5, metric='cosine')
# print(top_n)
# print(tag_ids)

In [11]:
# TODO: make tooltip dynamic to represent entity type (director or movie)

In [12]:
# entity lookup helpers (get_poster_img_link is still a stub awaiting implementation)

def get_entity_name(fk_id):
    """Return the display name of the entity identified by `fk_id`.

    The first two characters of `fk_id` select the table and name column:
    'nm' -> directors.name, 'tt' -> movies.primary_title.

    Raises:
        KeyError: if the id prefix is neither 'nm' nor 'tt'.
        IndexError: if no row with that id exists.
    """
    # One lookup keeps the table and its name column in sync.
    table, name_col = {
        'nm': ('directors', 'name'),
        'tt': ('movies', 'primary_title'),
    }[fk_id[:2]]

    # Table and column names come from the fixed mapping above; the id itself
    # is passed as a bound parameter rather than interpolated into the SQL.
    sql = f"""
        select {name_col}
        from {table}
        where id = ?
        ;
    """
    conn = get_conn()
    try:
        row = conn.execute(sql, (fk_id,)).fetchone()
    finally:
        # Close the connection even when the query fails.
        conn.close()

    if row is None:
        # Same exception type the previous `fetchall()[0][0]` produced, but
        # with a message that says what was missing.
        raise IndexError(f'no row in {table} with id {fk_id!r}')
    return row[0]

def get_poster_img_link(fk_id):
    """Return the poster image link for `fk_id` (stub).

    Currently ignores `fk_id` and returns a fixed placeholder string.

    TODO: create a db table, populate it with links, and write the query
    that pulls the link for the given id.
    """
    placeholder_link = 'url'
    return placeholder_link


In [13]:
# Look up a director by id; the 'nm' prefix routes the query to the directors table.
fk = 'nm0617588'
get_entity_name(fk)

'Georges Méliès'

In [14]:
# Look up a movie by id; the 'tt' prefix routes the query to the movies table.
fk = 'tt0000012'
get_entity_name(fk)

'The Arrival of a Train'

In [None]:
TASK: get YouTube URLs for trailers
    using an API, take a movie's name, year and genre and get the URL of its YouTube trailer
    embed the video into the website
    