In [5]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import re

In [6]:
class BayesianSets:
    """Bayesian Sets ranking over a sparse item-by-feature matrix.

    Each feature column gets a Beta prior whose parameters are derived from
    the column mean: ``alpha = c * mean`` and ``beta = c * (1 - mean)``.
    Given a set of query rows, every row of the dataset receives a score
    that is linear in its features (``constant + X @ query_weights``).

    The model expects feature values in ``[0, 1]``. Columns whose mean is
    not strictly between 0 and 1 (for instance a feature that is absent from
    every row, or present with value 1 in every row) have a degenerate prior
    (``alpha <= 0`` or ``beta <= 0``). Taking logs of those parameters yields
    ``-inf``/NaN, so such columns are treated as uninformative and contribute
    exactly zero to every score.

    Attributes:
        dataset: the input converted to ``scipy.sparse.csr_matrix``.
        alpha, beta: prior parameters, ndarrays of shape ``(1, n_features)``.
        logAlpha, logBeta, logAlphaBeta: element-wise logs of the above
            (``0.0`` in uninformative columns for ``logAlpha``/``logBeta``).
    """

    def __init__(self, dataset, c=2):
        """Store the dataset and precompute the prior.

        Args:
            dataset: anything ``csr_matrix`` accepts (dense array, sparse
                matrix, ...), one row per item and one column per feature.
            c: scale applied to the column means to obtain the prior
                parameters.
        """
        self.dataset = csr_matrix(dataset)

        # Column means as a plain (1, n_features) float array rather than the
        # np.matrix that sparse reductions return.
        m = np.asarray(self.dataset.mean(axis=0), dtype=float)

        self.alpha = c * m
        self.beta = c * (1 - m)

        # Columns with a proper prior; the others are neutralised below and in
        # compute_parameters so that log() never sees a non-positive value.
        self._informative = (self.alpha > 0) & (self.beta > 0)

        self.logAlpha = np.log(np.where(self._informative, self.alpha, 1.0))
        self.logBeta = np.log(np.where(self._informative, self.beta, 1.0))
        self.logAlphaBeta = np.log(self.alpha + self.beta)

    def compute_parameters(self, query_indices):
        """Compute the per-feature weights and the additive constant.

        Args:
            query_indices: positional row indices (into ``self.dataset``) of
                the items that form the query set.

        Returns:
            A tuple ``(query, constant)`` where ``query`` is an ndarray of
            shape ``(1, n_features)`` holding the feature weights and
            ``constant`` is the scalar offset shared by every row.
        """
        N = len(query_indices)
        informative = self._informative

        # Per-feature sum over the query rows, shape (1, n_features).
        sum_x = np.asarray(self.dataset[query_indices].sum(axis=0), dtype=float)

        # Posterior parameters after observing the query set.
        alphaTilde = self.alpha + sum_x
        betaTilde = self.beta + N - sum_x

        # Substitute 1.0 (log -> 0) in uninformative columns to avoid log(0).
        logAlphaTilde = np.log(np.where(informative, alphaTilde, 1.0))
        logBetaTilde = np.log(np.where(informative, betaTilde, 1.0))
        logAlphaBetaN = np.log(self.alpha + self.beta + N)

        query = np.where(
            informative,
            logAlphaTilde - self.logAlpha - logBetaTilde + self.logBeta,
            0.0,
        )
        constant = np.sum(
            np.where(
                informative,
                self.logAlphaBeta - logAlphaBetaN + logBetaTilde - self.logBeta,
                0.0,
            )
        )

        return query, constant

    def compute_scores(self, query_indices):
        """Score every row of the dataset against the query set.

        Args:
            query_indices: positional row indices of the query items.

        Returns:
            A 1-D ndarray with one score per dataset row; larger means the
            row fits the query set better.
        """
        q, nc = self.compute_parameters(query_indices)

        # Explicit matrix product: (n_items, n_features) @ (n_features, 1).
        Xq = self.dataset @ q.transpose()

        score = nc + Xq

        return np.asarray(score).flatten()

In [7]:
class TreatText:
    """Small text normaliser: strips parenthesised numbers and lowercases."""

    def removeNumbers(self, text):
        """Delete every parenthesised run of digits that follows whitespace.

        The leading whitespace character is removed together with the
        parentheses, e.g. ``'Toy Story (1995)'`` becomes ``'Toy Story'``.
        A parenthesised number with no whitespace before it is left alone.
        """
        # Raw string: '\s', '\(' and '\d' are regex escapes, not string escapes.
        return re.sub(r'\s\(\d+\)', '', text)

    def lower(self, text):
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def run(self, text):
        """Apply ``removeNumbers`` and then ``lower`` to ``text``."""
        t = self.removeNumbers(text)
        t = self.lower(t)
        return t

In [8]:

# Load movies from CSV; the first CSV column becomes the DataFrame index.
df = pd.read_csv('datasets/movies.csv', index_col=0)
# Normalise titles (strip " (1995)"-style suffixes, lowercase) so they can be
# compared with the lowercase query titles below.
df['title'] = df['title'].apply(TreatText().run)

# Find similar movies: titles that make up the query (seed) set.
# Matching is exact, so spelling matters ('aladdin', 'beauty and the beast').
# NOTE(review): assumes the dataset spells these titles the standard way.
query = np.array([
    'toy story',
    'the lion king',
    'aladdin',
    'beauty and the beast',
    'cinderella',
    'little mermaid',
    'hercules'
])

# Row *positions* of the query movies. The model indexes the TF-IDF matrix by
# position, while df.index holds labels taken from the CSV (index_col=0) that
# need not be 0..n-1, so the index labels must not be used here.
query_ids = np.flatnonzero(df['title'].isin(query).to_numpy())

# One TF-IDF row per title, built from df['title'] in row order.
X = TfidfVectorizer().fit_transform(df['title'])

model = BayesianSets(X)

# ranking = np.argsort(model.compute_scores(query_ids))[::-1]
# top10 = ranking[:10]