In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('movies.csv')
df.describe()

Unnamed: 0,movieId
count,27278.0
mean,59855.48057
std,44429.314697
min,1.0
25%,6931.25
50%,68068.0
75%,100293.25
max,131262.0


In [3]:
df['title'] = [title.lower()for title in df['title']]
df['title'] = [re.sub('\([0-9]{4}\)', '',  title) for title in df['title']]
df.head()

Unnamed: 0,movieId,title,genres
0,1,toy story,Adventure|Animation|Children|Comedy|Fantasy
1,2,jumanji,Adventure|Children|Fantasy
2,3,grumpier old men,Comedy|Romance
3,4,waiting to exhale,Comedy|Drama|Romance
4,5,father of the bride part ii,Comedy


In [4]:
def getDtm(textList):
    vect = CountVectorizer(ngram_range=(1,1))
    vect.fit(textList)
    return vect.transform(textList)

In [5]:
def getM(textList, query):
    vect = CountVectorizer(ngram_range=(1,1))
    vect.fit(textList)
    return vect.transform(query)

In [6]:
def getQ(textList, m, query, c=2):
    sumM = getSumM(textList, query)
    
    alpha = c*m
    alphaL = alpha + sumM
    beta = c*(1-m)
    betaL = beta + len(textList) - sumM
    
    a = np.log(alphaL)
    a2 = np.log(alpha)
    b = np.log(betaL)
    b2 = np.log(beta)
    
    return a-a2-b+b2

In [7]:
query = ['toy sotry', 'old men']
dtm = getDtm(df['title'])
dtm.shape

(27278, 21887)

In [8]:
x = getM(df['title'], query)
x.todense()
x.shape

(2, 21887)

In [9]:
alpha = 2*np.mean(dtm, axis=0)
beta = 2*(1-np.mean(dtm, axis=0))
beta.shape

(1, 21887)

In [10]:
alphaL = alpha + x.sum(axis=0)
betaL = beta + len(query) - x.sum(axis=0)
betaL.shape

(1, 21887)

In [11]:
q = np.log(alphaL) - np.log(alpha) - np.log(betaL) + np.log(beta)
q.shape

(1, 21887)

In [12]:
nc = np.log(alpha + beta) - np.log(alpha + beta + len(query)) + np.log(betaL) - np.log(beta)
nc
nc.sum(axis=0)


matrix([[3.30099583e-04, 1.83302936e-05, 1.83302936e-05, ...,
         1.83302936e-05, 1.83302936e-05, 1.83302936e-05]])

In [13]:
nc

matrix([[3.30099583e-04, 1.83302936e-05, 1.83302936e-05, ...,
         1.83302936e-05, 1.83302936e-05, 1.83302936e-05]])

In [14]:
dtm.shape

(27278, 21887)

In [16]:
escore = dtm*q.transpose()
escore

matrix([[ 5.77931536],
        [-0.69316551],
        [ 8.74296626],
        ...,
        [-1.62229935],
        [-1.38633102],
        [-0.69338555]])

In [29]:
df['score'] = escore
df.sort_values(by=['score'], ascending=False)

Unnamed: 0,movieId,title,genres,score
2,3,grumpier old men,Comedy|Romance,8.742966
3361,3450,grumpy old men,Comedy,8.742948
12216,55820,no country for old men,Crime|Drama,7.343357
19186,95446,tin toy,Animation|Children,5.783654
5744,5843,toy soldiers,Action|Drama,5.783635
3027,3114,toy story 2,Adventure|Animation|Children|Comedy|Fantasy,5.779315
15401,78499,toy story 3,Adventure|Animation|Children|Comedy|Fantasy|IMAX,5.779315
0,1,toy story,Adventure|Animation|Children|Comedy|Fantasy,5.779315
4833,4929,"toy, the",Comedy,5.548199
21772,105271,"gathering of old men, a (murder on the bayou)",Drama,4.968993


In [27]:
pd.DataFrame.sort_values?