# Coursework 1 : Movie Recommendation from Text


In [1]:
from collections import Counter
import math
import pandas as pd
import pickle

Load documents from file and process them

In [2]:
# Movie metadata: tab-separated, no header row. Only columns 0-3 and 8 are
# read and given explicit names.
movies_meta = pd.read_csv(
    "data/movie.metadata.tsv",
    sep='\t',
    header=None,
    usecols=[0, 1, 2, 3, 8],
    names=['wID', 'fID', 'title', 'data', 'genres'],
)

# Plot summaries: tab-separated, no header row; column 0 is the id used for
# the join below, column 1 the plot text.
movies_plot = pd.read_csv(
    "data/plot_summaries.txt",
    sep='\t',
    header=None,
    usecols=[0, 1],
    names=['wID', 'plot'],
)

# Inner join: keep only the movies that have both metadata and a plot summary.
movies_merged = movies_meta.merge(movies_plot, on='wID', how='inner')

print("Retrieved {} ".format(len(movies_merged)))

# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# files must come from a trusted source.
with open("data/vocabulary.pk", "rb") as pickle_in:
    vocabulary = pickle.load(pickle_in)

# tf_idf is indexed by a movie's wID and yields a term -> weight mapping
# (that is how the functions in this notebook use it).
with open("data/tf_idf_small.pk", "rb") as pickle_in:
    tf_idf = pickle.load(pickle_in)
    
# print stats

Retrieved 42204 


### Functions to compute the similarity between a target and other documents

In [3]:
def analyze(target_row):
    """Print a movie's title and its highest-weighted TF-IDF terms.

    Args:
        target_row: row-like object providing 'wID' (key into the
            module-level ``tf_idf`` mapping) and 'title'.

    Prints the title, then a list of at most 20 terms ordered from the
    highest to the lowest TF-IDF weight. Returns nothing.
    """
    num_words = 20
    weights = tf_idf[target_row['wID']]
    # Rank the terms by weight, heaviest first; ties keep their stored order.
    ranked_terms = sorted(weights, key=weights.get, reverse=True)
    print(target_row['title'])
    print("Higher TF-IDF: {}".format(ranked_terms[:num_words]))


def cosine_dic(dic1,dic2):
    """Cosine similarity between two sparse vectors stored as dicts.

    Args:
        dic1: mapping term -> numeric weight.
        dic2: mapping term -> numeric weight.

    Returns:
        dot(dic1, dic2) / (||dic1|| * ||dic2||). Terms missing from one of
        the dicts count as weight 0. When either vector has zero norm (for
        example an empty dict) the similarity is undefined; 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    norm_sq1 = sum(weight * weight for weight in dic1.values())
    norm_sq2 = sum(weight * weight for weight in dic2.values())
    denominator = math.sqrt(norm_sq1 * norm_sq2)
    if denominator == 0:
        # An empty or all-zero vector has no direction to compare.
        return 0.0
    numerator = sum(weight * dic2.get(term, 0.0) for term, weight in dic1.items())
    return numerator / denominator
    
def similar(target_1, target_2):
    """Cosine similarity between the TF-IDF vectors of two movies.

    Args:
        target_1: row-like object whose 'wID' is a key of ``tf_idf``.
        target_2: row-like object whose 'wID' is a key of ``tf_idf``.

    Returns:
        The value of ``cosine_dic`` applied to the two movies' vectors.
    """
    vectors = [tf_idf[movie['wID']] for movie in (target_1, target_2)]
    return cosine_dic(*vectors)
    
    
def rocchio(pos, neg, alpha=0.65):
    """Build Rocchio-style class vectors from positive and negative examples.

    For each example set, the TF-IDF weights of its documents are summed per
    term, each document's weights first being divided by that document's
    number of distinct terms. The two sums ``a`` (positive) and ``b``
    (negative) are then combined for every term in either set:

        c_pos[k] = alpha * a[k] - (1 - alpha) * b[k]
        c_neg[k] = alpha * b[k] - (1 - alpha) * a[k]

    The formula is applied uniformly, so a term occurring in only one of the
    sets is scaled by ``alpha`` (or ``-(1 - alpha)``) exactly like a shared
    term.

    Args:
        pos: iterable of row-like objects whose 'wID' is a key of ``tf_idf``;
            the positive examples.
        neg: iterable of row-like objects, same shape; the negative examples.
        alpha: weight given to a class's own examples; ``1 - alpha`` is the
            weight given to the opposing class.

    Returns:
        Tuple ``(c_pos, c_neg)`` of term -> weight dicts.
    """
    def _accumulate(examples):
        # Sum the length-normalised term weights over all example documents.
        totals = {}
        for example in examples:
            weights = tf_idf[example['wID']]
            num_w = len(weights)
            for term, weight in weights.items():
                totals[term] = totals.get(term, 0.0) + weight / num_w
        return totals

    def _combine(own, other):
        # alpha * own - (1 - alpha) * other over the union of both term sets.
        combined = {term: weight * alpha for term, weight in own.items()}
        for term, weight in other.items():
            combined[term] = combined.get(term, 0.0) - weight * (1 - alpha)
        return combined

    a = _accumulate(pos)
    b = _accumulate(neg)
    return _combine(a, b), _combine(b, a)
    
def is_relevant(target, c_pos, c_neg):
    """Decide whether a movie is closer to the positive than the negative class.

    Args:
        target: row-like object whose 'wID' is a key of ``tf_idf``.
        c_pos: term -> weight dict for the positive class.
        c_neg: term -> weight dict for the negative class.

    Returns:
        True when the cosine similarity to ``c_pos`` strictly exceeds 1.5
        times the cosine similarity to ``c_neg``; False otherwise.
    """
    beta = 1.5
    doc_vector = tf_idf[target['wID']]
    pos_score = cosine_dic(doc_vector, c_pos)
    neg_score = cosine_dic(doc_vector, c_neg)
    return bool(pos_score > neg_score * beta)

### Example-based search

1. Identify a set of relevant movies.
2. Identify a set of unwanted movies.
3. Use the first 5 of each set as positive/negative examples.
4. Test on the entire movie dataset: check whether we retrieve the relevant movies and avoid the unwanted ones.

In [4]:
# Titles we would like the classifier to retrieve (positive) ...
t_examples_pos = [
    'Ghosts of Mars',
    'Conquest of the Planet of the Apes',
    'John Carter',
    'Alien',
    'Star Wars Episode IV: A New Hope',
    '2001: A Space Odyssey',
    'Prometheus',
    'Planet of the Apes',
    'Rise of the Planet of the Apes',
    'AVP: Alien vs. Predator',
]

# ... and titles we would like it to reject (negative).
t_examples_neg = [
    'Toy Story',
    'Notting Hill',
    'Finding Nemo',
    'Tarzan',
    'The Devil Wears Prada',
    'Space Jam',
    "Bridget Jones's Diary",
    'Indiana Jones and the Last Crusade',
    'King Kong',
    'The Lion King',
    'Robin Hood: Men in Tights',
]

# Only the first N_EXAMPLES titles of each list are used to build the class
# vectors; the remaining titles are checked in the loop at the end.
N_EXAMPLES = 5
# Number of top-weighted terms printed for each class vector.
N_TOP_TERMS = 20


def first_movie_with_title(title):
    """Return the first row of movies_merged whose 'title' equals `title`.

    Raises:
        IndexError: if no movie has that title (same exception type as the
            bare ``.iloc[0]`` lookup, but naming the missing title).
    """
    matches = movies_merged.loc[movies_merged['title'] == title]
    if matches.empty:
        raise IndexError("No movie titled {!r} in movies_merged".format(title))
    return matches.iloc[0]


def ranked_terms(vector):
    """Return the terms of a term -> weight dict, highest weight first."""
    return sorted(vector, key=vector.get, reverse=True)


ex_pos = [first_movie_with_title(title) for title in t_examples_pos[:N_EXAMPLES]]
ex_neg = [first_movie_with_title(title) for title in t_examples_neg[:N_EXAMPLES]]

c_pos, c_neg = rocchio(ex_pos, ex_neg)

# Not used in this cell; kept defined in case other cells reference it.
countp = 0

print("Words with higher TF-IDF in the positive vector")
sorted_tfidf = ranked_terms(c_pos)
print(sorted_tfidf[:N_TOP_TERMS])

print("Words with higher TF-IDF in the negative vector")
sorted_tfidf = ranked_terms(c_neg)
print(sorted_tfidf[:N_TOP_TERMS])


print("Simple Naive Check of the Classifier Performance")
pos_titles = set(t_examples_pos)
neg_titles = set(t_examples_neg)
rightc = 0
wrongc = 0
for idx, movie in movies_merged.iterrows():
    title = movie['title']
    expected_pos = title in pos_titles
    expected_neg = title in neg_titles
    # Only titles from the two lists can change the counts or the output, so
    # skip the classifier (two cosine similarities) for every other movie.
    if not (expected_pos or expected_neg):
        continue
    # Classify once per candidate instead of once per check.
    if not is_relevant(movie, c_pos, c_neg):
        continue
    if expected_pos:
        print("Right! {}".format(title))
        rightc += 1
    if expected_neg:
        print("Wrong! {}".format(title))
        wrongc += 1
# Counts are per matching row, so a title shared by several movies can be
# counted more than once.
print(rightc, wrongc)
print('done')

Words with higher TF-IDF in the positive vector
['ballard', 'martian', 'caesar', 'shang', 'armando', 'obi', 'carter', 'ape', 'burroughs', 'medallion', 'jericho', 'wan', 'leia', 'ripley', 'miner', 'luke', 'helium', 'vader', 'kenobi', 'williams']
Words with higher TF-IDF in the negative vector
['andy', 'tarzan', 'buzz', 'woody', 'marlin', 'gorilla', 'nemo', 'clayton', 'dory', 'kala', 'rc', 'nigel', 'porter', 'anna', 'sid', 'jane', 'toy', 'miranda', 'treehouse', 'explorer']
Simple Naive Check of the Classifier Performance
Right! Ghosts of Mars
Right! Prometheus
Right! AVP: Alien vs. Predator
Right! Conquest of the Planet of the Apes
Right! Planet of the Apes
Right! 2001: A Space Odyssey
Right! Rise of the Planet of the Apes
Wrong! Indiana Jones and the Last Crusade
Right! John Carter
Right! Alien
Right! Planet of the Apes
Wrong! Space Jam
Right! Star Wars Episode IV: A New Hope
11 2
done
