# Coursework 1 : Movie Recommendation from Text
## Data Preprocessing

This preprocessing notebook has already been run and its outputs (`data/vocabulary.pk` and `data/tf_idf.pk`) have been saved, so you should be able to skip this part.

### Imports and utility functions

In [3]:
from collections import Counter
import math
import pandas as pd
import pickle
import spacy

import wikipedia # not used here, but can be useful


# Load the spaCy pipeline once; tokenize_text() runs every text through it.
# NOTE(review): 'en' is a shortcut-link model name — presumably this notebook
# targets spaCy 2.x (get_lemma also checks for the '-PRON-' lemma); confirm
# before upgrading spaCy.
nlp = spacy.load('en')

In [4]:
def valid_token(tk):
    """Tell whether a token should be kept.

    A token is kept when its `is_alpha` flag is set and its `is_stop`
    flag is not. `is_stop` is only consulted when `is_alpha` is truthy.
    """
    return tk.is_alpha and not tk.is_stop

def get_lemma(tk):
    """Return the lower-cased normal form of a token.

    Tokens tagged 'PRON', or whose lemma is the '-PRON-' placeholder,
    keep their own text; every other token is replaced by its lemma.
    """
    is_pronoun = tk.pos_ == 'PRON' or tk.lemma_ == '-PRON-'
    surface = tk.text if is_pronoun else tk.lemma_
    return surface.lower()

def read_wikipedia_page(page_name):
    """Return the `content` of the page that `wikipedia.page` finds for `page_name`."""
    return wikipedia.page(page_name).content

# This function is only for wikipedia pages
def tokenize_page(page_name):
  """Fetch the named Wikipedia page and return its text run through tokenize_text."""
  return tokenize_text(read_wikipedia_page(page_name))

def tokenize_text(text):
  """Turn raw text into a list of normalised tokens.

  The text is run through the module-level `nlp` pipeline; tokens rejected
  by valid_token() are dropped and the rest are mapped through get_lemma().
  Order and repetitions are preserved.
  """
  lemmas = []
  for token in nlp(text):
    if valid_token(token):
      lemmas.append(get_lemma(token))
  return lemmas

### Load Data

In [5]:
# Movie metadata: tab-separated, no header row; keep five of the columns.
# NOTE(review): column 3 is labelled 'data' — possibly a typo for 'date';
# left unchanged because the column name may be used elsewhere.
movies_meta = pd.read_csv(
    "data/movie.metadata.tsv",
    sep='\t',
    header=None,
    usecols=[0, 1, 2, 3, 8],
    names=['wID', 'fID', 'title', 'data', 'genres'],
)

# Plot summaries: tab-separated, no header row; an id column and the plot text.
movies_plot = pd.read_csv(
    "data/plot_summaries.txt",
    sep='\t',
    header=None,
    usecols=[0, 1],
    names=['wID', 'plot'],
)

# Inner join on 'wID': only movies present in both files are kept.
movies_merged = movies_meta.merge(movies_plot, on='wID', how='inner')

print("Retrieved {} ".format(len(movies_merged)))

Retrieved 42204 


### Tokenize Movie plots and build global vocabulary

In [6]:
vocabulary = set()
idf_counter = Counter()

# To limit computation only the first plots are processed: the loop stops once
# more than MAX_PLOTS plots have been handled, i.e. after MAX_PLOTS + 1 plots.
# The TF-IDF loop further down uses the same cut-off (1100); keep them in sync.
MAX_PLOTS = 1100
PROGRESS_EVERY = 1000  # print one '.' every PROGRESS_EVERY plots

# countp ends up holding the number of plots actually tokenised.
countp = 0
for countp, movie in enumerate(movies_merged.itertuples(), start=1):
  # Tokens as a set so each word is counted at most once per document
  page_words = set(tokenize_text(movie.plot))
  # In-place update: avoids copying the whole vocabulary for every plot
  vocabulary.update(page_words)
  idf_counter.update(page_words)

  if countp % PROGRESS_EVERY == 0:
        print(".",  end = '')
  if countp > MAX_PLOTS:
        break


# Save to file
with open('data/vocabulary.pk',mode='wb') as vocab_file:
    pickle.dump(vocabulary, vocab_file)

# with open('data/idf.pk',mode='wb') as idf_file:
#     pickle.dump(idf_counter, idf_file)

.

### Compute Inverse Document Frequency

In [7]:
# Inverse document frequency: idf(w) = log2(N / df(w)), where df(w) is the
# number of tokenised plots that contain w (the values in idf_counter).
# N must be the number of plots that were actually tokenised, not
# len(movies_merged): the tokenisation loop stops early, so idf_counter only
# covers the first `countp` plots. Using the full corpus size would shift
# every idf value by the same constant and distort the relative weights.
n_docs = countp
idf = {
  word: math.log(n_docs / df, 2)  for word, df in idf_counter.items()
}

print("Vocabulary size: {}".format(len(vocabulary)))

Vocabulary size: 17090


### Precompute TF-IDF of each movie in the dataset

In [8]:
def tf_idf(target_row):
  """Compute log-scaled TF-IDF weights for one movie plot.

  Args:
    target_row: a row (e.g. a pandas Series) whose 'plot' entry holds the
      plot text.

  Returns:
    A dict mapping each lemma of the plot that has an entry in the global
    `idf` table to (1 + log2(tf)) * idf[lemma], where tf is the number of
    times the lemma occurs in the plot.
  """
  term_counts = Counter(tokenize_text(target_row['plot']))
  # Lemmas with no idf entry (never seen while document frequencies were
  # counted) are skipped instead of raising KeyError; `idf` is built from a
  # truncated subset of the plots, so other plots will contain such lemmas.
  tfidf = {
    word: (1 + math.log(_tf, 2)) * idf[word]
    for word, _tf in term_counts.items()
    if word in idf
  }
  return tfidf
    
# Map each processed movie's 'wID' to the dictionary returned by tf_idf().
tfidf_dic = {}
countp = 0
for countp, (idx, movie) in enumerate(movies_merged.iterrows(), start=1):
    tfidf_dic[movie['wID']] = tf_idf(movie)
    # Stop once more than 1100 rows have been handled (1101 rows in total).
    if countp > 1100:
        break


# Serialise the mapping to data/tf_idf.pk.
with open('data/tf_idf.pk', 'wb') as tfidf_file:
    pickle.dump(tfidf_dic, tfidf_file)