# Coursework 1 : Movie Recommendation from Text
## Data Preprocessing

This notebook has already been run and its outputs (the pickled vocabulary and TF-IDF tables under `data/`) have been saved, so you should be able to skip this part.

### Imports and utility functions

In [1]:
from collections import Counter
import math
import pandas as pd
import pickle
import spacy
import time
from tqdm import tqdm

import wikipedia # not used here, but can be useful


# Load the spaCy English pipeline once; tokenize_text reuses this shared `nlp` object.
# NOTE(review): the 'en' shortcut name and the '-PRON-' lemma handled in get_lemma
# suggest this targets an older spaCy release — confirm the installed version
# before re-running.
nlp = spacy.load('en')

In [2]:
def valid_token(tk):
    """Tell whether a token should be kept.

    A token is kept when its `is_alpha` attribute is truthy and its
    `is_stop` attribute is falsy.
    """
    return tk.is_alpha and not tk.is_stop

def get_lemma(tk):
    """Return the lower-cased form of a token used as its vocabulary entry.

    Tokens tagged 'PRON', or whose lemma is the '-PRON-' placeholder, keep
    their own surface text; every other token is replaced by its lemma.
    """
    is_pronoun = tk.pos_ == 'PRON' or tk.lemma_ == '-PRON-'
    chosen = tk.text if is_pronoun else tk.lemma_
    return chosen.lower()

def read_wikipedia_page(page_name):
    """Look up `page_name` with the `wikipedia` package and return the page's `content`."""
    return wikipedia.page(page_name).content

# This function is only for wikipedia pages
def tokenize_page(page_name):
    """Fetch the Wikipedia page `page_name` and return its tokens via `tokenize_text`."""
    return tokenize_text(read_wikipedia_page(page_name))

def tokenize_text(text):
    """Run `text` through the shared `nlp` pipeline and return a list of lemmas.

    Only tokens accepted by `valid_token` are kept; each kept token is
    mapped through `get_lemma`. Order and duplicates are preserved.
    """
    lemmas = []
    for token in nlp(text):
        if valid_token(token):
            lemmas.append(get_lemma(token))
    return lemmas

### Load Data

In [3]:
# Movie metadata: read five of the tab-separated columns and name them.
movies_meta = pd.read_csv(
    "data/movie.metadata.tsv",
    sep='\t',
    header=None,
    usecols=[0, 1, 2, 3, 8],
    names=['wID', 'fID', 'title', 'data', 'genres'],
)

# Plot summaries: one (wID, plot) pair per line.
movies_plot = pd.read_csv(
    "data/plot_summaries.txt",
    sep='\t',
    header=None,
    usecols=[0, 1],
    names=['wID', 'plot'],
)

# Inner join on wID keeps only movies that have both metadata and a plot.
movies_merged = movies_meta.merge(movies_plot, on='wID', how='inner')

print("Retrieved {} ".format(movies_merged.shape[0]))

Retrieved 42204 


### Tokenize Movie plots and build global vocabulary

In [4]:
# Build the global vocabulary and the per-word document frequencies in a single
# pass over the plots. Every plot goes through the spaCy pipeline, so this cell
# is slow (see the running time recorded in its output).
vocabulary = set()
idf_counter = Counter()

# Debug counter (not used by this loop); left defined for any cell that still
# references it.
countp = 0
start_time = time.time()

for movie in tqdm(movies_merged.itertuples(), total=len(movies_merged)):
    # Tokens as a set so each word is counted at most once per document,
    # which makes idf_counter a document-frequency table.
    page_words = set(tokenize_text(movie.plot))
    # In-place union: `vocabulary | page_words` would copy the whole, ever
    # growing vocabulary once per plot.
    vocabulary.update(page_words)
    idf_counter.update(page_words)

elapsed_time = time.time() - start_time
print("Running time: {} ".format(elapsed_time))

# Save to file
with open('data/vocabulary.pk', mode='wb') as vocab_file:
    pickle.dump(vocabulary, vocab_file)

# with open('data/idf.pk',mode='wb') as idf_file:
#     pickle.dump(idf_counter, idf_file)

 40%|███▉      | 16843/42204 [25:54<57:18,  7.38it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 65%|██████▌   | 27441/42204 [43:08<21:56, 11.21it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 89%|████████▉ | 37681/42204 [59:52<06:50, 11.01it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp

Running time: 4053.539579153061 





### Compute Inverse Document Frequency

In [5]:
# Inverse document frequency of each word:
# log2(number of movies / number of plots containing the word).
idf = dict(
    (term, math.log(len(movies_merged) / doc_freq, 2))
    for term, doc_freq in idf_counter.items()
)

print("Vocabulary size: {}".format(len(vocabulary)))

Vocabulary size: 123349


### Precompute TF-IDF of each movie in the dataset

In [6]:
def tf_idf(target_row):
    """Compute the TF-IDF weights for one movie row.

    `target_row['plot']` is tokenized with `tokenize_text`; each distinct
    token is weighted as (1 + log2(term count)) * idf[token], using the
    global `idf` table.

    Returns a dict mapping token -> weight.
    """
    term_counts = Counter(tokenize_text(target_row['plot']))
    weights = {}
    for term, count in term_counts.items():
        weights[term] = (1 + math.log(count, 2)) * idf[term]
    return weights
    
# TF-IDF vector (dict of word -> weight) for every movie, keyed by wID.
# Each plot is tokenized again inside tf_idf, so this cell is slow too.
tfidf_dic = {}
start_time = time.time()
for idx, movie in tqdm(movies_merged.iterrows(), total=len(movies_merged)):
    # No debug counter here: the previous `countp += 1` depended on a value
    # initialised in another cell and was never read.
    tfidf_dic[movie['wID']] = tf_idf(movie)

elapsed_time = time.time() - start_time
print("Running time: {} ".format(elapsed_time))

with open('data/tf_idf.pk', mode='wb') as tfidf_file:
    pickle.dump(tfidf_dic, tfidf_file)

100%|██████████| 42204/42204 [1:02:07<00:00, 11.32it/s]


Running time: 3727.6427714824677 


In [40]:
# Build a smaller TF-IDF table by dropping rare words.
# Words that occur in at most this many plots are removed.
RARE_WORD_MAX_DF = 3

# Document frequencies and IDF values of the words that survive the cut.
filtered = {w: f for w, f in idf_counter.items() if f > RARE_WORD_MAX_DF}
idf_filtered = {w: idf[w] for w in filtered}

# Filter the already computed vectors directly; there is no need to walk the
# DataFrame again or to keep a loop counter.
tfidf_dic_filter = {
    wid: {k: val for k, val in weights.items() if k in idf_filtered}
    for wid, weights in tqdm(tfidf_dic.items(), total=len(tfidf_dic))
}

print(len(idf_counter))
print(len(filtered))
with open('data/tf_idf_small.pk', mode='wb') as tfidf_file:
    pickle.dump(tfidf_dic_filter, tfidf_file)

100%|██████████| 42204/42204 [00:09<00:00, 4263.43it/s]


123349
33935
