In [1]:
import requests
import math
import os
from html.parser import HTMLParser

This class extends Python's built-in `HTMLParser`. When text is fed to the parser, `handle_starttag` is called whenever it finds a start tag, `handle_endtag` whenever it finds an end tag, and `handle_data` whenever it finds text that is not part of a tag. By inspection of the HTML files, the title is enclosed in a *title_wrapper* div, the storyline is the text of the *span* tag that follows the *Storyline* heading, and the recommended movie ids are attributes of the *rec_item* divs.

In [2]:
class MyHTMLParser(HTMLParser):
    """HTML parser that collects a title, a storyline and recommended ids.

    After feeding a page, the results are available as attributes:
    ``title`` (str), ``story`` (str, each collected fragment is prefixed
    with a single space) and ``recommended`` (list of attribute values).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Results.
        self.title = ""
        self.story = ""
        self.recommended = []
        # Parsing state.
        self.title_wrapper_position = -1  # line of the title_wrapper div, -1 when not waiting for a title
        self.storyline = False            # True once the text 'Storyline' has been seen
        self.get_story = False            # True while inside the storyline <span>

    def handle_starttag(self, tag, attrs):
        """Track the tags that announce the title, the storyline and recommendations."""
        if tag == 'span':
            # Only an attribute-less <span> after the 'Storyline' text carries the story.
            if not attrs and self.storyline:
                self.get_story = True
            return
        if tag != 'div' or not attrs:
            return
        first_name, first_value = attrs[0]
        if first_name != 'class':
            return
        if first_value == 'title_wrapper':
            self.title_wrapper_position = self.getpos()[0]
        elif first_value == 'rec_item' and len(attrs) == 4:
            # The value of the fourth attribute is what gets recorded as the recommended id.
            self.recommended.append(attrs[3][1])

    def handle_endtag(self, tag):
        """Stop storyline collection at the first </span> seen after 'Storyline'."""
        if tag != 'span' or not self.storyline:
            return
        self.storyline = False
        self.get_story = False

    def handle_data(self, data):
        """Consume a text node: it may be the title, the 'Storyline' marker or story text."""
        text = data.strip()
        if not text:
            # Whitespace-only nodes never change any state.
            return
        waiting_for_title = self.title_wrapper_position != -1
        if waiting_for_title and self.getpos()[0] >= self.title_wrapper_position:
            # First non-empty text after the title_wrapper div is the title.
            self.title = text
            self.title_wrapper_position = -1
        if text == 'Storyline':
            self.storyline = True
        if self.get_story:
            self.story += " " + text


For the preprocessing part, I used the same stopwords and punctuation marks as in the first project.

In [3]:
# Characters that tokenize() replaces with a space before splitting text into tokens.
punctuations = {'#', '[', '~', '-', ']', '.', '@', '/', "'", '{', '|', ')',
                '(', '*', ',', '`', ';', '$', '%', '\\', '^', '_', '!', '<', ':', '&', '>', '"', '}', '=', '?', '+'}
# Lower-case words that tokenize() drops after case folding.
stopwords = {'us', 'for', 'this', 'by', 'few', 'which', 'of', 'why', 'you', 'there', 'them', 'some', 'your', 'her', 'many', 'it', 'will', 'the', 'are', 'all', 'who', 'none', 'they', 'a', 'him', 'an',
             'i', 'where', 'its', 'what', 'as', 'have', 'in', 'his', 'she', 'my', 'be', 'any', 'been', 'how', 'or', 'and', 'me', 'their', 'but', 'on', 'is', 'here', 'our', 'with', 'when', 'that', 'was', 'he'}

## IMDB Scraping
This function requests the HTML content using the `requests` library and feeds it into the HTML parser. The parser finds the title, story and recommended movies and sets its attributes accordingly. Finally, the parser is closed and the data is returned together with the id.

In [4]:
def get_movie_contents(imdb_id, timeout=30):
    """
    Download the IMDB page of a movie and extract its data.

    Parameters:
        imdb_id: IMDB title id, appended to "https://www.imdb.com/title/".
        timeout: seconds to wait for the server before requests raises a
            timeout error. Without a timeout the request could block forever.

    Returns:
        A list ``[imdb_id, title, storyline, recommended_ids]`` where the last
        three items are whatever MyHTMLParser extracted from the page.
    """
    r = requests.get("https://www.imdb.com/title/"+imdb_id, timeout=timeout)
    parser = MyHTMLParser()
    parser.feed(r.text)
    # close() makes the parser process any text it is still buffering, so it
    # has to run before the extracted values are read.
    parser.close()
    return [imdb_id, parser.title, parser.story, parser.recommended]

This function iterates over the movie ids listed in *"movie_ids.csv"*, collects the data for each one and writes it into *"movie_info.txt"*. It is only called (below) if *"movie_info.txt"* does not exist. This way, if you run the whole notebook again, there is no need to make the requests again.

In [5]:
def IMDB_scrap():
    """Scrape every id listed in "movie_ids.csv" into "movie_info.txt".

    Each movie is stored as four lines: id, title, storyline and the
    space-separated recommended ids. A running counter is printed while
    scraping.

    The records are first written to "movie_info.txt.part" and only renamed
    to "movie_info.txt" after every movie has been written. An interrupted or
    failed run therefore never leaves a partial "movie_info.txt" behind that
    a later existence check would mistake for a complete one.
    """
    final_path = "movie_info.txt"
    partial_path = final_path + ".part"
    count = 0
    # The id list is opened first so that a missing input file fails before
    # any output file is created. Both files are closed by the with-block.
    with open("movie_ids.csv") as ids_file, open(partial_path, "w") as w_file:
        for line in ids_file:
            imdb_id = line.strip()
            if not imdb_id:
                # Blank lines carry no id; skip them instead of requesting an empty title.
                continue
            count += 1
            print( count , end='\r')
            contents = get_movie_contents(imdb_id)
            w_file.write(contents[0]+"\n")
            # Collapse all whitespace (including newlines inside the scraped
            # text) so that every record occupies exactly four lines.
            w_file.write(" ".join(contents[1].split())+"\n")
            w_file.write(" ".join(contents[2].split())+"\n")
            w_file.write(" ".join(contents[3])+"\n")
    os.replace(partial_path, final_path)

Scrape only if we haven't scraped yet. While it is running, it prints the number of movies scraped so far.

In [6]:
# Scrape only when the cache file is absent; an existing "movie_info.txt" is reused as-is.
if not os.path.isfile("movie_info.txt"):
    IMDB_scrap()

Initialize global sets,lists and variables.

In [7]:
# token id -> number of movies containing that token (document frequency); filled by read_data_from_file().
vocabulary = {}
# token string -> integer token id; ids are handed out starting from 1 in read_data_from_file().
token_to_tokenId = {}
# Highest token id handed out so far.
token_counter = 0
# Movie objects, appended by read_data_from_file().
movies = []
# One cut-off index per movie, appended by Movie.tf_idf().
statistic = []

Tokenize function takes a string, replaces punctuations with space, applies lowercase folding, removes the stopwords from the string and then returns all the tokens in a list.

In [8]:
def tokenize(input):
    """Split a string into lower-case tokens.

    Every character found in ``punctuations`` is replaced by a space, the
    result is split on whitespace and lower-cased, and tokens found in
    ``stopwords`` are dropped.

    Returns:
        list of the remaining tokens, in their original order.
    """
    spaced = "".join(' ' if ch in punctuations else ch for ch in input.strip())
    lowered = (piece.lower() for piece in spaced.split())
    return [token for token in lowered if token not in stopwords]

This object holds information about a movie: `id` is the same as the IMDB id, `vector` is the tf_idf vector, and `recommended` is the list of recommended movie ids.

In [9]:
class Movie():
    """Sparse tf-idf representation of one movie.

    Attributes:
        id: identifier given to the constructor (the IMDB id in this notebook).
        vector: list of ``(token_id, value)`` tuples. The value is a raw term
            frequency until tf_idf() is called, a tf-idf weight afterwards.
        recommended: ids added through insert_rec().
    """

    def __init__(self, id):
        self.id = id
        self.vector = []
        self.recommended = []

    def insert_element(self, word, freq):
        """Append a token id and its term frequency to the vector.

        Only the term frequency is stored here; tf_idf() turns it into a
        tf-idf weight later.
        """
        self.vector.append((word, freq))

    def insert_rec(self, id):
        """Append a movie id to the list of recommended movies."""
        self.recommended.append(id)

    def tf_idf(self, N):
        """Replace the term frequencies in the vector by tf-idf weights.

        tf_idf = (1 + log10(tf)) * log10(N / df), where df is read from the
        global ``vocabulary``. The vector ends up sorted by descending weight.

        As a side effect, the index at which the running sum of the sorted
        weights first reaches 95% of their total is appended to the global
        ``statistic`` list (nothing is appended for an empty vector).

        Parameters:
            N: total number of movies.
        """
        total_sum = 0.0
        for i in range(len(self.vector)):
            word = int(self.vector[i][0])
            freq = int(self.vector[i][1])
            self.vector[i] = (word, (1+math.log10(freq))*math.log10(N/vocabulary[word]))
            total_sum += self.vector[i][1]
        # Ascending stable sort followed by reverse() yields descending
        # weights; kept in this form so that the order among ties is unchanged.
        self.vector.sort(key=lambda tup: tup[1])
        self.vector.reverse()

        # NOTE(review): the recorded value is the 0-based index, i.e. one less
        # than the number of entries needed to reach 95% - confirm whether
        # i + 1 was intended.
        running_sum = 0.0
        for i in range(len(self.vector)):
            running_sum += self.vector[i][1]
            if running_sum >= 0.95*total_sum:
                statistic.append(i)
                break

    def take_top_N(self, N):
        """Keep only the first N entries of the vector, then sort them by token id.

        Expects the vector to be ordered by descending weight, as tf_idf()
        leaves it, so that the first N entries are the N heaviest ones.
        """
        self.vector = self.vector[:N]
        self.vector.sort(key=lambda tup: tup[0])

    def normalize(self):
        """Scale the vector to unit Euclidean length.

        An empty or all-zero vector has no direction and is left unchanged
        instead of triggering a division by zero.
        """
        length = math.sqrt(sum(weight*weight for _, weight in self.vector))
        if length == 0:
            return
        self.vector = [(word, weight/length) for word, weight in self.vector]

After scraping and dumping the data into a file, read it back and create the objects which represent the movies.

In [10]:
def read_data_from_file():
    """Build the global movie list and vocabulary from "movie_info.txt".

    The file is read as consecutive four-line records: id, title, storyline
    and space-separated recommended ids. Title and storyline tokens are
    mapped to integer token ids and counted per movie (term frequency);
    ``vocabulary`` counts in how many movies each token id occurs.

    The globals filled here are reset first, so running this cell again
    rebuilds them instead of appending duplicate movies and inflating the
    document frequencies.
    """
    global token_counter
    movies.clear()
    vocabulary.clear()
    token_to_tokenId.clear()
    token_counter = 0

    # The with-block closes the file once all records are read.
    with open("movie_info.txt", "r") as info_file:
        for line_number, line in enumerate(info_file):
            field = line_number % 4  # position of this line inside its record
            line = line.strip()
            if field == 0:
                movie = Movie(line)
                tf = {}
            elif field == 1 or field == 2:
                for token in tokenize(line):
                    if token not in token_to_tokenId:
                        token_counter += 1
                        token_to_tokenId[token] = token_counter
                    token_id = token_to_tokenId[token]
                    tf[token_id] = tf.get(token_id, 0) + 1
            else:
                # split() without an argument returns [] for an empty line, so
                # a movie without recommendations does not get a phantom ''
                # id (which split(" ") would produce).
                for rec in line.split():
                    movie.insert_rec(rec)
                for term, freq in tf.items():
                    movie.insert_element(term, freq)
                    vocabulary[term] = vocabulary.get(term, 0) + 1
                movies.append(movie)
read_data_from_file()

Calculate the *tf_idf* values of the movies, then using the collected statistic find the N value to take top N elements, and then take the top N values of each *tf_idf* vector.

In [11]:
# Step 1: turn the raw term frequencies of every movie into tf-idf weights.
# Each call also records that movie's 95% cut-off index in `statistic`.
corpus_size = len(movies)
for movie in movies:
    movie.tf_idf(corpus_size)

# Step 2: average the recorded cut-off indices (each one pre-divided by the
# number of entries and accumulated in list order).
average = 0
for cutoff_index in statistic:
    average += cutoff_index/len(statistic)

# Step 3: truncate every vector to that many entries and scale it to unit length.
top_k = int(average)
for movie in movies:
    movie.take_top_N(top_k)
    movie.normalize()

Calculate the cosine similarity of two given movies. Since their *tf_idf* vectors may be represented by different words, we must be careful when multiplying the values (only multiply the values that belong to the same word).

In [12]:
def calc_similarity(movie1, movie2):
    """Return the dot product of the two movies' sparse vectors.

    Both ``vector`` lists are walked in parallel with one cursor each, so they
    have to be sorted by token id. Only token ids present in both vectors
    contribute to the result.
    """
    left, right = movie1.vector, movie2.vector
    left_pos = right_pos = 0
    dot_product = 0.0
    while left_pos < len(left) and right_pos < len(right):
        left_id, left_weight = left[left_pos]
        right_id, right_weight = right[right_pos]
        if left_id == right_id:
            # Shared token: accumulate and advance both cursors.
            dot_product += left_weight*right_weight
            left_pos += 1
            right_pos += 1
            continue
        # Advance whichever cursor points at the smaller token id.
        if left_id < right_id:
            left_pos += 1
        else:
            right_pos += 1
    return dot_product

## Tf-idf model

## Recommendation
Iterate over all movies. Find the similarity between the given movie and every other movie. Sort them by similarity and then take the top 11 elements (since we only evaluate performance for K=1,2,3,10, we don't need more elements).

In [13]:
def recommend(imdb_id, top_n=11):
    """
    Gets an imdb id and returns a list of recommended movie ids for that movie.

    Every other movie in ``movies`` is scored with calc_similarity() against
    the requested one; the ids of the ``top_n`` highest-scoring movies are
    returned, best first.

    Parameters:
        imdb_id: id of a movie present in ``movies``.
        top_n: maximum number of ids to return (default 11).

    Raises:
        ValueError: if no movie in ``movies`` has the given id.
    """
    current_movie = None
    for movie in movies:
        if movie.id == imdb_id:
            current_movie = movie
    if current_movie is None:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("unknown imdb id: " + repr(imdb_id))

    recommendations = [
        (movie.id, calc_similarity(movie, current_movie))
        for movie in movies
        if movie.id != current_movie.id
    ]
    # Ascending stable sort followed by reverse() gives descending similarity;
    # kept in this form so that the order among equal scores is unchanged.
    recommendations.sort(key=lambda tup: tup[1])
    recommendations.reverse()
    return [movie_id for (movie_id, _) in recommendations[:top_n]]

## Evaluation
Self explanatory

In [14]:
def evaluate_recommendations(rec_movie_ids, relevant_movie_ids, K):
    """
    Gets list of recommended and relevant movie ids and K value.

    Only the first K recommended ids are evaluated.

    Returns precision, recall, F1 values for K respectively.
    A value that is undefined is returned as ``math.nan`` instead of raising
    ZeroDivisionError: precision when there are no recommendations to
    evaluate, recall when there are no relevant ids, and F1 when precision
    and recall are both zero or either of them is undefined.
    """
    top_k = rec_movie_ids[:K]
    recommended_hits = sum(1 for movie_id in top_k if movie_id in relevant_movie_ids)
    relevant_hits = sum(1 for movie_id in relevant_movie_ids if movie_id in top_k)

    precision = recommended_hits/len(top_k) if len(top_k) > 0 else math.nan
    recall = relevant_hits/len(relevant_movie_ids) if len(relevant_movie_ids) > 0 else math.nan

    # The comparison is False both for a zero denominator and for nan inputs.
    if precision + recall > 0:
        f1_score = 2*precision*recall/(precision+recall)
    else:
        f1_score = math.nan
    return (precision, recall, f1_score)

To make testing easy, this function returns the list of the recommended movies for the given imdb_id.

In [15]:
def get_relevant_movie_ids(imdb_id):
    """Return the stored ``recommended`` list of the first movie whose id matches.

    Returns None when no movie in ``movies`` has the given id.
    """
    matching_lists = (candidate.recommended for candidate in movies if candidate.id == imdb_id)
    return next(matching_lists, None)

This is also for testing purposes. Print the evaluation values for the given imdb_id.

In [16]:
def get_metrics(imdb_id):
    """Print precision, recall and F1 of the recommendations for K = 1, 2, 3, 10.

    The recommendations from recommend() are evaluated against the ids
    returned by get_relevant_movie_ids() for the same movie.
    """
    # Neither list depends on K, so each is computed once instead of once per K.
    recommended_ids = recommend(imdb_id)
    relevant_ids = get_relevant_movie_ids(imdb_id)
    for k in [1,2,3,10]:
        metrics = evaluate_recommendations(recommended_ids, relevant_ids, k)
        print("K = " + str(k))
        print("Presicion = " + str(metrics[0]))
        print("Recall = " + str(metrics[1]))
        print("F1 score = " + str(metrics[2]))

In [17]:
# Example run: list the recommendations computed for one movie, then print their
# precision / recall / F1 against the recommended ids stored for that movie.
print(recommend("tt1300854"))
get_metrics("tt1300854")

['tt2395427', 'tt2250912', 'tt0086250', 'tt0132905', 'tt0848228', 'tt0167260', 'tt3498820', 'tt0499448', 'tt2166834', 'tt0338348', 'tt0068230']
K = 1
Presicion = 1.0
Recall = 0.08333333333333333
F1 score = 0.15384615384615385
K = 2
Presicion = 0.5
Recall = 0.08333333333333333
F1 score = 0.14285714285714285
K = 3
Presicion = 0.3333333333333333
Recall = 0.08333333333333333
F1 score = 0.13333333333333333
K = 10
Presicion = 0.3
Recall = 0.25
F1 score = 0.2727272727272727
