A second pass as NLP and cosine similarity by adding the year to the movie title and attempting to run pairwise kernel to get the cosine similarities without killing my kernel.


In [2]:
import pandas as pd
import numpy as np

import nltk
from rake_nltk import Rake
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
movies = pd.read_csv('movies-with-wr-and-title-year.csv')

In [18]:
movies.shape

(93957, 20)

In [4]:
movies = movies[['title_year','rated','writer','genre','director','actors','plot','language','country', 'year']]

for column in movies:
    movies[column] = movies[column].apply(lambda x: str(x))
    
# discarding the commas between the actors' full names
movies['actors'] = movies['actors'].map(lambda x: x.split(','))

# putting the genres in a list of words
movies['genre'] = movies['genre'].map(lambda x: x.lower().split(','))

movies['director'] = movies['director'].map(lambda x: x.split(','))


# merging together first and last name for each actor and director, so it's considered as one word 
# and there is no mix up between people sharing a first name
for index, row in movies.iterrows():
    row['actors'] = [x.lower().replace(' ','') for x in row['actors']]
    row['director'] = [x.lower().replace(' ','') for x in row['director']]


# Getting rid of information between () in writers
movies['writer'] = movies['writer'].str.replace(r"\(.*\)","")

# discarding the commas between writer's names
movies['writer'] = movies['writer'].map(lambda x: x.split(','))

# putting the countries in a list of words
movies['country'] = movies['country'].map(lambda x: x.lower().split(','))

# putting the languages in a list of words
movies['language'] = movies['language'].map(lambda x: x.lower().split(','))

movies['rated'] = movies['rated'].str.replace("-","")

movies['year'] = movies['year'].map(lambda x: x.split(','))

# merging together first and last name for each actor and director, so it's considered as one word 
# and there is no mix up between people sharing a first name
for index, row in movies.iterrows():
    row['writer'] = [x.lower().replace(' ','') for x in row['writer']]


In [5]:
# initializing the new column
movies['Key_words'] = ""

for index, row in movies.iterrows():
    plot = row['plot']
    
    # instantiating Rake, by default is uses english stopwords from NLTK
    # and discard all puntuation characters
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words and their scores
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column
    row['Key_words'] = list(key_words_dict_scores.keys())

# dropping the Plot column
movies.drop(columns = ['plot'], inplace = True)

In [6]:
movies.head()

Unnamed: 0,title_year,rated,writer,genre,director,actors,language,country,year,Key_words
0,Toy Story 1995,G,[johnlasseter],"[animation, adventure, comedy, family, fan...",[johnlasseter],"[tomhanks, timallen, donrickles, jimvarney]",[english],[usa],[1995],"[profoundly, threatened, room, boy, new, space..."
1,Jumanji 1995,PG,[jonathanhensleigh],"[adventure, comedy, family, fantasy]",[joejohnston],"[robinwilliams, jonathanhyde, kirstendunst, br...","[english, french]",[usa],[1995],"[finishing, man, trapped, two, kids, find, sto..."
2,Grumpier Old Men 1995,PG13,"[markstevenjohnson, markstevenjohnson]","[comedy, romance]",[howarddeutch],"[waltermatthau, jacklemmon, sophialoren, ann-m...","[english, italian, german]",[usa],[1995],"[beloved, bait, shop, save, attention, john, m..."
3,Waiting to Exhale 1995,R,[terrymcmillan],"[comedy, drama, romance]",[forestwhitaker],"[whitneyhouston, angelabassett, lorettadevine,...",[english],[usa],[1995],"[male, gender, terry, mcmillan, relationships,..."
4,Father of the Bride Part II 1995,PG,[alberthackett],"[comedy, family, romance]",[charlesshyer],"[stevemartin, dianekeaton, martinshort, kimber...",[english],[usa],[1995],"[also, unexpected, pregnancy, george, banks, m..."


In [7]:
# Drop duplicate movies based on their title and year
movies = movies.drop_duplicates(subset='title_year', keep= 'first')
movies.shape

(87967, 10)

In [8]:
movies.set_index('title_year', inplace = True)

In [9]:
movies['bag_of_words'] = ''
columns = movies.columns
for index, row in movies.iterrows():
    words = ''
    for col in columns:
        if col == 'rated':
            words = words.join(row[col]) + ' '
        elif col == 'imdb_rating':
            None
        else:
            words = words + ' '.join(row[col])+ ' '
#        else:
#            words = words + row[col]+ ' '
    row['bag_of_words'] = words
    
movies.drop(columns = [col for col in movies.columns if col!= 'bag_of_words'], inplace = True)

In [10]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(movies['bag_of_words'])

# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(movies.index)
indices[:5]

0                      Toy Story 1995
1                        Jumanji 1995
2               Grumpier Old Men 1995
3              Waiting to Exhale 1995
4    Father of the Bride Part II 1995
Name: title_year, dtype: object

In [11]:
movies.shape

(87967, 1)

In [13]:
from sklearn.metrics.pairwise import pairwise_kernels

cosine_sim = pairwise_kernels(count_matrix, metric='cosine', n_jobs=3)

In [None]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(count_matrix, count_matrix)

In [None]:
def cosine_similarity_n_space(m1, m2, batch_size=100):
    assert m1.shape[1] == m2.shape[1]
    ret = np.ndarray((m1.shape[0], m2.shape[0]))
    for row_i in range(0, int(m1.shape[0] / batch_size) + 1):
        start = row_i * batch_size
        end = min([(row_i + 1) * batch_size, m1.shape[0]])
        if end <= start:
            break # cause I'm too lazy to elegantly handle edge cases
        rows = m1[start: end]
        sim = cosine_similarity(rows, m2) # rows is O(1) size
        ret[start: end] = sim
        cosine_sim = ret
    return ret

In [None]:
cosine_similarity_n_space(count_matrix, count_matrix, batch_size=100)

In [None]:
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster 
chunk_size = 500 
matrix_len = count_matrix.shape[0] # Not sparse numpy.ndarray

def similarity_cosine_by_chunk(start, end):
    if end > matrix_len:
        end = matrix_len
    return cosine_similarity(X=count_matrix[start:end], Y=count_matrix) # scikit-learn function

for chunk_start in range(0, matrix_len, chunk_size):
    cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
    # Handle cosine_similarity_chunk  ( Write it to file_timestamp and close the file )
    # Do not open the same file again or you may end up with out of memory after few chunks 

In [None]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)