# Content Based Recommendation System with MXNet

In [1]:
import pandas as pd
import mxnet as mx
import numpy as np
import glob
import gensim
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

  from ._conv import register_converters as _register_converters
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
class PreprocessText():

	def __init__(self):
		self.additional_stop_words = {"-PRON-"}
		self.stop_words = set(STOP_WORDS.union(self.additional_stop_words))

	def make_bigrams(self, texts):
		"""
		Create bigrams from documents.
		Higher thresholds yield fewer phrases
		"""
		bigram = gensim.models.Phrases(texts, min_count=2, threshold=10)
		bigram_mod = gensim.models.phrases.Phraser(bigram)
		return [bigram_mod[doc] for doc in texts]

	def lemmatization(self, texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
		"""
		Tokenize and lemmatize all documents. The following criteria are used to evaluate each word.
				Is the token a stop word?
				Is the token comprised of letters?
				Is the token longer than 1 letter?
				Is the token an allowed POS tag?
				Is the lemmatized token a stop word?

		"""
		print("Lemmatizing Text")

		# Initialize spaCy
		nlp = spacy.load('en_core_web_md', disable=["parser", "ner"])

		texts_out = []

		for text in texts:
			doc = nlp(text)
			texts_out.append([token.lemma_ for token in doc
							  if not token.is_stop
							  and token.lemma_ not in self.stop_words
							  and token.is_alpha
							  and len(token) > 1
							  and token.pos_ in allowed_postags])

			if len(texts_out) % 1000 == 0:
				print("Lemmatized {0} of {1} documents".format(
					len(texts_out), len(texts)))

		return texts_out

Create the file path to the article files.

In [3]:
file_path = "../data/"
all_files = glob.glob(file_path + "*.csv")

Import all articles with Pandas.

In [4]:
extract_features = lambda f : pd.read_csv(f, usecols = ["id", "title", "publication", "content"])

In [5]:
# Concatenate features across all files into a single data frame.
articles = pd.concat((extract_features(f) for f in all_files))

Select the first 1000 articles.

In [6]:
articles = articles.head(1000)

Create the TF-IDF matrix.

In [7]:
tf = TfidfVectorizer(analyzer="word",
                    ngram_range=(1, 3),
                    min_df=0.2, # ignore terms with a document frequency lower than 0.2 (20%)
                    stop_words="english")

In [8]:
tfidf_matrix = tf.fit_transform(articles["content"])

Convert `tfidf_matrix` to an MXNet NDArray.

In [9]:
mx_tfidf = mx.nd.sparse.array(tfidf_matrix, ctx=mx.gpu())

### Dot Product Timing: NumPy vs MXNet 

Time the dot product with NumPy and Scikit-Learn sparse matrix.

In [10]:
%%timeit
np.dot(tfidf_matrix, tfidf_matrix.T)

28.5 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


Time the dot product of the MXNet sparse matrix.

In [11]:
%%timeit
mx.nd.sparse.dot(mx_tfidf, mx_tfidf.T)
mx.nd.waitall()

1.63 ms ± 9.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


Compute the dot product, and create the cosine similarity matrix.

In [12]:
mx_recsys = mx.nd.sparse.dot(mx_tfidf, mx_tfidf.T)

In [13]:
def get_recommendations(df_articles, article_idx, mx_mat, n_recs=10):
    """
    Request top N article recommendations.

    INPUT
        df_articles: Pandas DataFrame containing all articles.
        user_id: User ID being provided matches.
        mx_mat: MXNet cosine similarity matrix
    OUTPUT
        Pandas DataFrame of top N article recommendations.
    """

    article_sims = mx_mat[article_idx].asnumpy()
    article_recs = np.argsort(-article_sims)[:n_recs + 1]

    # Top recommendations
    df_recs = df_articles.loc[list(article_recs)]
    df_recs["similarity"] = article_sims[article_recs]

    return df_recs

Get the top 10 recommendations from the article at index 3.

In [14]:
df_recs = get_recommendations(df_articles = articles,
    article_idx = 3, mx_mat = mx_recsys, n_recs=10)

Show the recommendations in the DataFrame.

In [15]:
df_recs[["title", "similarity"]]

Unnamed: 0,title,similarity
3,I feared my life lacked meaning. Cancer pushed...,1.0
167,Chuck (aka The Bleeder) review - Liev Schreibe...,0.544347
726,Thom Yorke’s ex-partner Rachel Owen dies at 48,0.504341
373,Mr Robot returns and The Girlfriend Experience...,0.50074
764,My nieces don’t know they were conceived by do...,0.482433
563,Bridget Jones: how to turn a female character ...,0.482205
216,Robert Rauschenberg and the subversive languag...,0.476691
96,Zsa Zsa Gabor dies aged 99,0.476189
678,"The rise of K2: the drug is legal, dangerous –...",0.469531
765,Facebook is chipping away at privacy – and my ...,0.464698
