# Content Based Recommendation System with MXNet

In [2]:
import pandas as pd
import mxnet as mx
import numpy as np
import glob
import gensim
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

  from ._conv import register_converters as _register_converters
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [3]:
class PreprocessText():

	def __init__(self):
		self.additional_stop_words = {"-PRON-"}
		self.stop_words = set(STOP_WORDS.union(self.additional_stop_words))

	def make_bigrams(self, texts):
		"""
		Create bigrams from documents.
		Higher thresholds yield fewer phrases
		"""
		bigram = gensim.models.Phrases(texts, min_count=2, threshold=10)
		bigram_mod = gensim.models.phrases.Phraser(bigram)
		return [bigram_mod[doc] for doc in texts]

	def lemmatization(self, texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
		"""
		Tokenize and lemmatize all documents. The following criteria are used to evaluate each word.
				Is the token a stop word?
				Is the token comprised of letters?
				Is the token longer than 1 letter?
				Is the token an allowed POS tag?
				Is the lemmatized token a stop word?

		"""
		print("Lemmatizing Text")

		# Initialize spaCy
		nlp = spacy.load('en_core_web_md', disable=["parser", "ner"])

		texts_out = []

		for text in texts:
			doc = nlp(text)
			texts_out.append([token.lemma_ for token in doc
							  if not token.is_stop
							  and token.lemma_ not in self.stop_words
							  and token.is_alpha
							  and len(token) > 1
							  and token.pos_ in allowed_postags])

			if len(texts_out) % 1000 == 0:
				print("Lemmatized {0} of {1} documents".format(
					len(texts_out), len(texts)))

		return texts_out

Create the file path to the article files.

In [4]:
file_path = "../data/"
all_files = glob.glob(file_path + "*.csv")

Import all articles with Pandas.

In [5]:
articles = pd.concat((pd.read_csv(
    f, usecols=["id", "title", "publication", "content"]) for f in all_files))


Select the first 1000 articles.

In [6]:
articles = articles.head(1000)

Create the TF-IDF matrix.

In [7]:
tf = TfidfVectorizer(analyzer="word",
                    ngram_range=(1, 3),
                    min_df=2,
                    stop_words="english")

In [8]:
mx_tfidf = tf.fit_transform(articles["content"])

Convert `mx_tfidf` to an MXNet NDArray.

In [9]:
mx_tfidf = mx.nd.sparse.array(mx_tfidf, ctx=mx.cpu())

Compute the dot product, and create the cosine similarity matrix.

In [10]:
mx_recsys = mx.nd.sparse.dot(mx_tfidf, mx_tfidf.T)

In [11]:
def get_recommendations(df_articles, article_idx, mx_mat, n_recs=10):
    """
    Request top N article recommendations.

    INPUT
        df_articles: Pandas DataFrame containing all articles.
        user_id: User ID being provided matches.
        mx_mat: MXNet cosine similarity matrix
    OUTPUT
        Pandas DataFrame of top N article recommendations.
    """

    # user_idx = article_idx

    article_sims = mx_mat[article_idx].asnumpy()
    article_recs = np.argsort(-article_sims)[:n_recs + 1]

    # Top recommendations
    df_recs = df_articles.loc[list(article_recs)]
    df_recs["similarity"] = article_sims[article_recs]

    return df_recs

Get the top 10 recommendations from the article at index 3.

In [12]:
df_recs = get_recommendations(df_articles = articles,
    article_idx = 3, mx_mat = mx_recsys, n_recs=10)

Show the recommendations in the DataFrame.

In [13]:
df_recs[["title", "similarity"]]

Unnamed: 0,title,similarity
3,I feared my life lacked meaning. Cancer pushed...,1.0
845,2016 Eyewitness: our summary of the defining i...,0.116453
558,"Very hot drinks may cause cancer, but coffee d...",0.110511
374,Facing my fear: did I have the cancer mutation...,0.100189
959,Gay Talese: ‘Most journalists are voyeurs. Of ...,0.087257
202,‘None of the old rules apply’: Dave Eggers tra...,0.083596
958,Chesley Sullenberger: an old-fashioned kind of...,0.080756
876,"‘It’s not about your age, it’s about your idea...",0.078553
76,"Tragic, fascinating, brilliant – life of ‘wild...",0.078132
47,Transgender stories: ’People think we wake up ...,0.076837
