Try using topic modeling to calculate the semantic similarity between the texts.

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import text 

import nltk
nltk.download('punkt')


from sklearn.decomposition import LatentDirichletAllocation as LDA

# improve the model with lemmatization 
# use a different vectorizer. 

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    """Callable that tokenizes a text with NLTK and lemmatizes every token.

    Instances are passed as the ``tokenizer`` argument of the vectorizers
    built below, which call them once per document.
    """

    def __init__(self):
        # One WordNet lemmatizer per tokenizer, reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens found in ``articles``."""
        to_lemma = self.wnl.lemmatize
        return list(map(to_lemma, word_tokenize(articles)))

# Options shared by both vectorizers, so the count and tf-idf features are
# built from the same preprocessing.
_shared_vectorizer_options = dict(
    strip_accents='unicode',   # fold accented characters
    stop_words='english',      # drop the built-in English stop words
    lowercase=True,
    max_df=0.5,                # float: proportion-of-documents cutoff
    min_df=10,                 # int: absolute document-count cutoff
)

# Raw term counts; each vectorizer gets its own tokenizer instance.
count_vectorizer = text.CountVectorizer(tokenizer=LemmaTokenizer(),
                                        **_shared_vectorizer_options)

# tf-idf weighted features built with the same options.
tfidf_vectorizer = text.TfidfVectorizer(tokenizer=LemmaTokenizer(),
                                        **_shared_vectorizer_options)

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the cleaned Wikipedia physics articles. The file's saved row index
# comes back as the "Unnamed: 0" column visible in the preview below.
wiki_phys = pd.read_csv("../../nlp_clean/wiki_phys_cleaned.csv")

In [3]:
# Preview the first rows to check the available columns
# (Name, Link, Subcategory, Text, Length, Reference).
wiki_phys.head()

Unnamed: 0,Name,Link,Subcategory,Text,Length,Reference
0,Wikipedia:FAQ/Categorization,https://en.wikipedia.org/wiki/Wikipedia:FAQ/Ca...,Concepts in physics,help faq categorization faq frequently asked q...,3359,"['Help:Contents', 'Help:Contents', 'Wikipedia:..."
1,Category:Concepts in physics,https://en.wikipedia.org/w/index.php?title=Cat...,Concepts in physics,category comprises topic fundamental descripti...,94,"['Wave', 'Momentum', 'Wikipedia:FAQ/Categoriza..."
2,4D vector,https://en.wikipedia.org/wiki/4D_vector,Concepts in physics,computer science vector vector data type us in...,441,"['Computer science', 'Vector (mathematics)', '..."
3,Active and passive transformation,https://en.wikipedia.org/wiki/Active_and_passi...,Concepts in physics,physic engineering spatial transformation eucl...,1121,"['Physics', 'Engineering', 'Transformation (ma..."
4,Ansatz,https://en.wikipedia.org/wiki/Ansatz,Concepts in physics,physic mathematics ansatz german ˈʔanzats mean...,276,"['Physics', 'Mathematics', 'Help:IPA/English',..."


In [4]:
# Number of distinct subcategories among the articles.
wiki_phys['Subcategory'].nunique()

26

## Using CountVectorizer

In [5]:
# Term-count matrix for every article. astype('U') turns each entry into a
# unicode string so non-string cells cannot break the tokenizer.
data_count_vectorized = count_vectorizer.fit_transform(
    wiki_phys['Text'].values.astype('U'))

# 26 topics matches the number of distinct subcategories counted above.
# random_state is fixed so the fitted topics, and every metric derived from
# them, are reproducible across kernel restarts.
lda_model_count = LDA(n_components=26,
                      max_iter=100,
                      learning_method='batch',
                      n_jobs=3,
                      random_state=0).fit(data_count_vectorized)

## Using TfidfVectorizer

In [6]:
# tf-idf weighted matrix for every article. astype('U') turns each entry
# into a unicode string so non-string cells cannot break the tokenizer.
data_tfidf_vectorized = tfidf_vectorizer.fit_transform(
    wiki_phys['Text'].values.astype('U'))

# NOTE(review): LDA is formulated for word counts; here it is fitted on
# tf-idf weights as an experiment to compare against the count-based model.
# 26 topics matches the number of distinct subcategories counted above.
# random_state is fixed so the fitted topics, and every metric derived from
# them, are reproducible across kernel restarts.
lda_model_tfidf = LDA(n_components=26,
                      max_iter=100,
                      learning_method='batch',
                      n_jobs=3,
                      random_state=0).fit(data_tfidf_vectorized)

In [7]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    Parameters
    ----------
    model : fitted topic model exposing ``components_`` (one row per topic,
        one column per vocabulary term).
    vectorizer : fitted vectorizer whose vocabulary produced the model's
        input; must expose ``get_feature_names_out`` or the older
        ``get_feature_names``.
    top_n : int, default 10
        Number of terms printed per topic.

    For each topic this prints a ``Topic <idx>:`` line followed by a list of
    ``(term, weight)`` tuples ordered by decreasing weight.
    """
    # Look the vocabulary up once instead of once per printed term, and
    # prefer the current scikit-learn accessor when it exists
    # (``get_feature_names`` was removed in scikit-learn 1.2).
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last top_n
        # entries, i.e. the largest weights first.
        print([(feature_names[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

In [8]:
#print_topics(lda_model_count, count_vectorizer)

#print_topics(lda_model_tfidf, tfidf_vectorizer)

Some of the grouped keywords do make sense. But how do we measure the model's performance?

# Define the averaged precision metric

The averaged precision metric is the mean of the precision at top 1, top 2, top 3, ..., top 10, where the precision at top k is the fraction of the first k recommended articles that are in the same subcategory as the article you are reading.

It is not a perfect metric: you might want diversity in the categories you read, and relevant articles are not always in the same category. But it is a good metric to use for the first iteration of the recommendation engine.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

def top_n_index_sorted(array, top_n):
    """Return the indices of the ``top_n`` largest values of a 1-D array.

    Parameters
    ----------
    array : 1-D array-like of comparable values.
    top_n : int
        How many of the largest entries to select. Values larger than
        ``len(array)`` are clamped to it; zero or negative values select
        nothing.

    Returns
    -------
    numpy.ndarray
        Indices into ``array``, ordered by ascending value (the index of the
        largest value comes last).
    """
    array = np.asarray(array)
    # Clamp so that asking for more entries than exist returns them all
    # instead of raising inside argpartition.
    top_n = min(int(top_n), array.shape[0])
    if top_n <= 0:
        # ``[-0:]`` would slice the whole array, so "nothing requested"
        # has to be handled explicitly.
        return np.array([], dtype=np.intp)

    # argpartition finds the top_n largest entries without a full sort ...
    index = np.argpartition(array, -top_n)[-top_n:]

    # ... and only those top_n entries are then sorted by value.
    return index[np.argsort(array[index])]

def top10_recommend_index(index,model,data,top_n,lda_vectors=None):
    """Return the row indices of the ``top_n`` articles most similar to one article.

    Parameters
    ----------
    index : int
        Row of the article being read.
    model : fitted model exposing ``transform`` (used only when
        ``lda_vectors`` is not supplied).
    data : vectorized corpus passed to ``model.transform``.
    top_n : int
        Number of indices to return, the article itself included.
    lda_vectors : array, optional
        Precomputed ``model.transform(data)``. Pass it when calling this
        function for many articles so the corpus is not re-transformed on
        every call.

    Returns
    -------
    numpy.ndarray
        Row indices ordered by decreasing cosine similarity of their topic
        vectors. The first entry is always ``index`` itself.
    """
    if lda_vectors is None:
        # transform the data using the model
        lda_vectors = model.transform(data)
    lda_vectors = np.asarray(lda_vectors)

    # Only the similarities of this one article are needed, so compare its
    # vector against the corpus instead of building the full N x N matrix.
    query = lda_vectors[index].reshape(1, -1)
    similarity = np.asarray(cos_sim(query, lda_vectors))[0]

    # Pin the article to the top of its own ranking: another article with an
    # identical topic vector would otherwise tie with it and could take the
    # first slot.
    similarity[index] = np.inf

    top_n_index = top_n_index_sorted(similarity, top_n)

    # get the decreasing similarity index
    return top_n_index[::-1]
# example
# top10_recommend_index(2,lda_model_count,data_count_vectorized,10)


In [10]:
def top10_averaged_precision(top_n_index,data):
    """Average the precision-at-k of one recommendation list.

    Parameters
    ----------
    top_n_index : sequence of int
        Row positions into ``data``. The first entry is the article being
        read; the remaining entries are the recommended articles, best
        first.
    data : pandas.DataFrame
        Frame with a ``"Subcategory"`` column, addressed positionally.

    Returns
    -------
    float
        Mean over k = 1..len(recommendations) of the fraction of the first k
        recommendations that share the subcategory of the article being
        read. ``0.0`` when there are no recommendations to score.
    """
    # Read the labels from the frame that was passed in rather than from a
    # notebook-level global.
    labels = np.asarray(data["Subcategory"].iloc[top_n_index], dtype=object)
    if labels.shape[0] < 2:
        # Only the article itself (or nothing at all): nothing to score.
        return 0.0

    # set the ground truth
    ground_truth = labels[0]
    # True where a recommended article is in the same subcategory
    hits = labels[1:] == ground_truth

    # precision at k = (hits among the first k) / k, for every k at once
    precision_at_k = np.cumsum(hits) / np.arange(1, hits.shape[0] + 1)
    return precision_at_k.mean()

In [11]:
# Sanity check on a single article: the first 10 is the article's row
# index, the second 10 is top_n. Uses the count-based LDA model.
top10_averaged_precision(top10_recommend_index(10,lda_model_count,data_count_vectorized,10), wiki_phys)

0.37971781305114632

In [None]:
# Transform the corpus and build the article-by-article similarity matrix
# once; calling top10_recommend_index per article repeated both steps on
# every iteration.
lda_vectors_count = lda_model_count.transform(data_count_vectorized)
similarity_count = cos_sim(lda_vectors_count)
# top10_averaged_precision treats the first index as the article being read,
# so each article must rank first in its own list even when another article
# has an identical topic vector.
np.fill_diagonal(similarity_count, np.inf)

precision_list_count = list()

for i in range(wiki_phys.shape[0]):
    # top-10 row indices for article i, by decreasing similarity
    top10_index = top_n_index_sorted(similarity_count[i], 10)[::-1]
    top10_averaged = top10_averaged_precision(top10_index, wiki_phys)

    precision_list_count.append(top10_averaged)

In [None]:
# Transform the corpus and build the article-by-article similarity matrix
# once; calling top10_recommend_index per article repeated both steps on
# every iteration.
lda_vectors_tfidf = lda_model_tfidf.transform(data_tfidf_vectorized)
similarity_tfidf = cos_sim(lda_vectors_tfidf)
# top10_averaged_precision treats the first index as the article being read,
# so each article must rank first in its own list even when another article
# has an identical topic vector.
np.fill_diagonal(similarity_tfidf, np.inf)

precision_list_tfidf = list()

for i in range(wiki_phys.shape[0]):
    # top-10 row indices for article i, by decreasing similarity
    top10_index = top_n_index_sorted(similarity_tfidf[i], 10)[::-1]
    top10_averaged = top10_averaged_precision(top10_index, wiki_phys)

    precision_list_tfidf.append(top10_averaged)