In [6]:
import os
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Note: this notebook requires that the arXiv dataset has already been downloaded and
# loaded into a pandas DataFrame before the cells below are run.

In [16]:
def preprocess(article_data):
    """De-duplicate articles by id and add a list-valued category column.

    The frame is modified in place and also returned: rows whose ``id``
    repeats an earlier row are dropped (the first occurrence is kept), and a
    ``categories_new`` column is added holding the space-separated
    ``categories`` string split into a list.

    Parameters
    ----------
    article_data : pandas.DataFrame
        Must provide the columns ``id`` and ``categories``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # In-place on purpose: the caller's frame is the one that gets cleaned.
    article_data.drop_duplicates(subset=['id'], keep='first', inplace=True)

    split_categories = article_data['categories'].str.split(' ')
    article_data['categories_new'] = split_categories.tolist()

    return article_data

In [8]:
def create_author_data(article_data):
    """Build an author -> articles lookup table.

    Each entry of ``authors_parsed`` is turned into a key by concatenating its
    first two fields (presumably last name and first name -- confirm against
    the dataset), lower-casing, and removing every character outside ``a-z``.
    Entries that end up empty are discarded.

    Side effects: adds the columns ``authors_stripped`` (list of keys per
    article) and ``num_authors`` (its length) to ``article_data``.

    Parameters
    ----------
    article_data : pandas.DataFrame
        Must provide the columns ``id`` and ``authors_parsed``.

    Returns
    -------
    pandas.DataFrame
        One row per author key with columns ``Authors``, ``Articles`` (list of
        article ids) and ``Num_Articles``.
    """
    authors_stripped = []
    for parsed_authors in article_data.authors_parsed:
        # Remove all non-alpha characters. Note that this also strips
        # non-ASCII letters, because the pattern only whitelists a-z.
        keys = (
            re.sub('[^a-z]', '', (author[0] + author[1]).lower())
            for author in parsed_authors
        )
        # Remove empty strings.
        authors_stripped.append([key for key in keys if key])

    article_data['authors_stripped'] = authors_stripped
    article_data['num_authors'] = [len(keys) for keys in authors_stripped]

    # Pair ids with author lists via zip. Packing both into one np.array
    # produces a ragged nested sequence, which NumPy >= 1.24 rejects with
    # ValueError.
    authors_dict = {}
    for article_id, keys in zip(article_data.id, authors_stripped):
        for key in keys:
            authors_dict.setdefault(key, []).append(article_id)

    authors_data = pd.DataFrame({
        'Authors': list(authors_dict.keys()),
        'Articles': list(authors_dict.values()),
    })
    authors_data['Num_Articles'] = [len(articles) for articles in authors_data.Articles]

    return authors_data

In [9]:
def categorize(article_data):
    """Split the article table into one sub-table per category.

    Parameters
    ----------
    article_data : pandas.DataFrame
        Must provide a ``categories_new`` column whose values are lists of
        category labels.

    Returns
    -------
    dict
        Maps each category label to the rows of ``article_data`` that carry
        it. Every sub-table is re-indexed from 0; the previous index is kept
        as an ``index`` column.
    """
    labels_per_article = article_data.categories_new
    unique_labels = {label for labels in labels_per_article for label in labels}

    by_category = {}
    for label in unique_labels:
        membership = [label in labels for labels in labels_per_article]
        # reset_index() without drop=True keeps the old index as a column.
        by_category[label] = article_data[membership].reset_index()

    return by_category

In [10]:
def vectorize(article_data):
    """Fit a separate TF-IDF model on the abstracts of every category.

    Parameters
    ----------
    article_data : pandas.DataFrame
        Must provide ``categories_new`` (list of labels per article) and
        ``abstract`` (text) columns.

    Returns
    -------
    dict
        Maps each category label to the matrix returned by
        ``TfidfVectorizer.fit_transform`` for that category's abstracts. Rows
        follow the order in which the category's articles appear in
        ``article_data``.
    """
    categories_set = {category for categories in article_data.categories_new for category in categories}

    text_vectorized = dict()

    for category in categories_set:
        of_interest = [category in categories for categories in article_data.categories_new]
        corpus = list(article_data[of_interest].abstract)

        # max_df=.9 drops terms found in more than 90% of the documents. With a
        # single document that cap (0.9 docs) falls below the default min_df of
        # 1 and scikit-learn raises ValueError, so keep every term in that case.
        max_df = .9 if len(corpus) > 1 else 1.0
        vectorizer = TfidfVectorizer(stop_words='english', max_df=max_df)

        text_vectorized[category] = vectorizer.fit_transform(corpus)

    return text_vectorized

In [11]:
def cosine_similarity(to_ids, text_vectorized, article_data):
    """Score every article by its combined similarity to a set of articles.

    For each id in ``to_ids`` the dot product between that article's row and
    every row of ``text_vectorized`` is computed (this equals cosine
    similarity when the rows are L2-normalised, which is TfidfVectorizer's
    default -- confirm if the vectors come from elsewhere). The per-id scores
    are shifted by +1, combined with a harmonic mean across the ids, and
    shifted back by -1.

    Parameters
    ----------
    to_ids : sequence
        Ids of the reference articles; each must occur in ``article_data.id``.
    text_vectorized : scipy sparse matrix or numpy.ndarray
        One row per article, in the same row order as ``article_data``.
    article_data : pandas.DataFrame
        Must provide an ``id`` column.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(article_data)`` with the combined score.

    Raises
    ------
    IndexError
        If an id in ``to_ids`` does not occur in ``article_data``.
    """
    N = len(article_data)
    M = len(to_ids)
    similarity_array = np.empty((N, M))

    for i, to_id in enumerate(to_ids):
        # Look the row up by position rather than by index label, so the
        # result does not depend on article_data having a 0..N-1 index.
        positions = np.flatnonzero((article_data['id'] == to_id).to_numpy())
        if positions.size == 0:
            raise IndexError(f"article id {to_id!r} not found in article_data")
        row = int(positions[0])

        # Use the matrix-multiplication operator; calling NumPy functions such
        # as np.dot directly on scipy sparse matrices is discouraged.
        similarity = text_vectorized @ text_vectorized[row].T
        if hasattr(similarity, 'toarray'):
            similarity = similarity.toarray()

        # The +1 shift keeps the values fed to the harmonic mean away from 0.
        similarity_array[:, i] = np.asarray(similarity).ravel() + 1

    # Harmonic mean across the reference ids, then undo the +1 shift.
    similarity_score = M / np.sum(1 / similarity_array, axis=1)
    similarity_score -= 1

    return similarity_score

In [12]:
def text_similarity(ids, vector_dictionary, article_dictionary):
    """Recommend up to ten articles similar to the given ones.

    Every category that contains at least one of the query articles is
    searched. Within a category, candidates are scored with
    ``cosine_similarity`` against the query articles that belong to that
    category, the query articles themselves are removed, and the ten best are
    kept. The per-category winners are then merged, de-duplicated by id
    (keeping the highest score) and the overall top ten are returned.

    Parameters
    ----------
    ids : iterable
        Ids of the query articles.
    vector_dictionary : dict
        Category label -> vector matrix with one row per article.
    article_dictionary : dict
        Category label -> DataFrame with an ``id`` column, rows aligned with
        the matching matrix in ``vector_dictionary``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``similarity_score``, best first, at most ten rows.
        Empty when ``ids`` is empty.

    Raises
    ------
    IndexError
        If an id does not occur in any category table.
    """
    ids = list(ids)

    # Find the relevant categories from the category tables themselves rather
    # than from a notebook-global article frame, which may not exist (e.g.
    # when the tables were loaded from a cache).
    query_masks = {}
    found_ids = set()
    for category, articles in article_dictionary.items():
        is_query = articles['id'].isin(ids)
        if is_query.any():
            query_masks[category] = is_query
            found_ids.update(articles.loc[is_query, 'id'])

    missing_ids = [article_id for article_id in ids if article_id not in found_ids]
    if missing_ids:
        raise IndexError(f"article ids not found in any category: {missing_ids}")

    if not query_masks:
        # Nothing to compare against (ids was empty).
        return pd.DataFrame(columns=['id', 'similarity_score'])

    recommended = []
    for category, is_query in query_masks.items():
        articles = article_dictionary[category]
        vectors = vector_dictionary[category]

        # Only score against the query articles present in this category;
        # cosine_similarity cannot locate a row for any of the others.
        present = set(articles.loc[is_query, 'id'])
        ids_in_category = [article_id for article_id in ids if article_id in present]

        df = pd.DataFrame({
            'id': articles['id'],
            'similarity_score': cosine_similarity(ids_in_category, vectors, articles),
        })

        # Never recommend the query articles back to the user.
        df = df[~is_query]
        df = df.sort_values(by='similarity_score', ascending=False)
        recommended.append(df.head(10))

    recommendations = pd.concat(recommended)
    recommendations = recommendations.sort_values(by='similarity_score', ascending=False)
    recommendations = recommendations.drop_duplicates(subset=['id'], keep='first')

    return recommendations.head(10)

In [17]:
class Recommender:
    """Text-similarity article recommender with an on-disk cache.

    Attributes
    ----------
    articles : pandas.DataFrame or None
        The preprocessed article table. ``None`` when the instance was loaded
        from the cache, because that table is not cached.
    articles_categorized : dict
        Category label -> per-category article table.
    authors : pandas.DataFrame
        Author lookup table.
    vectors : dict
        Category label -> vector matrix.
    """

    def __init__(self, article_data=None, fresh=True, cache_dir='ArXiv'):
        """Build the recommender from data, or load it from the cache.

        Parameters
        ----------
        article_data : pandas.DataFrame, optional
            Raw article table. Required when ``fresh`` is true; ignored
            otherwise. It is passed to ``preprocess``, which modifies it in
            place.
        fresh : bool, default True
            If true, recompute everything from ``article_data`` and overwrite
            the cache files. If false, load the cache files instead.
        cache_dir : str, default 'ArXiv'
            Directory holding ``articles_categorized.pkl``, ``authors.pkl``
            and ``vectors.pkl``.

        Raises
        ------
        ValueError
            If ``fresh`` is true and ``article_data`` is ``None``.
        """
        categorized_path = os.path.join(cache_dir, 'articles_categorized.pkl')
        authors_path = os.path.join(cache_dir, 'authors.pkl')
        vectors_path = os.path.join(cache_dir, 'vectors.pkl')

        if fresh:
            if article_data is None:
                raise ValueError('article_data is required when fresh=True')

            # Create the cache directory up front so a missing folder does not
            # surface only after the expensive computations below.
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)

            self.articles = preprocess(article_data)
            self.articles_categorized = categorize(self.articles)
            self.authors = create_author_data(self.articles)
            self.vectors = vectorize(self.articles)

            with open(categorized_path, 'wb') as handle:
                pickle.dump(self.articles_categorized, handle, protocol=pickle.HIGHEST_PROTOCOL)

            self.authors.to_pickle(authors_path)

            with open(vectors_path, 'wb') as handle:
                pickle.dump(self.vectors, handle, protocol=pickle.HIGHEST_PROTOCOL)

        else:
            # Keep the attribute defined in both modes.
            self.articles = None

            # NOTE: unpickling can execute arbitrary code. Only load cache
            # files that were produced by this class on a trusted machine.
            with open(categorized_path, 'rb') as handle:
                self.articles_categorized = pickle.load(handle)

            self.authors = pd.read_pickle(authors_path)

            with open(vectors_path, 'rb') as handle:
                self.vectors = pickle.load(handle)

    def recommend(self, ids):
        """Return the articles most similar to the articles with the given ids.

        Delegates to ``text_similarity`` with the stored vectors and
        per-category article tables.
        """
        return text_similarity(ids, self.vectors, self.articles_categorized)