In [1]:
# Import packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from scipy.sparse import dok_matrix, csr_matrix
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
import pandas as pd
import numpy as np

Tf-idf
-----

In [2]:
# Get top 9 articles
def find_most_similar_posts_tfidf(k=3):
    df = pd.read_csv('../data/fb_news_posts_20K.csv')[['post_id', 'message']]
    df.fillna('', inplace=True)
    vec = TfidfVectorizer(stop_words='english')
    tfidf = vec.fit_transform(df['message'])
    similar = []
    similar_scores = []
    for i in range(0, df.shape[0]):
        cosine_similarities = linear_kernel(tfidf[i:i + 1], tfidf).flatten()
        sim_ids = cosine_similarities.argsort()[-2:-(2 + k):-1]  # The the k most similar posts
        sim = cosine_similarities[sim_ids]
        rec_posts = []
        for sim_index in sim_ids:
            postid = df.iloc[sim_index]['post_id']
            rec_posts.append(postid)
        similar.append(rec_posts)
        similar_scores.append(sim)
    df['most_similar'] = similar
    df['most_similar_rating'] = similar_scores
    df.to_csv(path_or_buf='data/fb_news_posts_20K_tfidf.csv',
              index=False,
              columns=['post_id', 'most_similar', 'most_similar_rating'])

    return (0)
find_most_similar_posts_tfidf(k=9)

KeyboardInterrupt: 

Item-item Filtering
------------------

In [86]:
def find_most_similar_posts_collabfilter(comments, k=9):
#     comments = pd.read_csv("../data/fb_news_comments_1000k_cleaned.csv")
    user_post = comments[['from_id', 'post_id']]
    user_post.drop_duplicates(inplace=True)
    unique_users = user_post.from_id.drop_duplicates().values
    unique_posts = user_post.post_id.drop_duplicates().values
    users_map = dict(zip(unique_users, range(len(unique_users))))
    posts_map = dict(zip(unique_posts, range(len(unique_posts))))
    user_all_posts = user_post.groupby('from_id')['post_id'].apply(list).reset_index()
    
    item_matrix = dok_matrix((len(unique_users), len(unique_posts)), dtype=np.float32)
    for l in range(user_all_posts.shape[0]):
        i = users_map[user_all_posts.iloc[l].from_id]
        posts = user_all_posts.iloc[l].post_id
        for post in posts:
            j = posts_map[post]
            item_matrix[i, j] = 1
    
    cosine_sim = cosine_similarity(item_matrix.transpose())
    
    similar_posts = []
    similar_rating = []
    for l in range(cosine_sim.shape[0]):
        source_sim = cosine_sim[l,:]
        sim_ids = np.argpartition(source_sim, -k)[-k:]
        sim = source_sim[sim_ids]
        sim_posts = [unique_posts[d] for d in sim_ids]
        similar_posts.append(sim_posts)
        similar_rating.append(sim)
    
    df = pd.DataFrame(data = {'post_id': unique_posts,
                             'most_similar': similar_posts,
                             'most_similar_rating': similar_rating})
#     df.to_csv(path_or_buf='data/fb_news_posts_20K_cf.csv',
#               index=False)
    return (df)
find_most_similar_posts_collabfilter(comments)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,most_similar,most_similar_rating,post_id
0,"[101988643193353_1474808579244679, 22873566721...","[0.031403713, 0.03396831, 0.034139436, 0.03413...",228735667216_10154890879532217
1,"[228735667216_10154867467342217, 6499393458_10...","[0.030934412, 0.031016141, 0.031782087, 0.0335...",228735667216_10154890968202217
2,"[326683984410_10155756905094411, 228735667216_...","[0.014537095, 0.020965697, 0.01657484, 0.02096...",228735667216_10154890852247217
3,"[114050161948682_1557918010895216, 22873566721...","[0.023057148, 0.030151132, 0.030151132, 0.0402...",228735667216_1426789250735491
4,"[228735667216_10154889414912217, 228735667216_...","[0.030304577, 0.030618623, 0.033333335, 0.0365...",228735667216_10154890645702217
5,"[228735667216_10154884554312217, 228735667216_...","[0.040818453, 0.041030496, 0.044946656, 0.0410...",228735667216_10154890600247217
6,"[228735667216_10154882787207217, 228735667216_...","[0.020306924, 0.020306924, 0.020412415, 0.9999...",228735667216_10154890480662217
7,"[326683984410_10155757318994411, 228735667216_...","[0.02901294, 0.030151132, 0.030457247, 0.03178...",228735667216_10154890399087217
8,"[228735667216_1862601217090061, 228735667216_1...","[0.030000001, 0.030000001, 0.030151132, 0.0301...",228735667216_1887717684813716
9,"[228735667216_10154890399087217, 228735667216_...","[0.024931919, 0.024931919, 0.025863029, 0.0251...",228735667216_10154890122852217


Recall and Precision at K=9
----

In [288]:
def split_id(val):
    return(re.sub("[^_,\d]", "", val).split(","))

def split_rating(val, sep=","):
    if sep == " ":
        patterns = "[\[\]]"
        string_rating = re.sub(patterns, "", val).split()
    else:
        patterns = "[^.,\d]"
        string_rating = re.sub(patterns, "", val).split(",")
    float_rating = [float(r) for r in string_rating]
    return(float_rating)

def format_similarity(df, sep=","): #split string to post_id's and similarities
    df['most_similar'] = df.most_similar.apply(split_id)    
    df['most_similar_rating'] = df.most_similar_rating.apply(split_rating, sep=sep)
    return(df)

In [398]:
def is_in_top_k(test_case, rec, train_set, k=9):
    user = test_case[0]
    truth = test_case[1]
    user_old_posts = train_set.loc[user]
    recs = rec[rec.post_id.isin(user_old_posts)].most_similar.values.tolist()
    recs = np.array([np.array(sublist) for sublist in recs]).flatten()
    recs_rating = rec[rec.post_id.isin(user_old_posts)].most_similar_rating.values.tolist()
    recs_rating = np.array([np.array(sublist) for sublist in recs_rating]).flatten()
    top_k_indx = np.argpartition(recs_rating, -k)[-k:]
    if truth in recs[top_k_indx]:
        return 1
    else:
        return 0

In [29]:
posts = pd.read_csv("../data/fb_news_posts_20K.csv")
comments = pd.read_csv("../data/fb_news_comments_1000k_cleaned.csv")

In [397]:
def main(comments):
#     comments = pd.read_csv("../data/fb_news_comments_1000k_cleaned.csv")
    doc2vec = format_similarity(pd.read_csv("data/fb_news_posts_20K_doc2v.csv"))
    tfidf = format_similarity(pd.read_csv("data/fb_news_posts_20K_tfidf.csv"), sep=" ")
    
    n_fold = 5
    test_prob = 1/n_fold
    if n_fold == 1:
        test_prob = 0.2
    
    # 5 fold cross validation
    comments = comments.drop_duplicates(['from_id', 'post_id'])
    users = comments.groupby('from_id').count().iloc[:, 0].reset_index() # created_time : number of post read
    potential_test_users = users[users.created_time > 1]
    test_set_size = np.int(test_prob*potential_test_users.shape[0])
    
    doc2vec_cv_recall = []
    tfidf_cv_recall = []
    cf_cv_recall = []
    cv_recalls = [doc2vec_cv_recall, tfidf_cv_recall, cf_cv_recall]
    
    for n in range(n_fold):
        
        test_users = potential_test_users.iloc[(n*test_set_size): ((n+1)*test_set_size - 1), :]
        test_set = pd.merge(test_users, comments.drop_duplicates('from_id'), on='from_id').loc[:,['from_id', 'post_id']]
        
        train_set = pd.merge(test_set, comments, on=['from_id','post_id'], how='outer', indicator=True)
        train_set = train_set[train_set['_merge'] != 'both'].loc[:,['from_id',  'post_id']]
        train_set_unique = train_set.groupby('from_id')['post_id'].apply(list)
        
        cf = find_most_similar_posts_collabfilter(train_set)
        
        recommenders = [doc2vec, tfidf, cf]
        
        # recommendation results
        for m in range(3):        
            r = test_set.apply(is_in_top_k, 
                              rec=recommenders[m], 
                              train_set=train_set_unique,
                              axis=1)
            cv_recalls[m].append(sum(r)/len(r))
            
    return( cv_recalls)

In [401]:
cv_recalls = main(comments)

In [402]:
cv_recalls

[[0.004743594350810364,
  0.005067021238365616,
  0.004743594350810364,
  0.004204549538218277,
  0.004276422179897222],
 [0.014230783052431093,
  0.013044884464728502,
  0.012038667481223272,
  0.012290221727099579,
  0.010996514176878571],
 [0.03212707083048837,
  0.03241456139720415,
  0.028353757142343766,
  0.02932403780500952,
  0.028749056671577965]]

In [403]:
np.save("cv_recalls", cv_recalls)

In [None]:
import matplotlib.pyplot as plt
