In [2]:
# Import packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from scipy.sparse import dok_matrix, csr_matrix
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import re
import pandas as pd
import numpy as np

Tf-idf
-----

In [62]:
# Get top 9 articles
def find_most_similar_posts_tfidf(k=3):
    df = pd.read_csv('../data/fb_news_posts_20K.csv')[['post_id', 'message']]
    df.fillna('', inplace=True)
    tfidf = TfidfVectorizer().fit_transform(df['message'])
    similar = []
    similar_scores = []
    for i in range(0, df.shape[0]):
        cosine_similarities = linear_kernel(tfidf[i:i + 1], tfidf).flatten()
        sim_ids = cosine_similarities.argsort()[-2:-(2 + k):-1]  # The the k most similar posts
        sim = cosine_similarities[sim_ids]
        rec_posts = []
        for sim_index in sim_ids:
            postid = df.iloc[sim_index]['post_id']
            rec_posts.append(postid)
        similar.append(rec_posts)
        similar_scores.append(sim)
    df['most_similar'] = similar
    df['most_similar_rating'] = similar_scores
    df.to_csv(path_or_buf='data/fb_news_posts_20K_tfidf.csv',
              index=False,
              columns=['post_id', 'most_similar', 'most_similar_rating'])

    return (0)
find_most_similar_posts_tfidf(k=9)

0

Item-item Filtering
------------------

In [86]:
def find_most_similar_posts_collabfilter(comments, k=9):
#     comments = pd.read_csv("../data/fb_news_comments_1000k_cleaned.csv")
    user_post = comments[['from_id', 'post_id']]
    user_post.drop_duplicates(inplace=True)
    unique_users = user_post.from_id.drop_duplicates().values
    unique_posts = user_post.post_id.drop_duplicates().values
    users_map = dict(zip(unique_users, range(len(unique_users))))
    posts_map = dict(zip(unique_posts, range(len(unique_posts))))
    user_all_posts = user_post.groupby('from_id')['post_id'].apply(list).reset_index()
    
    item_matrix = dok_matrix((len(unique_users), len(unique_posts)), dtype=np.float32)
    for l in range(user_all_posts.shape[0]):
        i = users_map[user_all_posts.iloc[l].from_id]
        posts = user_all_posts.iloc[l].post_id
        for post in posts:
            j = posts_map[post]
            item_matrix[i, j] = 1
    
    cosine_sim = cosine_similarity(item_matrix.transpose())
    
    similar_posts = []
    similar_rating = []
    for l in range(cosine_sim.shape[0]):
        source_sim = cosine_sim[l,:]
        sim_ids = np.argpartition(source_sim, -k)[-k:]
        sim = source_sim[sim_ids]
        sim_posts = [unique_posts[d] for d in sim_ids]
        similar_posts.append(sim_posts)
        similar_rating.append(sim)
    
    df = pd.DataFrame(data = {'post_id': unique_posts,
                             'most_similar': similar_posts,
                             'most_similar_rating': similar_rating})
#     df.to_csv(path_or_buf='data/fb_news_posts_20K_cf.csv',
#               index=False)
    return (df)
find_most_similar_posts_collabfilter(comments)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,most_similar,most_similar_rating,post_id
0,"[101988643193353_1474808579244679, 22873566721...","[0.031403713, 0.03396831, 0.034139436, 0.03413...",228735667216_10154890879532217
1,"[228735667216_10154867467342217, 6499393458_10...","[0.030934412, 0.031016141, 0.031782087, 0.0335...",228735667216_10154890968202217
2,"[326683984410_10155756905094411, 228735667216_...","[0.014537095, 0.020965697, 0.01657484, 0.02096...",228735667216_10154890852247217
3,"[114050161948682_1557918010895216, 22873566721...","[0.023057148, 0.030151132, 0.030151132, 0.0402...",228735667216_1426789250735491
4,"[228735667216_10154889414912217, 228735667216_...","[0.030304577, 0.030618623, 0.033333335, 0.0365...",228735667216_10154890645702217
5,"[228735667216_10154884554312217, 228735667216_...","[0.040818453, 0.041030496, 0.044946656, 0.0410...",228735667216_10154890600247217
6,"[228735667216_10154882787207217, 228735667216_...","[0.020306924, 0.020306924, 0.020412415, 0.9999...",228735667216_10154890480662217
7,"[326683984410_10155757318994411, 228735667216_...","[0.02901294, 0.030151132, 0.030457247, 0.03178...",228735667216_10154890399087217
8,"[228735667216_1862601217090061, 228735667216_1...","[0.030000001, 0.030000001, 0.030151132, 0.0301...",228735667216_1887717684813716
9,"[228735667216_10154890399087217, 228735667216_...","[0.024931919, 0.024931919, 0.025863029, 0.0251...",228735667216_10154890122852217


Recall and Precision at K=9
----

In [288]:
def split_id(val):
    return(re.sub("[^_,\d]", "", val).split(","))

def split_rating(val, sep=","):
    if sep == " ":
        patterns = "[\[\]]"
        string_rating = re.sub(patterns, "", val).split()
    else:
        patterns = "[^.,\d]"
        string_rating = re.sub(patterns, "", val).split(",")
    float_rating = [float(r) for r in string_rating]
    return(float_rating)

def format_similarity(df, sep=","): #split string to post_id's and similarities
    df['most_similar'] = df.most_similar.apply(split_id)    
    df['most_similar_rating'] = df.most_similar_rating.apply(split_rating, sep=sep)
    return(df)

In [292]:
# is_in_top_k(test_set.iloc[1,:], doc2vec)
doc2vec.head(3)

Unnamed: 0,post_id,most_similar,most_similar_rating
0,228735667216_10154890879532217,"[15704546335_10155650265786336, 40656699159_10...","[0.8075387477874756, 0.8046978712081909, 0.780..."
1,228735667216_10154890968202217,"[155869377766434_1898357183517636, 60894670532...","[0.8500571250915527, 0.8402469754219055, 0.832..."
2,228735667216_10154890852247217,"[14660729657_10155634752274658, 5834919267_101...","[0.840069591999054, 0.7996846437454224, 0.7782..."


In [29]:
posts = pd.read_csv("../data/fb_news_posts_20K.csv")
comments = pd.read_csv("../data/fb_news_comments_1000k_cleaned.csv")

In [216]:
# results
doc2vec = pd.read_csv("data/fb_news_posts_20K_doc2v.csv")
tfidf = pd.read_csv("data/fb_news_posts_20K_tfidf.csv")
cf = pd.read_csv("data/fb_news_posts_20K_cf.csv")
# most similar: 9 most similar posts to post_id, and their similarities are in most_similar_rating

In [217]:
# clean results
doc2vec = format_similarity(doc2vec)
tfidf = format_similarity(tfidf, sep=" ")
cf = format_similarity(cf, sep=" ")

In [276]:
# potential test users: users who have read more than 1 post
users = comments.groupby('from_id').count().iloc[:, 0].reset_index() # created_time : number of post read
test_set_size = np.int(0.2*users.shape[0])
potential_test_users = users[users.created_time > 1]
test_users = potential_test_users.sample(n=test_set_size)
# comments = comments.sample(frac=1).reset_index(drop=True) # shuffle data table so the test set is different
test_set = pd.merge(test_users, comments.drop_duplicates('from_id'), on='from_id').loc[:,['from_id', 'post_id']]

In [277]:
test_set.head()

Unnamed: 0,from_id,post_id
0,1871027686480208,43179984254_10155723215644255
1,10209208272196021,69813760388_10159261241170389
2,1599429306734354,85452072376_10154552110287377
3,216339585557184,7629206115_10154455642181116
4,10213207179791414,220198801458577_1114033675408414


In [312]:
train_set = pd.merge(test_set, comments, on=['from_id','post_id'], how='outer', indicator=True)
train_set = train_set[train_set['_merge'] != 'both'].loc[:,['from_id',  'post_id']]
train_set.head()

Unnamed: 0,from_id,post_id
105415,789496347883560,7642602143_10154904627962144
105416,1478915285462764,10643211755_10155823759236756
105417,10203629023884339,273864989376427_1636292599800319
105418,317293268729168,153005644864469_856893554475671
105419,10209495384130029,153005644864469_872901282874898


In [357]:
def is_in_top_k(test_case, rec, train_set, k=9):
    user = test_case[0]
    truth = test_case[1]
    user_old_posts = train_set.loc[user]
    recs = rec[rec.post_id.isin(user_old_posts)].most_similar.values.tolist()
    recs = np.array([np.array(sublist) for sublist in recs]).flatten()
    recs_rating = rec[rec.post_id.isin(user_old_posts)].most_similar_rating.values.tolist()
    recs_rating = np.array([np.array(sublist) for sublist in recs_rating]).flatten()
    top_k_indx = np.argpartition(recs_rating, -k)[-k:]
    if truth in recs[top_k_indx]:
        return 1
    else:
        return 0
    

In [None]:
def is_in_top_k(test_case, rec, train_set):
    user = test_case[0]
    truth = test_case[1]
    user_old_posts = train_set.loc[user].post_id
    recs = rec[rec.post_id.isin(user_old_posts)].most_similar.flatten()
    if truth in recs:
        return 1
    else:
        return 0

In [315]:
train_set_unique = train_set.groupby('from_id')['post_id'].apply(list)
train_set_unique.head()

from_id
5281959998    [5281959998_10151227066069999, 5281959998_1015...
5863113009    [5863113009_10155627549148010, 5863113009_1015...
5953023255                       [5953023255_10155535265963256]
6013004059    [6013004059_10155621098074060, 6013004059_1015...
6250307292    [6250307292_10155345345317293, 6250307292_1015...
Name: post_id, dtype: object

In [358]:
test_set.head().apply(is_in_top_k, 
                      rec=tfidf, 
                      train_set=train_set_unique,
                      axis=1)

user 1871027686480208
truth 43179984254_10155723215644255
mst sim rating [0.23758174 0.24001367 0.23758174 0.24377362 0.28313291 0.25644374
 0.28711919 0.29891195 0.4514414 ]
mst sim ['1416139158459267_1570695063003675' '18343191100_10155361150956101'
 '119984188013847_1618695724809345' '10606591490_10154886881871491'
 '41632789656_10156591033634657' '687156898054966_1208041569299827'
 '18343191100_10155348233571101' '10606591490_10154887331926491'
 '147772245840_10154643886285841']
user 10209208272196021
truth 69813760388_10159261241170389
mst sim rating [0.78155776 0.78155776 0.78155776 0.78155776 0.78155776 0.78155776
 0.78155776 0.78155776 0.78155776]
mst sim ['147772245840_10154622317400841' '326683984410_10155770183334411'
 '37763684202_10155634853219203' '37763684202_10155647698879203'
 '37763684202_10155636263009203' '37763684202_10155636637759203'
 '20324257234_10155554057332235' '98658495398_10155217399835399'
 '95475020353_10159463675845354']
user 1599429306734354
truth 8545

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [349]:
is_in_top_k(test_set.iloc[1,:], doc2vec, train_set_unique)

69813760388_10159261241170389
mst sim rating [0.88532972 0.88616896 0.89278638 0.89402264 0.89766431 0.90299976
 0.94773328 0.90030879 0.97174335]
mst sim ['455410617809655_1705666756117362' '455410617809655_1723310037686367'
 '455410617809655_1595420800475292' '618786471475708_1565467036807642'
 '455410617809655_1624985004185538' '455410617809655_1706988082651896'
 '95475020353_10159443360685354' '455410617809655_1708093702541334'
 '10643211755_10155841600316756']


0

In [350]:
is_in_top_k(test_set.iloc[1,:], cf, train_set_unique)

69813760388_10159261241170389
mst sim rating [0.10153462 0.9999995  0.10936967 0.99999964 0.99999964 1.
 1.0000007  1.0000007  1.        ]
mst sim ['695526053890545_1245922578850887' '95475020353_10159464973745354'
 '95475020353_10159464877805354' '182919686769_10154657290936770'
 '182919686769_10154657578991770' '95475020353_10159457097155354'
 '182919686769_10154656795121770' '95475020353_10159456349685354'
 '182919686769_10154656608601770']


0