In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [2]:
from pyfm import pylibfm

In [3]:
#from pyfm import pylibfm
import random
import numpy as np
import scipy.sparse as ss
import os

In [4]:
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/v-taqi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# English stop-word list; passed to TfidfVectorizer(stop_words=...) when the vectorizer is built.
stopWords = stopwords.words('english')

In [6]:
def read_news(path,filename):
    """Load news titles from a tab-separated docs file.

    Every non-empty line must carry at least four tab-separated fields:
    document id, category, subcategory and title. Only the id and the
    title are kept; any further fields are ignored.

    Args:
        path: Directory that contains the file.
        filename: Name of the TSV file inside ``path``.

    Returns:
        A ``(news, news_index)`` pair. ``news`` is a list of titles whose
        slot 0 is an empty-string placeholder, so real documents start at
        position 1. ``news_index`` maps each document id to the position of
        its title in ``news`` (the last occurrence wins for repeated ids).

    Raises:
        ValueError: If a non-empty line has fewer than four fields.
    """
    news = ['']  # slot 0 is a placeholder; real documents start at 1
    news_index = {}
    # Explicit encoding so the result does not depend on the platform default.
    # Assumes the file is UTF-8 -- TODO confirm against the dataset.
    with open(os.path.join(path, filename), encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # Tolerate blank lines instead of failing the 4-way unpack.
                continue
            doc_id, _vert, _subvert, title = line.split('\t')[0:4]
            news_index[doc_id] = len(news)
            news.append(title)

    return news,news_index

In [7]:
# Directory that holds docs.tsv and the train/test/val click logs read in the cells below.
data_root_path = './V2Data'

In [8]:
# news[i] is the title stored at position i (slot 0 is an empty placeholder);
# news_index maps a document id to that position.
news,news_index= read_news(data_root_path,'docs.tsv')

In [9]:
def read_clickhistory(path,filename):
    """Parse a click-log file into one session record per line.

    Each line holds three tab-separated fields: a user id (ignored), the
    click history and the impression. History entries are separated by
    ``#N#`` and only the part before the first ``#TAB#`` of each entry is
    kept; empty entries are dropped. The impression is split on ``#TAB#``
    into exactly three parts: whitespace-separated positive ids,
    whitespace-separated negative ids, and a third part that is ignored.

    Returns:
        A list of ``[history_ids, positive_ids, negative_ids]`` lists.
    """
    full_path = os.path.join(path, filename)
    with open(full_path) as handle:
        raw_lines = handle.readlines()

    sessions = []
    for raw in raw_lines:
        _userid, history_field, impression_field = raw.strip().split('\t')
        # Keep the id part of every history entry, skipping empty ones.
        entry_ids = (entry.split('#TAB#')[0] for entry in history_field.split('#N#'))
        history = [entry_id for entry_id in entry_ids if entry_id != '']
        pos_field, neg_field, _ = impression_field.split('#TAB#')
        sessions.append([history, pos_field.split(), neg_field.split()])
    return sessions

In [10]:
# Each element is [clicked_history_ids, positive_ids, negative_ids] for one line of the file.
train_session=read_clickhistory(data_root_path,'train.tsv')
test_session=read_clickhistory(data_root_path,'test.tsv')
val_session=read_clickhistory(data_root_path,'val.tsv')

In [11]:
def parser_user(session,news_index,news):
    """Turn every session's click history into one text string.

    For each session, the ids in its first element are looked up through
    ``news_index`` and the matching titles from ``news`` are concatenated,
    each followed by a single space. A session without history yields ''.

    Returns:
        A list of strings, one per session, in session order.
    """
    return [
        ''.join(news[news_index[doc_id]] + ' ' for doc_id in record[0])
        for record in session
    ]

In [12]:
# One string per session: the titles of that session's clicked news, each followed by a space.
train_user = parser_user(train_session,news_index,news)
test_user = parser_user(test_session,news_index,news)
val_user = parser_user(val_session,news_index,news)

In [13]:
# TF-IDF vectorizer over news titles: stop words from stopWords, at most 12000 features.
transformer = TfidfVectorizer(stop_words = stopWords,max_features=12000)
# NOTE(review): NewsVec is not referenced again in the cells visible here; fit() is needed for its effect on `transformer`.
NewsVec = transformer.fit(news)
# Row i of news_vec encodes news[i] (row 0 is the empty placeholder title).
news_vec = transformer.transform(news)

In [14]:
# Encode each session's clicked-title text with the vectorizer fitted on the news titles,
# so user and news features share one vocabulary.
train_user_vec = transformer.transform(train_user)
test_user_vec = transformer.transform(test_user)
val_user_vec = transformer.transform(val_user)

In [15]:
def newsample(nnn,ratio):
    """Draw ``ratio`` items at random from the list ``nnn``.

    When ``nnn`` holds fewer than ``ratio`` items, the pool is repeated just
    often enough to reach ``ratio`` before sampling, so an item may appear
    more than once in the result.

    Args:
        nnn: List of candidates to sample from.
        ratio: Number of items to draw.

    Returns:
        A list of ``ratio`` sampled items, or an empty list when ``nnn`` is
        empty (there is nothing to repeat).
    """
    if ratio >len(nnn):
        if len(nnn) == 0:
            # Previously this fell through to ``ratio // 0`` and crashed.
            return []
        return random.sample(nnn*(ratio//len(nnn)+1),ratio)
    else:
        return random.sample(nnn,ratio)

def get_train_session(session,npratio,news_index):
    """Flatten sessions into (session index, news position, label) training rows.

    For every positive id of every session one row labelled 1 is produced,
    immediately followed by ``npratio`` rows labelled 0 whose ids are drawn
    from that session's negatives with ``newsample``.

    Returns:
        ``(user_id, doc_id, label_id)`` numpy arrays of equal length;
        ``user_id`` and ``doc_id`` are int32, ``label_id`` uses numpy's
        default dtype for the collected labels.
    """
    owners, docs, labels = [], [], []

    def add_row(owner, raw_id, target):
        # Resolve the id first so an unknown id fails before anything is appended.
        position = news_index[raw_id]
        owners.append(owner)
        docs.append(position)
        labels.append(target)

    for sess_no, (_history, positives, negatives) in enumerate(session):
        for pos_id in positives:
            add_row(sess_no, pos_id, 1)
            for neg_id in newsample(negatives, npratio):
                add_row(sess_no, neg_id, 0)

    return (
        np.array(owners, dtype='int32'),
        np.array(docs, dtype='int32'),
        np.array(labels),
    )

def get_test_session(session,news_index):
    """Flatten sessions into evaluation rows, keeping every candidate.

    For each session all positives come first (label 1), then all
    negatives (label 0). The half-open row range ``[start, end]`` covered
    by each session is recorded so scores can be sliced per impression.

    Returns:
        ``(sess_all, label, user_id, sess_location)`` where the first three
        are int32 numpy arrays (news positions, labels, session indices) and
        ``sess_location`` is a list of ``[start, end]`` pairs, one per session.
    """
    doc_positions, labels, owners, spans = [], [], [], []

    for sess_no, (_history, positives, negatives) in enumerate(session):
        begin = len(doc_positions)
        for target, group in ((1.0, positives), (0.0, negatives)):
            for raw_id in group:
                doc_positions.append(news_index[raw_id])
                labels.append(target)
                owners.append(sess_no)
        spans.append([begin, len(doc_positions)])

    sess_all = np.array(doc_positions, dtype='int32')
    user_id = np.array(owners, dtype='int32')
    label = np.array(labels, dtype='int32')

    return sess_all,label,user_id,spans

In [16]:
# Training rows: npratio=1, i.e. one sampled negative per positive.
train_user_id, train_doc_id, train_label_id = get_train_session(train_session,1,news_index)
# Test/val rows keep every candidate; *_impressionid holds the [start, end] row range of each session.
test_doc_id, test_label_id, test_user_id, test_impressionid = get_test_session(test_session,news_index)
val_doc_id, val_label_id, val_user_id, val_impressionid = get_test_session(val_session,news_index)

In [17]:
# For every training row, pick the TF-IDF row of its session and of its candidate news.
train_user_feature = train_user_vec[train_user_id.astype('int')]
train_news_feature = news_vec[train_doc_id]

In [18]:
# Concatenate user and news features column-wise and convert to CSR for the model.
train_x = ss.hstack([train_user_feature,train_news_feature]).tocsr()

In [19]:
# Factorization machine configured for classification.
# NOTE(review): the following cell (In [20]) rebinds `fm` to an argument-less
# pylibfm.FM(), so when cells run in order these settings are not the ones fitted.
fm = pylibfm.FM(
            num_factors=10, 
            num_iter=1, 
            verbose=True, 
            task="classification", 
            initial_learning_rate=0.001, 
            learning_rate_schedule="optimal")

In [20]:
# NOTE(review): this replaces the configured model from In [19] with an argument-less FM();
# presumably the library defaults then apply -- confirm this is intended before trusting the metrics.
fm = pylibfm.FM()

In [21]:
# NOTE(review): stray scratch expression; it is not assigned to any name.
1

1

In [22]:
# Fit the model on the stacked training features; %time reports how long the call takes.
%time fm.fit(train_x, train_label_id)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training log loss: 0.66477
CPU times: user 5min 57s, sys: 1.13 s, total: 5min 59s
Wall time: 6min 2s


In [23]:
# Score buffer with one slot per test row (np.zeros default dtype); filled by compute_test_in_batch.
test_predict_label = np.zeros(test_label_id.shape)

In [24]:
def compute_test_in_batch(test_predict_label):
    """Fill ``test_predict_label`` with model scores, 1024 rows per batch.

    Reads the notebook globals ``test_user_vec``, ``test_user_id``,
    ``test_doc_id``, ``news_vec`` and ``fm``. The batch counter is printed
    every 1000 batches. The buffer is written in place and also returned.
    """
    batch_size = 1024
    total = test_predict_label.shape[0]
    # Ceiling division; at least one (possibly empty) batch is always processed.
    n_batches = max(1, -(-total // batch_size))
    for batch_no in range(n_batches):
        if batch_no % 1000 == 0:
            print(batch_no)
        start = batch_no * batch_size
        ed = min(start + batch_size, total)
        user_rows = test_user_vec[test_user_id[start:ed].astype('int')]
        news_rows = news_vec[test_doc_id[start:ed]]
        batch_x = ss.hstack([user_rows, news_rows]).tocsr()
        test_predict_label[start:ed] = fm.predict(batch_x)
    return test_predict_label

In [25]:
# compute_test_in_batch writes into the buffer in place and returns the same buffer.
test_predict_label = compute_test_in_batch(test_predict_label)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [26]:
def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain of the ``k`` highest-scored items.

    Items are ranked by descending ``y_score``; the item at 1-based rank
    ``r`` contributes ``(2 ** relevance - 1) / log2(r + 1)``.

    Args:
        y_true: Relevance of each item.
        y_score: Predicted score of each item.
        k: Number of top-ranked items to include.

    Returns:
        The summed discounted gain.
    """
    ranking = np.argsort(y_score)[::-1]
    top_relevance = np.take(y_true, ranking[:k])
    # Rank r is discounted by log2(r + 1), so the arguments run 2, 3, ...
    log_arguments = np.arange(2, len(top_relevance) + 2)
    return np.sum((2 ** top_relevance - 1) / np.log2(log_arguments))


def ndcg_score(y_true, y_score, k=10):
    """Normalized DCG at ``k``: DCG of the predicted order over the ideal DCG.

    Args:
        y_true: Relevance of each item.
        y_score: Predicted score of each item.
        k: Number of top-ranked items to include.

    Returns:
        ``dcg_score(y_true, y_score, k) / dcg_score(y_true, y_true, k)``, or
        ``0.0`` when the ideal DCG is zero (no relevant item), where the
        ratio would otherwise be 0/0 and evaluate to NaN.
    """
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        # No relevant item at all: avoid 0/0 -> NaN poisoning downstream means.
        return 0.0
    actual = dcg_score(y_true, y_score, k)
    return actual / best


def mrr_score(y_true, y_score):
    """Mean reciprocal rank of the relevant items.

    Items are ranked by descending ``y_score``; each item contributes
    ``relevance / rank`` (1-based rank) and the sum is divided by the total
    relevance. For 0/1 labels this is the average reciprocal rank of the
    positive items.

    Args:
        y_true: Relevance of each item.
        y_score: Predicted score of each item.

    Returns:
        The score, or ``0.0`` when the total relevance is zero, where the
        ratio would otherwise be 0/0 and evaluate to NaN.
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order)
    total_relevance = np.sum(y_true)
    if total_relevance == 0:
        # No relevant item: avoid 0/0 -> NaN poisoning downstream means.
        return 0.0
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    return np.sum(rr_score) / total_relevance

In [27]:
def evalute(score,test_impressionid):
    """Print and return the mean AUC, MRR, nDCG@5 and nDCG@10 over impressions.

    NOTE(review): this definition is shadowed by the later ``evalute``
    definitions in this notebook. It reads ``test_label``, which is not a
    parameter here and is not assigned in the cells visible in this view,
    and it calls ``roc_auc_score``, which is imported only in a later cell.

    Args:
        score: Sequence of predicted scores, sliced per impression.
        test_impressionid: Sequence of ``(start, ed)`` slice bounds, one per impression.

    Returns:
        ``[mean_auc, mean_mrr, mean_ndcg5, mean_ndcg10]``.
    """
    all_auc=[]
    all_mrr=[]
    all_ndcg=[]
    all_ndcg2=[]
    for impression_id in range(len(test_impressionid)):
        # Slice this impression's scores and labels out of the flat arrays.
        start,ed=test_impressionid[impression_id]
        y_score=score[start:ed]
        y_true=test_label[start:ed]
            
        all_auc.append(roc_auc_score(y_true,y_score))
        all_mrr.append(mrr_score(y_true,y_score))
        all_ndcg.append(ndcg_score(y_true,y_score,5))
        all_ndcg2.append(ndcg_score(y_true,y_score,10))
    print(np.mean(all_auc),np.mean(all_mrr),np.mean(all_ndcg),np.mean(all_ndcg2))
    return [np.mean(all_auc),np.mean(all_mrr),np.mean(all_ndcg),np.mean(all_ndcg2)]

In [28]:
from sklearn.metrics import roc_auc_score

In [29]:
def evalute(score,test_label,test_impressionid):
    """Print and return the mean AUC, MRR, nDCG@5 and nDCG@10 over impressions.

    Args:
        score: Sequence of predicted scores, sliced per impression.
        test_label: Sequence of true labels aligned with ``score``.
        test_impressionid: Sequence of ``(start, ed)`` slice bounds, one per impression.

    Returns:
        ``[mean_auc, mean_mrr, mean_ndcg5, mean_ndcg10]``.
    """
    all_auc=[]
    all_mrr=[]
    all_ndcg=[]
    all_ndcg2=[]
    for start, ed in test_impressionid:
        # Slice this impression's scores and labels out of the flat arrays.
        y_score=score[start:ed]
        y_true=test_label[start:ed]
        all_auc.append(roc_auc_score(y_true,y_score))
        all_mrr.append(mrr_score(y_true,y_score))
        all_ndcg.append(ndcg_score(y_true,y_score,5))
        all_ndcg2.append(ndcg_score(y_true,y_score,10))
    # Compute each mean once and reuse it for both the printout and the result.
    means = [np.mean(all_auc),np.mean(all_mrr),np.mean(all_ndcg),np.mean(all_ndcg2)]
    print(*means)
    return means

In [30]:
def evalute(score,test_label,test_impressionid):
    """Compute per-impression AUC, MRR, nDCG@5 and nDCG@10, scaled by 100.

    Args:
        score: Sequence of predicted scores, sliced per impression.
        test_label: Sequence of true labels aligned with ``score``.
        test_impressionid: Sequence of ``(start, ed)`` slice bounds, one per impression.

    Returns:
        A 4-tuple of numpy arrays ``(auc, mrr, ndcg5, ndcg10)`` with one
        entry per impression, each multiplied by 100.
    """
    auc_vals, mrr_vals, ndcg5_vals, ndcg10_vals = [], [], [], []
    for bounds in test_impressionid:
        start, ed = bounds
        y_score = score[start:ed]
        y_true = test_label[start:ed]
        auc_vals.append(roc_auc_score(y_true, y_score))
        mrr_vals.append(mrr_score(y_true, y_score))
        ndcg5_vals.append(ndcg_score(y_true, y_score, 5))
        ndcg10_vals.append(ndcg_score(y_true, y_score, 10))
    # Convert to arrays and express every metric as a percentage.
    return tuple(
        np.array(values) * 100
        for values in (auc_vals, mrr_vals, ndcg5_vals, ndcg10_vals)
    )

def print_mean(AUC,MRR,n5,n10):
    """Print and return the mean of each metric array.

    Args:
        AUC, MRR, n5, n10: Objects exposing ``.mean()`` (e.g. numpy arrays).

    Returns:
        ``[AUC.mean(), MRR.mean(), n5.mean(), n10.mean()]``.
    """
    means = [metric.mean() for metric in (AUC, MRR, n5, n10)]
    print(*means)
    return means

In [31]:
# Per-impression metric arrays for the test set (evalute already scales them by 100).
AUC, MRR, nDCG5, nDCG10 = evalute(test_predict_label,test_label_id,test_impressionid)

In [32]:
# Average over impressions; prints and returns [AUC, MRR, nDCG@5, nDCG@10].
print_mean(AUC, MRR, nDCG5, nDCG10)

55.68677383957965 25.836799571121862 27.098444658760297 32.54843744188427


[55.68677383957965, 25.836799571121862, 27.098444658760297, 32.54843744188427]

In [36]:
# float32 score buffer with one slot per validation row; compute_val_in_batch fills it in place.
# NOTE(review): execution counts jump from In [32] to In [36] -- confirm the notebook still runs top to bottom.
val_predict_label = np.zeros(val_label_id.shape,dtype='float32')

In [37]:
def compute_val_in_batch():
    """Fill the global ``val_predict_label`` with model scores, 1024 rows per batch.

    Reads the notebook globals ``val_user_vec``, ``val_user_id``,
    ``val_doc_id``, ``news_vec`` and ``fm``. The batch counter is printed
    every 1000 batches. The buffer is written in place; nothing is returned.
    """
    batch_size = 1024
    total = val_predict_label.shape[0]
    # Ceiling division; at least one (possibly empty) batch is always processed.
    n_batches = max(1, -(-total // batch_size))
    for batch_no in range(n_batches):
        if batch_no % 1000 == 0:
            print(batch_no)
        start = batch_no * batch_size
        ed = min(start + batch_size, total)
        user_rows = val_user_vec[val_user_id[start:ed].astype('int')]
        news_rows = news_vec[val_doc_id[start:ed]]
        batch_x = ss.hstack([user_rows, news_rows]).tocsr()
        val_predict_label[start:ed] = fm.predict(batch_x)
    return None

In [38]:
# Writes scores into the global val_predict_label; the call itself returns None.
compute_val_in_batch()

0


In [39]:
# Per-impression metric arrays for the validation set (evalute already scales them by 100).
val_AUC, val_MRR, val_nDCG5, val_nDCG10 = evalute(val_predict_label,val_label_id,val_impressionid)

In [40]:
# Average over impressions; prints and returns [AUC, MRR, nDCG@5, nDCG@10].
print_mean(val_AUC,val_MRR,val_nDCG5,val_nDCG10)

56.295982350628755 25.681055902498475 27.811220951601094 33.81432924016593


[56.295982350628755, 25.681055902498475, 27.811220951601094, 33.81432924016593]