In [1]:
import sys
sys.path.insert(0, "../scripts/")

from dataset import *
from evaluate import *

In [2]:
import pandas as pd
from pathlib import Path
import sys
import numpy as np
import random

from sklearn import metrics

from tqdm import tqdm
tqdm.pandas()

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
PATH_DATA = Path.home()/Path('code/microsoft-recommendation-contest/data/')
sys.path.append(str(PATH_DATA))

In [4]:
# Hyperparameters
EPOCHaS = 10 # epoch
LR = 5e-3  # learning rate
BATCH_SIZE = 64 # batch size for training

## Content

A few main modules.

- Load data
- Construct label dataset, containing two parts per sample
    - input: [],[] list of user historical news, list of news in impression
    - ouput: [] list of labels indicating which news are clicked (0/1) in the impression
- Model
    - Train (in word2vec approach, no need)
        - input: dataset
        - ouput: trained model, evaluation per epoch(optional), tensorboard data(optional)
    - Evaluate
        - input: dataset
        - ouput: all 4 scores
    - Inference
        - input: dataset
        - ouput: ranked recommendation result as of required format

## Load data

In [5]:
behaviors_train, news_train = load_data('MINDsmall_train')
behaviors_val, news_val = load_data('MINDsmall_dev')

../data/MINDsmall_train/behaviors.tsv, data shape: (156965, 5)
../data/MINDsmall_train/news.tsv, data shape: (51282, 8)
../data/MINDsmall_dev/behaviors.tsv, data shape: (73152, 5)
../data/MINDsmall_dev/news.tsv, data shape: (42416, 8)


In [6]:
# sel columns 
news_col_sel = ['news_id','title','abstract']
behaviors_col_sel = ['impression_id','user_id','history','impressions']

news_train = news_train[news_col_sel]
news_val = news_val[news_col_sel]
behaviors_train = behaviors_train[behaviors_col_sel]
behaviors_val = behaviors_val[behaviors_col_sel]

In [7]:
behaviors_train.sample(1)

Unnamed: 0,impression_id,user_id,history,impressions
112095,112096,U23964,N51892 N37327 N2203 N42458 N9933 N54496 N59649...,N56193-1 N27581-0 N37870-0 N12042-0 N18870-0 N...


In [8]:
news_train.sample(1)

Unnamed: 0,news_id,title,abstract
15733,N58470,Law to allow speed limit increase on Oklahoma ...,A law that will allow the speed limit to be in...


In [9]:
news_df = pd.concat([news_train,news_val]).drop_duplicates()
news_df.shape

(65238, 3)

## Generate News Embedding

Embedding can be created in various ways.
1. pooling of word embedding <- use this one in this notebook
2. news title+abstract embedding based on language model
3. end-to-end trained from network
4. from graph relation
5. entity embedding provided
6. mixed

In [10]:
glove_embedding_file = '../data/glove.6B/glove.6B.50d.txt'

In [11]:
embedding, word_ind  = load_w2v_from_file(glove_embedding_file, dim_size=None)

In [12]:
news_train.head(3)

Unnamed: 0,news_id,title,abstract
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...


In [13]:
def news_emb_gen_from_df(news_df, embedding, word_ind):
    """
    create news embedding from a dataframe
    return a dictionary whose key is news id and value is a vector
    """
    result = {}
      
    for _,row in news_df.iterrows():
        
        vec = np.array([embedding[word_ind.get(x,0)] for x in row[TITLE].split()])
        result[row[NEWS_ID]]=np.mean(vec, axis=0)
    
    return result

# function test
# process_emb_df(df_emb, dim_size=None)

In [14]:
%time
news_2_vec = news_emb_gen_from_df(pd.concat([news_train, news_val]), embedding, word_ind)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs


In [15]:
def build_news_embedding(x):
    emb = []
    news_2_index = dict() # index starts from 1, reserve 0 for padding
    index_2_news = ['PADDING']
    
    for i, news_id in enumerate(news_2_vec):
        news_2_index[news_id] = i+1
        emb.append(torch.tensor(news_2_vec[news_id]))
        index_2_news.append(news_id)
    
#     emb.insert(0, torch.randn(len(emb[0])))  # insert a randomized embedding for padding
    emb.insert(0, torch.zeros(len(emb[0])))  # insert a zero vector for padding
    weight = torch.stack(emb)
#     embedding = nn.Embedding.from_pretrained(weight, freeze=True)
    
    return weight, news_2_index, index_2_news

In [16]:
torch.zeros(2)

tensor([0., 0.])

In [17]:
news_embedding, news_2_index, index_2_news = build_news_embedding(news_2_vec)

In [18]:
len(news_2_index)

65238

In [19]:
len(news_2_vec)

65238

## Construct Dataset

- Construct label dataset, containing two parts per sample
    - input: [],[] list of user historical news, list of news in impression
    - ouput: [] list of labels indicating which news are clicked (0/1) in the impression

Build Dataset and DataLoader

#### List-wise training dataset

In [None]:
class BehaviorDataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, behavior_df, mode = 'train'):
        """
        Initialization
        mode: train, eval, inference
        """
        self.df = behavior_df
        self.mode = mode

    def __len__(self):
        'Denotes the total number of samples'
        return self.df.shape[0]

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        r = self.df.iloc[index]

        # Load data and get label
        history = r['history'].split()
        if self.mode in ('train','eval'):
            impressions = [x.split('-')[0] for x in r['impressions'].split(' ')]
            labels = [int(x[-1]) for x in r['impressions'].split(' ')]
        else:
            impressions = [x for x in r['impressions'].split(' ')]
            labels = None

        return (history,impressions,labels)

In [None]:
train_ds = BehaviorDataset(behaviors_train)

In [None]:
def collate_fn_listwise(batch, max_len = 20):
    # because there is no paddng, so only support batch size =1 
    history_list, impressions_list, label_list = [], [], []
    for history,impressions,labels in batch:
        if len(history)>max_len:
            history_list.append([news_2_index.get(news_id, 0) for news_id in history[-20:]])
        else:
            n = len(history)
            tmp = [0]*(max_len-n) + [news_2_index.get(news_id, 0) for news_id in history]
            history_list.append(tmp)
#         history_list.append([news_2_index.get(news_id, 0) for news_id in history])
        impressions_list.append([news_2_index.get(news_id, 0) for news_id in impressions])
        label_list.append(labels)
        
    history_list = torch.tensor(history_list, dtype=torch.int64)
    impressions_list = torch.tensor(impressions_list, dtype=torch.int64)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    
    return history_list.to(device), impressions_list.to(device), label_list.to(device)

In [None]:
train_dl = torch.utils.data.DataLoader(
    train_ds, batch_size=1, 
    shuffle=False, num_workers=0, collate_fn=collate_batch)

#### Pointwise dataset

In [20]:
def explode_behavior_to_points(df, is_test=False):
    """df: user bahaviour df"""
    df['impression'] = df['impressions'].apply(lambda x: [y for y in x.split()])
    df_pred = df[['impression_id','user_id','history','impression']].explode('impression')
    df_pred['news_id'] = df_pred['impression'].apply(lambda x: x.split('-')[0])
    
    if not is_test:
        df_pred['label'] = df_pred['impression'].apply(lambda x: x.split('-')[1]).astype(np.uint8)
    
    return df_pred

In [21]:
class PointwiseDataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, pointwise_df, mode = 'train'):
        """
        Initialization
        mode: train, eval, test
        """
        self.df = pointwise_df
        self.mode = mode

    def __len__(self):
        'Denotes the total number of samples'
        return self.df.shape[0]

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        r = self.df.iloc[index]

        # Load data and get label
        history = r['history'].split()
        impressions = r['news_id']
        label = r['label'] if self.mode in ('train','eval') else None

        return (history,impressions,label)

In [22]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [23]:
def collate_fn_pointwise(batch):
    history_list, impressions_list, label_list = [], [], []
    for history,impression,label in batch:
        history_list.append([news_2_index.get(news_id, 0) for news_id in history])
        impressions_list.append(news_2_index.get(impression, 0))
        label_list.append(label)
    
    history_lengths = torch.tensor([len(h) for h in history_list], dtype=torch.int64) # shape: B*1, B is batchsize
    history_list = [torch.tensor(h, dtype=torch.int64) for h in history_list] 
    history_list = pad_sequence(history_list, batch_first=True, padding_value=0) # shape: B*T, T is longest length
#     history_list = torch.tensor(history_list, dtype=torch.int64)
    impressions_list = torch.tensor(impressions_list, dtype=torch.int64) # shape: B*1, B is batchsize
    label_list = torch.tensor(label_list, dtype=torch.int64) # shape: B*1, B is batchsize
    
    return history_list.to(device), history_lengths.to(device), impressions_list.to(device), label_list.to(device)

In [24]:
pointwise_train = explode_behavior_to_points(behaviors_train)
pointwise_train.dropna(inplace=True)
print(f'training sample shape = {pointwise_train.shape}')

training sample shape = (5723002, 6)


Positive vs negative ratio is about 1:22

In [25]:
pointwise_train.label.value_counts()

0    5491472
1     231530
Name: label, dtype: int64

In [26]:
train_dataset = PointwiseDataset(pointwise_train)

Verify implementation

In [34]:
# train_dl = torch.utils.data.DataLoader(
#     train_ds, batch_size=BATCH_SIZE, 
#     shuffle=True, num_workers=0, 
#     collate_fn=collate_fn_pointwise)

In [29]:
# for idx, (history, length, impression, label) in enumerate(train_dl):
#     print(history)
#     print(length)
#     print(impression)
#     print(label)
#     break

In [30]:
def visualize_prediction(history, impression, label):
    print('>'*30 )
    
    # user
    print('User viewed news history')
    k = 10 # sample size
    if len(history)>k:
        history_sample = random.sample(history, k=10)
    elif len(history)==0:
        print('\tNo history')
        history_sample = []
    else:
        history_sample = history
    
    tokens_user = set()
    for news_id in history_sample:
        title = news_df.loc[news_df['news_id']==index_2_news[news_id],'title'].values[0]
        tokens_user.update(title.split())
        print(f'\t{title}')

    # impression
    title = news_df.loc[news_df['news_id']==index_2_news[impression],'title'].values[0]
    print(f'Impression: {title}')
    tokens_news = set(title.split())
    
    # print shared tokens between user and news
    print(f'Shared tokens: {tokens_news&tokens_user}')
    
    # label
    print(f"\nPrediction: {label}")

In [31]:
news_df.loc[news_df['news_id']==index_2_news[17678]]

Unnamed: 0,news_id,title,abstract
17677,N6218,Why ex-NFL star Kellen Winslow II finally plea...,Ex-NFL tight end Kellen Winslow II struggled b...


In [32]:
for idx, (history, impression, label) in enumerate(train_dl):
    if label.numpy()[0]==0:
        continue
    h, i, l = history.numpy()[0].tolist(), impression.numpy()[0], label.numpy()[0]
    visualize_prediction(h,i,l)
    break

NameError: name 'train_dl' is not defined

Based on observation, there is barely any words in common between the user history and the news to predict.

## Model

In [49]:
from torch import nn

class PointwiseModel(nn.Module):
    def __init__(self, embedding_weight):
        super(PointwiseModel, self).__init__()
#         self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.embedding = nn.Embedding.from_pretrained(embedding_weight, freeze=True)
#         self.lstm = nn.LSTM()
#         num_class = 2
        embed_dim = embedding_weight.shape[1] # todo: dynamically assign based on embedding size.
        self.fc = nn.Linear(embed_dim*2, 1)
#         self.init_weights()

#     def init_weights(self):
#         initrange = 0.5
#         self.embedding.weight.data.uniform_(-initrange, initrange)
#         self.fc.weight.data.uniform_(-initrange, initrange)
#         self.fc.bias.data.zero_()

    def forward(self, history, length, impression):
        # B: batch size: T: sequence length, D: embedding dimension
        
        # history vector
        history_embedding = self.embedding(history)
        # to get average of vector while skipping the padding, sum first, then divided by length
        history_embedding = torch.sum(history_embedding, dim=1, keepdim=False) # # expect dimension: B*D
        user_embedding = history_embedding/length.view(-1,1)  # expect dimension: B*D
        
        # impression vector
        impressions_embedding = self.embedding(impression)  # expect dimension: B*D
#         similarities = F.cosine_similarity(user_embedding, impressions_embedding)

        h = torch.cat((impressions_embedding, user_embedding), dim=1)
        log_probs = F.log_softmax(self.fc(h), dim=1)
        return log_probs

In [95]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (history, length, impression, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(history, length, impression)
#         loss = criterion(predicted_label, label.type(torch.cuda.FloatTensor)) # MSE expect float tensor, not Long 
        loss = criterion(predicted_label, label.type(torch.cuda.FloatTensor).unsqueeze(1)) # MSE expect float tensor, not Long 
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += ((predicted_label.squeeze()>THRESHOLD) == label).sum()#.item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | accuracy {:8.3f}'.format(
                epoch, idx, len(dataloader),total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
            
#             print(f'debug: {predicted_label.size()}')
#             print(f'debug: {label.size()}')
#             print(f'debug:{((predicted_label.squeeze()>THRESHOLD) == label).sum()}')

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (history, length, impression, label) in enumerate(dataloader):
            predicted_label = model(history, length, impression)
            loss = criterion(predicted_label, label.type(torch.cuda.FloatTensor))
            total_acc += ((predicted_label>THRESHOLD) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count


In [68]:
model = PointwiseModel(embedding_weight=news_embedding)
model.to(device)

PointwiseModel(
  (embedding): Embedding(65239, 50)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)

In [69]:
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5e-2  # learning rate
BATCH_SIZE = 1024 # batch size for training
THRESHOLD = 0.5

In [93]:
from torch.utils.data.dataset import random_split

criterion = torch.nn.CrossEntropyLoss()
criterion = torch.nn.BCEWithLogitsLoss()
# criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

total_accu = None
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = torch.utils.data.DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_fn_pointwise)
valid_dataloader = torch.utils.data.DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_fn_pointwise)
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
#                              shuffle=True, collate_fn=collate_batch)


In [96]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

| epoch   1 |   500/ 5310 batches | accuracy    0.960
| epoch   1 |  1000/ 5310 batches | accuracy    0.959
| epoch   1 |  1500/ 5310 batches | accuracy    0.960
| epoch   1 |  2000/ 5310 batches | accuracy    0.960
| epoch   1 |  2500/ 5310 batches | accuracy    0.959
| epoch   1 |  3000/ 5310 batches | accuracy    0.959
| epoch   1 |  3500/ 5310 batches | accuracy    0.959
| epoch   1 |  4000/ 5310 batches | accuracy    0.960
| epoch   1 |  4500/ 5310 batches | accuracy    0.959
| epoch   1 |  5000/ 5310 batches | accuracy    0.960


ValueError: Target size (torch.Size([1024])) must be the same as input size (torch.Size([1024, 1]))

In [None]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

## Generate user embedding

Embedding can be created in various ways.
1. pooling of news embedding
3. end-to-end trained model
4. from graph relation

In [None]:
def user_emb_gen_from_pooling(behavior_df, news_2_vec):
    """
    create user embedding by the pooling of its corresponding news embedding
    return a dictionary whose key is user id and value is a vector
    """
    behavior_df['history'].fillna('',inplace=True)
    
    result = {}
    for _,row in behavior_df.iterrows():
        if len(row['history'])>0:
            vec = np.array([news_2_vec.get(news_id) for news_id in row['history'].split()] )
            vec = np.mean(vec,axis=0)
            result[row[USER_ID]] = vec

    return result

In [None]:
%time
user_2_vec = user_emb_gen_from_pooling(pd.concat([behaviors_train, behaviors_val]), news_2_vec)

In [None]:
# def user_emb_gen_from_pooling(behavior_df, news_2_vec):
#     """
#     create user embedding by the pooling of its corresponding news embedding
#     return nn.Embedding instance and word_to_index dictionary
#     """
    
#     vocab = []
#     emb = []
    
#     for _,row in behavior_df.iterrows():
#         lookup_list = [news_2_vec.get(news_id) for news_id in row['history'].split()]
#         lookup_tensor = torch.tensor(lookup_list, dtype=torch.long)
#         vec = torch.mean(news_emb(lookup_tensor),axis=0).squeeze()
#         print(vec.size())
        
#         vocab.append(row['user_id'])
#         emb.append(vec)
    
#     print(emb)
#     emb.insert(0, torch.randn(len(vec)))  # insert a randomized embedding for padding
#     weight = torch.stack(emb)
#     embedding = nn.Embedding.from_pretrained(weight, freeze=True)
    
#     word_ind = {w:i+1 for i,w in enumerate(vocab)}  # index starts from 1, reserve 0 for padding
    
#     return embedding, word_ind 

In [None]:
# # test function
# user_emb, user_word_index = user_emb_gen_from_pooling(None, news_emb, news_word_index, behaviors_train.head(3))

## Predict

In [None]:
behaviors_val.head()

prepare labelled data

In [None]:
def cosine_sim(user_v,news_v):
    return np.dot(user_v, news_v)/(np.linalg.norm(user_v)*np.linalg.norm(news_v))

In [None]:
np.zeros(3)

In [None]:
def get_label_from_behavior(df):
#     df['is_clicked'] =
    return df['impressions'].apply(lambda s: [int(x[-1]) for x in s.split(' ')])


def predict_from_behavior(df):
    def _calculate_relevence(r):
        news = [x.split('-')[0] for x in r['impressions'].split(' ')]
        global news_2_vec, user_2_vec
        
        # if user id is not calculated, pad 0 as relevence
        if r['user_id'] not in user_2_vec:
            return np.zeros(len(news))
        
        # construct similarity
        user_v = user_2_vec.get(r['user_id'])
        relevence = [cosine_sim(news_2_vec.get(news_id),user_v) if news_id in news_2_vec else 0 for news_id in news ]
        
        rank = np.argsort(np.argsort(relevence)[::-1]) + 1  # really trick i would say. check https://github.com/numpy/numpy/issues/8757
        return 1./rank
    return df.progress_apply(_calculate_relevence, axis=1)
  

In [None]:
# calculate predictions
behaviors_train['is_clicked_pred'] = predict_from_behavior(behaviors_train)

In [None]:
behaviors_train['is_clicked'] = get_label_from_behavior(behaviors_train)

In [None]:
behaviors_train.sample(3)

In [None]:
scoring_from_relevence(behaviors_train['is_clicked'], behaviors_train['is_clicked_pred'])

In [None]:
# empty user embedding is only a small portion
behaviors_train[behaviors_train['is_clicked_pred'].apply(sum)==0].shape

In [None]:
behaviors_train.shape

In [None]:
# calculate predictions
behaviors_val['is_clicked_pred'] = predict_from_behavior(behaviors_val)
behaviors_val['is_clicked'] = get_label_from_behavior(behaviors_val)

In [None]:
group_auc = behaviors_val.progress_apply(lambda r: roc_auc_score(r['is_clicked'],r['is_clicked_pred']), axis=1).mean()
print(group_auc)

In [None]:
def generate_pred_dataset(df, is_test=False):
    """df: user bahaviour df"""
    df['impression'] = df['impressions'].apply(lambda x: [y for y in x.split()])
    df_pred = df[['user_id','impression']].explode('impression')
    df_pred['news_id'] = df_pred['impression'].apply(lambda x: x.split('-')[0])
    
    if not is_test:
        df_pred['label'] = df_pred['impression'].apply(lambda x: x.split('-')[1]).astype(np.uint8)
    
    return df_pred

In [None]:
df2 = generate_pred_dataset(behaviors_train)

In [None]:
df2.head()

In [None]:
def evaluate(r):
    user_v = user_2_vec.get(r[USER_ID], None)
    news_v = news_2_vec.get(r[NEWS_ID], None)
    
    if user_v is not None and news_v is not None:
        return np.dot(user_v, news_v)/(np.linalg.norm(user_v)*np.linalg.norm(news_v))
    else:
        return 0

In [None]:
y_pred = df2.progress_apply(evaluate,axis=1)

In [None]:
max(y_pred)

In [None]:
(y_pred.values+1)/2

In [None]:
import seaborn as sns


In [None]:
sns.distplot(y_pred.sample(100000))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(df2[LABEL].values,(y_pred.values+1)/2,pos_label=1)

In [None]:
metrics.auc(fpr, tpr)

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='b', lw=2)
plt.show()

The result is no better than random guess. Need to validate the news vector. To see if closet vectors are making sense.

In [None]:
def count_click(impressions):
    return sum([int(x[-1]) for x in impressions.split(' ')])

In [None]:
behaviors_val['impressions'].apply(count_click)

In [None]:
news_val.head()

In [None]:
def predict(pred_net, user_emb, news_emb, pred_dataset):
    """
    generate a score between (0-1) on whether this news should be recommmended to this user
    
    pred_dataset: two values per instance, user_id and news_id
    """
    