In [19]:
import numpy as np
import openai
import os
import pandas as pd
import plotly.express as px
import random
import torch
from tqdm.auto import tqdm
tqdm.pandas()

import src.constants as constants
from src.common_utils import read_pickled_data
from src.data_exploration.embedding_utils import embed_news, embed_categories, load_embeddings, build_feature_vectors, load_feature_vectors, one_hot_encode_categories, embed_news_openai

# Fixed seed so that the random sampling further down is reproducible.
SEED = 42
random.seed(SEED)

In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load the data.

In [21]:
# Path components of the pickled news frame used for embedding.
emb_news_path_parts = [constants.CONCAT_ALL_PATH, "preprocessed", "emb_news.pkl"]
data_news = read_pickled_data(emb_news_path_parts)
data_news.head()

Unnamed: 0,news_id,category,sub_category,title,abstract,title_and_abstract,all
0,N88753,lifestyle,royals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","The Brands Queen Elizabeth, Prince Charles, an...","category: lifestyle, sub-category: royals. tit..."
1,N45436,news,science and technology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,Walmart Slashes Prices on Last-Generation iPad...,"category: news, sub-category: science and tech..."
2,N23144,health,weight loss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,50 Worst Habits For Belly Fat These seemingly ...,"category: health, sub-category: weight loss. t..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,Dispose of unwanted prescription drugs during ...,"category: health, sub-category: medical. title..."
4,N93187,news,worldwide,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,The Cost of Trump's Aid Freeze in the Trenches...,"category: news, sub-category: worldwide. title..."


In [23]:
# Number of whitespace-separated words in each title+abstract text.
words_per_text = data_news['title_and_abstract'].str.split().apply(len)
# Frequency of each distinct word count.
count = words_per_text.value_counts()
# NOTE(review): this is the frequency of the most common word count, not the
# length of the longest text — confirm this is the number that is wanted.
count.max()

2992

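Produce the embeddings for the news text. The plan covers the title, the abstract and the concatenation of title and abstract; the cell below currently embeds only the combined `all` column. Each result is stored in its own embeddings map.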

In [18]:
# Sentence-transformers model used for the news text, and the columns to embed with it.
news_model_name = "sentence-transformers/all-mpnet-base-v2"
columns_to_embed = ["all"]

for to_embed in columns_to_embed:
    # `emb_map` is rebound on every iteration; only the last result stays in scope.
    emb_map = embed_news(
        data_news,
        news_model_name,
        to_embed=to_embed,
        save_dir=constants.CONCAT_ALL_PATH,
    )

[INFO] embeddings will be saved in c:\workbench\developer\drlnrs\dataset_MIND\MINDlarge_all\embeddings
[INFO] device: cuda
[INFO] loading model: sentence-transformers/all-mpnet-base-v2
[INFO] preparing dataloader
[INFO] embedding: all


  0%|          | 0/1019 [00:00<?, ?it/s]

[INFO] creating embeddings map


0it [00:00, ?it/s]

[INFO] saving


In the following cells, we examine the structure of our embeddings. We sample 100,000 random embeddings from an embeddings map, compute the mean squared norm of the sampled vectors, and plot a histogram of the vector elements.

In [6]:
# Load the embeddings map for the "abstract" text.
# The path comes from the `constants` module imported at the top: the bare name
# CONCAT_ALL_PATH is never imported in this notebook and raises NameError on a
# fresh kernel.
# NOTE(review): the variable is named `title_embeddings_map` but `to_embed` is
# "abstract" — confirm which of the two is intended.
title_embeddings_map = load_embeddings(constants.CONCAT_ALL_PATH, to_embed="abstract")

In [None]:
# Draw a random subset of the embeddings (reproducible via the seed set at the top).
# random.sample raises ValueError when asked for more items than the population
# holds, so the sample size is clamped to the number of available embeddings.
all_embeddings = list(title_embeddings_map.values())
n_samples = min(100000, len(all_embeddings))
samples = random.sample(all_embeddings, n_samples)
# presumably the map values are CPU torch tensors (`.numpy()` is called on each) — TODO confirm
samples = np.array([emb.numpy() for emb in samples])
# Flatten every vector element into a single "point" column for the histogram.
points = pd.DataFrame(samples.reshape(-1, 1), columns=["point"])

In [None]:
# Squared L2 norm of each sampled vector, averaged over the sample.
squared_norms = np.square(samples).sum(axis=1)
print(squared_norms.mean())

In [None]:
# Histogram of all individual embedding values, normalised to probabilities.
histogram_options = dict(
    x="point",
    range_x=[-0.5, 0.5],
    nbins=200,
    histnorm='probability',
    title="Embeddings Distribution",
    width=750,
    height=500,
    # TODO template
)
fig = px.histogram(points, **histogram_options)
# One x-axis tick every 0.1.
fig.update_xaxes(dtick=0.1)
fig.show()

# Features

In [9]:
# Load the news frame that carries the numeric feature columns.
# The path comes from the `constants` module: the bare name CONCAT_ALL_PATH is
# never imported in this notebook and raises NameError on a fresh kernel.
# Note that this rebinds `data_news`, which held the embedding frame before.
data_news = read_pickled_data([constants.CONCAT_ALL_PATH, "preprocessed", "exp_news.pkl"])
data_news.head()

Unnamed: 0,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entities,title_tokens,title_tokens_no_stopwords,...,abstract_length,abstract_no_stopwords_length,title_and_abstract_length,title_and_abstract_no_stopwords_length,survival_time_hrs,first_read_timestamp,clicked,ignored,shown,engagement_percentage
0,N88753,lifestyle,lifestyleroyals,"the brands queen elizabeth, prince charles, an...","shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],"[the, brands, queen, elizabeth, prince, charle...","[brands, queen, elizabeth, prince, charles, pr...",...,12,7,23,15,161.186389,2019-11-11 07:55:42,0,1,1,0.0
1,N45436,news,newsscienceandtechnology,walmart slashes prices on last-generation ipads,apple's new ipad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[walmart, slashes, prices, on, last-generation...","[walmart, slashes, prices, last-generation, ip...",...,11,10,17,15,0.0,NaT,0,0,0,0.0
2,N23144,health,weightloss,50 worst habits for belly fat,these seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[50, worst, habits, for, belly, fat]","[50, worst, habits, belly, fat]",...,19,11,25,16,0.0,NaT,0,0,0,0.0
3,N86255,health,medical,dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[],"[dispose, of, unwanted, prescription, drugs, d...","[dispose, unwanted, prescription, drugs, dea, ...",...,0,0,11,8,0.0,NaT,0,0,0,0.0
4,N93187,news,newsworld,the cost of trump's aid freeze in the trenches...,lt. ivan molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...","[the, cost, of, trump, aid, freeze, in, the, t...","[cost, trump, aid, freeze, trenches, ukraine, ...",...,36,21,48,28,0.0,NaT,0,0,0,0.0


In [10]:
# Text-length features, with and without stopwords.
length_columns = [
    'title_length',
    'title_no_stopwords_length',
    'abstract_length',
    'abstract_no_stopwords_length',
    'title_and_abstract_length',
    'title_and_abstract_no_stopwords_length',
]
# Survival and engagement features.
engagement_columns = [
    'survival_time_hrs',
    'clicked',
    'ignored',
    'shown',
    'engagement_percentage',
]
# Feature columns passed to the feature-vector builder, in this order.
columns = length_columns + engagement_columns

In [11]:
# Build the feature-vector map from the selected columns under the map name "no_ts".
# The save directory comes from the `constants` module: the bare name
# CONCAT_ALL_PATH is never imported in this notebook and raises NameError on a
# fresh kernel.
features_map = build_feature_vectors(
    data_news,
    feature_columns=columns,
    map_name="no_ts",
    save_dir=constants.CONCAT_ALL_PATH,
)

[INFO] embeddings will be saved in ../../dataset_MIND\MINDlarge_all\embeddings
[INFO] converting timestamp column


100%|██████████| 130379/130379 [00:00<00:00, 1552067.23it/s]


[INFO] building features map


100%|██████████| 130379/130379 [00:07<00:00, 17861.32it/s]


[INFO] saving


# Category Embeddings

In [5]:
# Sentence-transformers model used for the category and sub-category texts.
category_model_name = "sentence-transformers/all-MiniLM-L12-v2"
cat_em, sub_cat_em = embed_categories(
    data_news, category_model_name, save_dir=constants.CONCAT_ALL_PATH
)

[INFO] embeddings will be saved in c:\workbench\developer\drlnrs\dataset_MIND\MINDlarge_all\embeddings
[INFO] device: cuda
[INFO] loading model: sentence-transformers/all-MiniLM-L12-v2
[INFO] preparing dataloader
[INFO] embedding: category


  0%|          | 0/1019 [00:00<?, ?it/s]

[INFO] creating embeddings map


0it [00:00, ?it/s]

[INFO] saving
[INFO] embedding: sub_category


  0%|          | 0/1019 [00:00<?, ?it/s]

[INFO] creating embeddings map


0it [00:00, ?it/s]

[INFO] saving


In [13]:
# One-hot encode the categories.
# Note: this rebinds `cat_em`, which the embed_categories cell above also assigns.
cat_em = one_hot_encode_categories(
    data_news,
    save_dir=constants.CONCAT_ALL_PATH,
)

[INFO] embeddings will be saved in c:\workbench\developer\drlnrs\dataset_MIND\MINDlarge_all\embeddings


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] saving


# OpenAI Embedding

In [4]:
# Read the OpenAI API key from the environment instead of hardcoding it: a key
# committed in a notebook is exposed to everyone who can read the file.
# NOTE(review): the key that was previously hardcoded here must be treated as
# leaked and revoked.
# A KeyError here means OPENAI_API_KEY is not set in the kernel's environment.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [5]:
# Embed the news with the OpenAI-based helper.
# Note: `emb_map` is also assigned by the embed_news cell earlier in the notebook.
emb_map = embed_news_openai(
    data_news,
    save_dir=constants.CONCAT_ALL_PATH,
)

  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] saving


In [26]:
# Location of the stored OpenAI embeddings map for title + abstract.
openai_emb_map_path = os.path.join(
    constants.CONCAT_ALL_PATH,
    "embeddings",
    "title_and_abstract_openai_emb_map.pt",
)
# NOTE(review): torch.load may unpickle arbitrary objects — only load files from a trusted source.
emb_map = torch.load(openai_emb_map_path)