In [None]:
import numpy as np
import openai
import os
import pandas as pd
import plotly.express as px
import random
import torch
from tqdm.auto import tqdm
# Register tqdm's progress_* methods (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

import src.constants as constants
from src.common_utils import read_pickled_data
from src.data.embedding_utils import embed_news, embed_categories, load_embeddings, build_feature_vectors, load_feature_vectors, one_hot_encode_categories, embed_news_openai

# Seed Python's `random` module so the `random.sample` draw used later in this notebook is repeatable.
random.seed(42)

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to the `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Load the data.

In [None]:
# Location of the news data, handed to read_pickled_data as a list of path components.
emb_news_location = [constants.CONCAT_ALL_PATH, "preprocessed", "emb_news.pkl"]
data_news = read_pickled_data(emb_news_location)

# Preview the first rows.
data_news.head()

Produce the embeddings for title, abstract and the concatenation of title and abstract. Each result is stored in its own embeddings map.

In [None]:
# Embed the news once per requested target, passing CONCAT_ALL_PATH as the save directory.
# NOTE(review): only "all" is requested here; presumably embed_news expands it to
# title / abstract / title+abstract — confirm against src.data.embedding_utils.
news_embedding_model = "sentence-transformers/all-mpnet-base-v2"
for to_embed in ("all",):
    emb_map = embed_news(
        data_news,
        news_embedding_model,
        to_embed=to_embed,
        save_dir=constants.CONCAT_ALL_PATH,
    )

In the following cells, we examine the structure of our embeddings. We sample 10,000 random embeddings from an embeddings map, compute the mean squared L2 norm of the sampled vectors, and plot a histogram of the vector elements.

In [None]:
# Load an embeddings map from CONCAT_ALL_PATH via load_embeddings.
# NOTE(review): the variable is named for titles, but to_embed="abstract" is what gets loaded —
# confirm which of the two is intended.
title_embeddings_map = load_embeddings(constants.CONCAT_ALL_PATH, to_embed="abstract")

In [None]:
# Number of embeddings to draw for the distribution check.
N_EMBEDDING_SAMPLES = 10000

# random.sample raises ValueError when asked for more items than the population holds,
# so cap the request at the number of embeddings actually available.
all_embeddings = list(title_embeddings_map.values())
sampled_embeddings = random.sample(
    all_embeddings,
    min(N_EMBEDDING_SAMPLES, len(all_embeddings)),
)

# Convert each sampled embedding to NumPy and stack them into one array
# (assumes every value exposes `.numpy()`, e.g. a CPU torch tensor — TODO confirm).
samples = np.array([emb.numpy() for emb in sampled_embeddings])

# Flatten all vector elements into a single "point" column for the histogram.
points = pd.DataFrame(samples.reshape(-1, 1), columns=["point"])

In [None]:
# Sum of squared elements per row, i.e. the squared L2 norm of each sampled vector.
squared_norms = np.square(samples).sum(axis=1)

# Report the average squared norm across the sample.
print(np.mean(squared_norms))

In [None]:
# Histogram settings: element values on the x axis, restricted to [-0.5, 0.5],
# with bar heights normalised to probabilities.
histogram_options = dict(
    x="point",
    range_x=[-0.5, 0.5],
    nbins=200,
    histnorm="probability",
    title="Embeddings Distribution",
    width=750,
    height=500,
)

fig = px.histogram(points, **histogram_options)
# One x-axis tick every 0.1.
fig.update_xaxes(dtick=0.1)
fig.show()

# Features

In [None]:
# Location of the news data for the feature section, as a list of path components.
# Note: this rebinds `data_news`, replacing the frame loaded from "emb_news.pkl" earlier.
exp_news_location = [constants.CONCAT_ALL_PATH, "preprocessed", "exp_news.pkl"]
data_news = read_pickled_data(exp_news_location)

# Preview the first rows.
data_news.head()

In [None]:
# Columns holding text-length measurements.
length_columns = [
    "title_length",
    "title_no_stopwords_length",
    "abstract_length",
    "abstract_no_stopwords_length",
    "title_and_abstract_length",
    "title_and_abstract_no_stopwords_length",
]

# The remaining columns selected for the feature vectors.
remaining_columns = [
    "survival_time_hrs",
    "clicked",
    "ignored",
    "shown",
    "engagement_percentage",
]

# Final column selection, in the order the feature vectors are built from.
columns = length_columns + remaining_columns

In [None]:
# Build feature vectors from `data_news` using the selected `columns`;
# the map is named "no_ts" and CONCAT_ALL_PATH is passed as the save directory.
feature_map_options = {
    "feature_columns": columns,
    "map_name": "no_ts",
    "save_dir": constants.CONCAT_ALL_PATH,
}
features_map = build_feature_vectors(data_news, **feature_map_options)

# Category Embeddings

In [None]:
# Embed the categories with a sentence-transformers model; embed_categories
# returns two values, bound here to `cat_em` and `sub_cat_em`.
category_embedding_model = "sentence-transformers/all-MiniLM-L12-v2"
cat_em, sub_cat_em = embed_categories(
    data_news,
    category_embedding_model,
    save_dir=constants.CONCAT_ALL_PATH,
)

In [None]:
# One-hot encode the categories of `data_news`, passing CONCAT_ALL_PATH as the save directory.
# NOTE(review): this rebinds `cat_em`, replacing the value returned by embed_categories
# in the previous cell — confirm the overwrite is intended.
cat_em = one_hot_encode_categories(data_news, save_dir=constants.CONCAT_ALL_PATH)

# OpenAI Embedding

In [None]:
# Read the OpenAI API key from the environment rather than pasting it into the notebook,
# so the secret never ends up in the saved file or in version control.
# Falls back to the empty string (the previous placeholder value) when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# Embed the news through embed_news_openai, passing CONCAT_ALL_PATH as the save directory.
# NOTE(review): when the notebook runs top to bottom, `data_news` here is the frame loaded from
# "exp_news.pkl" (rebound in the Features section), not "emb_news.pkl" — confirm this is the intended input.
emb_map = embed_news_openai(data_news, save_dir=constants.CONCAT_ALL_PATH)

In [None]:
# Path of the OpenAI title-and-abstract embeddings map inside the "embeddings" folder.
openai_emb_map_path = os.path.join(
    constants.CONCAT_ALL_PATH,
    "embeddings",
    "title_and_abstract_openai_emb_map.pt",
)

# Load the map from disk; this rebinds `emb_map`.
emb_map = torch.load(openai_emb_map_path)