# NYT Article Text to OpenAI Embeddings

In [None]:
import pandas as pd
import numpy as np
import glob
import tiktoken
import json
import re
import os
from typing import Union, List

from joblib import Parallel, delayed
from itertools import product

from scipy.spatial.distance import cdist

import gzip

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type

# Model name passed to openai.Embedding.create when requesting embeddings.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Name of the tiktoken encoding used to tokenize article text.
EMBEDDING_ENCODING = "cl100k_base"
# Token lists are cut to at most this many tokens when truncation is requested
# (presumably the embedding model's input limit -- confirm against the API docs).
MAX_TOKENS = 8191

In [None]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))
def fetch_embedding(input_: Union[str, List[int]]) -> List[float]:
    """Request the embedding of ``input_`` from the OpenAI embeddings endpoint.

    Args:
        input_: Either raw text or a list of token ids.

    Returns:
        The embedding vector of the first (and only) item in the response.

    Raises:
        openai.InvalidRequestError: Propagated immediately; the retry policy
            deliberately excludes it.
        tenacity.RetryError: When all 6 attempts fail, including attempts that
            were rejected because the returned embedding contained NaN.
    """
    embedding = openai.Embedding.create(input=input_, model=EMBEDDING_MODEL)["data"][0]["embedding"]
    # The API occasionally answers with a NaN embedding. Raising here hands the
    # request back to the retry decorator instead of silently returning a
    # vector that would poison every distance computed from it.
    if np.isnan(np.asarray(embedding, dtype=float)).any():
        raise ValueError("OpenAI returned an embedding containing NaN")
    return embedding

# Tokenizer shared by every fetch_encoding call; created once up front.
encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)


def fetch_encoding(text: str, truncate: bool = False) -> List[int]:
    """Tokenize ``text`` with the module-level tiktoken encoding.

    Args:
        text: The string to tokenize.
        truncate: When true, keep only the first ``MAX_TOKENS`` tokens.

    Returns:
        The token ids for ``text``, optionally truncated.
    """
    token_ids = encoding.encode(text)
    return token_ids[:MAX_TOKENS] if truncate else token_ids

In [None]:
# Sort the matches: glob returns paths in arbitrary, filesystem-dependent
# order, and the row order of everything derived from this list (the article
# table and the saved distance matrices) follows it.
all_files = sorted(glob.glob('ScrappedData/nytArticles/articleText/*.json'))

In [None]:
# Load every article JSON file into one DataFrame indexed by article id.
all_articles = []
for filename in all_files:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    # The file name without its extension serves as the article id.
    json_data['id'] = os.path.splitext(os.path.basename(filename))[0]
    # Skip records that carry no article text.
    if json_data['article'] is None:
        continue
    # Flatten the text onto a single line.
    json_data['article'] = json_data['article'].replace("\n", " ")
    all_articles.append(json_data)

# Use the id as the index and discard the 'summary' column.
all_articles_df = pd.DataFrame(all_articles).set_index('id').drop(columns=['summary'])
article_text = all_articles_df['article']

In [None]:
def _count_tokens(text):
    """Return the number of tokens the full, untruncated text encodes to."""
    return len(fetch_encoding(text))


all_articles_df["n_tokens"] = all_articles_df['article'].apply(_count_tokens)

In [None]:
# Warning: the OpenAI API occasionally returns [nan] as the embedding.
# When that happens, simply recompute the affected entry.

def _embed_article(text):
    """Tokenize the text (truncated to MAX_TOKENS) and fetch its embedding."""
    token_ids = fetch_encoding(text, truncate=True)
    return fetch_embedding(token_ids)


all_articles_df["embedding"] = all_articles_df['article'].apply(_embed_article)

In [None]:
# Write the article DataFrame to disk as CSV and as gzip-compressed parquet.
# To leave the raw text out of the exports, drop the 'article' column first:
#     all_articles_df = all_articles_df.drop('article', axis=1)
csv_path = "EmbeddingData/articles_with_embeddings.csv"
parquet_path = 'EmbeddingData/articles_with_embeddings.parquet.gzip'
all_articles_df.to_csv(csv_path)
all_articles_df.to_parquet(parquet_path, compression='gzip')

# Compute Euclidean Distances for Embedding Vectors

In [None]:
# One row per article, one column per embedding dimension.
embedding_rows = all_articles_df['embedding'].to_list()
vector_df = pd.DataFrame(embedding_rows)

# Pairwise Euclidean distance between every pair of article embeddings.
embeddings_distance_matrix = cdist(vector_df, vector_df, metric='euclidean')

distance_matrix_path = 'EmbeddingData/embeddings_distance_matrix.npy'
np.save(distance_matrix_path, embeddings_distance_matrix)

# Alternative Distance Measure Based on gzip Compression

In [None]:
def gzip_distance(text1: str, text2: str) -> float:
    """Compression-based (information-theoretic) distance between two strings.

    Follows https://arxiv.org/abs/2212.09410: the gzip-compressed size of the
    two texts joined by a single space, minus the smaller of the individual
    compressed sizes, divided by the larger of the individual compressed sizes.

    Note that this is not symmetric in its arguments, and the distance of a
    text to itself is not zero.
    """
    def compressed_size(text: str) -> int:
        return len(gzip.compress(text.encode()))

    size1 = compressed_size(text1)
    size2 = compressed_size(text2)
    joint_size = compressed_size(f'{text1} {text2}')

    smaller, larger = sorted((size1, size2))
    return (joint_size - smaller) / larger

In [None]:
articles = article_text.to_list()

# One job per ordered pair of articles, spread over all available cores.
# The pairs are generated in row-major order, so the flat result at position
# i * len(articles) + j is gzip_distance(articles[i], articles[j]).
pairwise_jobs = (
    delayed(gzip_distance)(first_text, second_text)
    for first_text, second_text in product(articles, repeat=2)
)
gzip_distance_array = Parallel(n_jobs=-1)(pairwise_jobs)

In [None]:
# Fold the flat list of pairwise distances into a square matrix with one row
# and one column per article, then store it on disk.
n_articles = len(articles)
gzip_distance_matrix = np.array(gzip_distance_array).reshape((n_articles, n_articles))
np.save('EmbeddingData/gzip_distance_matrix.npy', gzip_distance_matrix)