In [None]:
!pip install sentence-transformers

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [None]:
def extract_article_name(file):
    """Return ``file`` with its final extension stripped.

    Only the last suffix is removed (``"a.b.htm"`` becomes ``"a.b"``); any
    directory component in ``file`` is left in place.
    """
    stem, _extension = os.path.splitext(file)
    return stem

In [None]:
# Article names (file names minus their extension) for every entry in the
# plaintext-articles input directory.
articles = list(map(extract_article_name, os.listdir("../input/plaintext-articles")))

In [None]:
# Run on the GPU when PyTorch reports one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pretrained sentence encoder and move it to the selected device.
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

In [None]:
# Collect one record per article: its name plus the embedding of the text of
# the first <p> element found in its HTML page.
data = []
embeddings = []

# Build the lookup set once so each membership test in the loop is O(1)
# instead of a linear scan over the `articles` list.
known_articles = set(articles)

for root, dirs, files in os.walk("../input/wikispeedia-htmls"):
    # Removing the entry from `dirs` in place stops os.walk from descending
    # into any directory named 'index'.
    if 'index' in dirs:
        dirs.remove('index')

    for file in files:
        if not file.endswith('.htm'):
            continue

        article_name = extract_article_name(file)

        # Only embed pages that also exist in the plaintext article listing.
        if article_name not in known_articles:
            continue

        file_path = os.path.join(root, file)

        # Open the file exactly once per attempt: try UTF-8 first and fall
        # back to latin-1 (which accepts any byte sequence) on decode errors.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()

        soup = BeautifulSoup(content, 'lxml')

        # find() returns None when the page has no <p>; report and skip such
        # pages instead of aborting the whole run with an AttributeError.
        first_paragraph = soup.find('p')
        if first_paragraph is None:
            print(f"Skipping {file_path}: no <p> element found")
            continue

        text = first_paragraph.get_text(strip=False)

        # Store plain Python lists so the records are easy to serialize later.
        embedding = model.encode(text).tolist()
        embeddings.append(embedding)
        data.append({
            "article_name": article_name,
            "embedding": embedding,
        })

In [None]:
# reduced_embeddings = PCA(n_components=192).fit_transform(np.array(embeddings))

In [None]:
# for i, reduced_embedding in enumerate(reduced_embeddings):
#     data[i]["embedding"] = reduced_embedding.tolist()

In [None]:
# One row per embedded article, with columns "article_name" and "embedding".
df = pd.DataFrame(data)

In [None]:
# Preview the first few rows as a quick sanity check of the table.
df.head()

In [None]:
# Number of articles for which an embedding was recorded.
len(df)

In [None]:
# Write the table to the working directory without the index column.
# NOTE: each "embedding" value is a Python list, so the CSV stores its text
# form; a reader has to parse it back into numbers (e.g. with json.loads).
df.to_csv("article_embeddings.csv", index=False)