In [2]:
!pip install -U hopsworks --quiet

In [5]:
import os
from datetime import datetime

import hopsworks
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer

In [15]:
def get_headlines():
    """Fetch the latest headlines from the news endpoint and return unique titles.

    Sends a GET request to ``https://bbc-api.vercel.app/news?lang=english`` and
    collects the ``'title'`` of every article listed under the ``'Latest'`` key
    of the JSON response.

    Returns:
        list[str]: De-duplicated titles, in the order they first appear in the
        response (so repeated runs on the same payload yield the same order).

    Raises:
        requests.HTTPError: If the endpoint answers with an error status code.
        requests.RequestException: On connection failure or timeout.
        KeyError: If the payload has no ``'Latest'`` key or an article has no
            ``'title'`` key.
    """
    # A timeout prevents the notebook from hanging forever on a stalled server.
    response = requests.get('https://bbc-api.vercel.app/news?lang=english', timeout=30)
    # Surface HTTP errors here instead of as a confusing JSON/KeyError below.
    response.raise_for_status()
    news_data = response.json()
    # dict.fromkeys removes duplicates like a set would, but keeps first-seen order.
    return list(dict.fromkeys(article['title'] for article in news_data['Latest']))

def df_time_headlines(headlines_today):
    """Pair every headline with today's date in a two-column DataFrame.

    Args:
        headlines_today: Sequence of headline strings.

    Returns:
        pandas.DataFrame: One row per headline with the columns ``pubdate``
        (today's local date at midnight, identical for all rows) followed by
        ``title``.
    """
    # Formatting to 'YYYY-MM-DD' and parsing back drops the time-of-day part.
    today_stamp = pd.to_datetime(datetime.today().strftime('%Y-%m-%d'))
    titles_frame = pd.DataFrame(headlines_today, columns=["title"])
    # Add the date column, then select the columns in the required order.
    return titles_frame.assign(pubdate=today_stamp).loc[:, ["pubdate", "title"]]

# Download the current unique headline titles from the news endpoint.
headlines_today = get_headlines()
# Build a (pubdate, title) frame that stamps every headline with today's date.
today_df = df_time_headlines(headlines_today)
# Show the collected rows as a quick sanity check.
print(today_df)

     pubdate                                              title
0 2024-12-30              Can Ukraine face another year of war?
1 2024-12-30  2024: Relive the year through its most remarka...
2 2024-12-30  Video captures moments before South Korea plan...
3 2024-12-30  'It's unbearable': Families wait to see loved ...
4 2024-12-30  Frustrated families camp out at South Korea ai...
5 2024-12-30  Chlamydia could make koalas extinct. Can a vac...
6 2024-12-30  'I can't go on like this': US asks what's next...


In [24]:
# Fail fast, before loading the model: with no headlines there is nothing to
# embed, and building the frame below would otherwise fail with an unrelated
# shape error before this check was ever reached.
assert len(today_df) >= 1, 'No new headlines collected'

# Extract the titles once so the embeddings and the "headlines" column are
# guaranteed to come from the same list in the same order.
headline_titles = today_df['title'].tolist()

emb_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = emb_model.encode(headline_titles)

# Derive the number of dim_* columns from the embedding matrix itself instead
# of hard-coding 384, so swapping the model does not break frame construction.
df = pd.DataFrame(embeddings, columns=[f"dim_{i}" for i in range(embeddings.shape[1])])
df["headlines"] = headline_titles
df["date"] = today_df['pubdate'].tolist()
print(df)

2024-12-30 10:53:44,955 INFO: Use pytorch device_name: cpu
2024-12-30 10:53:44,956 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

      dim_0     dim_1     dim_2     dim_3     dim_4     dim_5     dim_6  \
0 -0.042341  0.015876  0.022307 -0.022998 -0.047326 -0.001631 -0.047772   
1 -0.007151  0.113358  0.064126 -0.022800  0.016063 -0.009702 -0.137604   
2  0.005106  0.004954  0.001836 -0.058283  0.198408  0.048843 -0.029198   
3  0.066357  0.041667  0.023121  0.029047  0.104442 -0.002813  0.052574   
4  0.053918  0.025673 -0.042712  0.038671  0.152683  0.041900  0.022052   
5 -0.031812  0.076123 -0.062426  0.000308 -0.006568  0.062150 -0.071240   
6  0.002277  0.028623 -0.007793 -0.006434  0.027018  0.046218  0.052001   

      dim_7     dim_8     dim_9  ...   dim_376   dim_377   dim_378   dim_379  \
0 -0.121304 -0.018053 -0.038186  ... -0.007596  0.047679 -0.040935  0.030172   
1 -0.054552 -0.002961 -0.013483  ...  0.005183  0.037771 -0.037603  0.068029   
2  0.073342  0.065216  0.027922  ...  0.105715 -0.044914  0.025257 -0.025875   
3 -0.026101 -0.001229 -0.014729  ... -0.007839  0.073078 -0.004648  0.094011   

In [20]:
# SECURITY: never hard-code the API key in the notebook. It is read from the
# HOPSWORKS_API_KEY environment variable instead. Any key that was previously
# committed in this cell must be treated as leaked and revoked/rotated.
# If the variable is unset, KEY is None; hopsworks.login is then expected to
# fall back to its own credential lookup (see the Hopsworks login docs).
KEY = os.environ.get("HOPSWORKS_API_KEY")
project = hopsworks.login(api_key_value=KEY)
fs = project.get_feature_store()

# Reuse the "headlinesemb" feature group if it already exists, otherwise
# create it; rows are keyed by the headline text.
feature_group = fs.get_or_create_feature_group(
    name="headlinesemb",
    version=1,
    description="Feature group for 384-dimensional vectors",
    primary_key=["headlines"],
    online_enabled=True,
)

# wait_for_job=True blocks until the materialization job has finished.
feature_group.insert(df, write_options={"wait_for_job": True})

Uploading Dataframe: 100.00% |██████████| Rows 7/7 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: headlinesemb_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1163414/jobs/named/headlinesemb_1_offline_fg_materialization/executions
2024-12-30 10:01:49,363 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-30 10:01:52,494 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-30 10:04:45,115 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-12-30 10:04:45,245 INFO: Waiting for log aggregation to finish.
2024-12-30 10:05:06,769 INFO: Execution finished successfully.


(Job('headlinesemb_1_offline_fg_materialization', 'SPARK'), None)