In [2]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell runs, so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os
# Put the parent of the current working directory on the import path so the
# `scripts` package below can be resolved. Must run before the project import.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# NOTE(review): NewsDataCollector is not referenced in the visible cells —
# confirm it is still needed.
from scripts.collect_articles import NewsDataCollector

from dotenv import load_dotenv
# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()


True

In [7]:
# Directory holding the input pickles and the output CSV used by the cells
# below; relative, so it resolves against the notebook's working directory.
DATA_DIR = '../data/stock_portfolios'

In [8]:
import pickle

# Load the article metadata mapping (later passed to extract_article_fields).
# NOTE(security): pickle.load can execute arbitrary code while unpickling —
# only load files that come from a trusted source.
with open(f'{DATA_DIR}/portfolio_articles.pkl', 'rb') as f:
    article_dict = pickle.load(f)

# Load the per-article full-context mapping (later passed to
# articles_to_dataframe, which reads 'summary' and 'text' from each value).
with open(f'{DATA_DIR}/article_id_full_context.pkl', 'rb') as f:
    article_full_context = pickle.load(f)


In [9]:
import pandas as pd
from typing import Dict, Any

def extract_article_fields(article_dict: Dict[str, Any]) -> pd.DataFrame:
    """
    Flatten a mapping of article metadata into a table of selected fields.

    Args:
        article_dict (Dict[str, Any]): Maps each article_id to a dict holding
            that article's metadata.

    Returns:
        pd.DataFrame: One row per article, in the mapping's iteration order,
            with a fixed set of columns. ``article_id`` is taken from the
            mapping key. A list-valued ``creator`` is joined with ' | ', a
            ``None`` creator becomes '' and any other creator is stringified.
            Every other column is copied as-is and defaults to '' when the
            key is absent.
    """
    columns = [
        'article_id', 'pubDate', 'pubDateTZ', 'title', 'link', 'creator',
        'description', 'source_id', 'source_name', 'source_url', 'source_icon'
    ]

    def creator_text(raw):
        # Several creators collapse into a single delimited string.
        if isinstance(raw, list):
            return ' | '.join(map(str, raw))
        return '' if raw is None else str(raw)

    def cell_value(key, meta, column):
        # The id lives in the mapping key, not inside the metadata dict.
        if column == 'article_id':
            return key
        if column == 'creator':
            return creator_text(meta.get(column, []))
        return meta.get(column, '')

    rows = [
        [cell_value(key, meta, column) for column in columns]
        for key, meta in article_dict.items()
    ]
    return pd.DataFrame(rows, columns=columns)

In [10]:
# Build the per-article metadata table from the loaded mapping.
article_df = extract_article_fields(article_dict)

In [11]:
import pandas as pd
from typing import Dict, Any

def articles_to_dataframe(article_dict: Dict[str, Any]) -> pd.DataFrame:
    """
    Convert a dictionary of articles into a pandas DataFrame.

    Args:
        article_dict (Dict[str, Any]): Dictionary mapping article IDs to a dict
            of their information.

    Returns:
        pd.DataFrame: One row per article with columns ``article_id``,
            ``summary`` and ``text`` (in that order). A missing 'summary' or
            'text' key defaults to ''. An empty input yields an empty frame
            that still carries these three columns.
    """
    columns = ['article_id', 'summary', 'text']
    # 'title' is deliberately not extracted (it was commented out originally).
    # NOTE(review): presumably to avoid a duplicate title column when this
    # frame is merged with the metadata frame — confirm.
    records = [
        {
            'article_id': article_id,
            'summary': info.get('summary', ''),
            'text': info.get('text', ''),
        }
        for article_id, info in article_dict.items()
    ]
    # Pass the columns explicitly: without them an empty input produces a
    # column-less frame, and a later merge on 'article_id' raises KeyError.
    return pd.DataFrame(records, columns=columns)

In [12]:
# Build the summary/text table from the loaded full-context mapping.
full_context_df = articles_to_dataframe(article_full_context)

In [13]:
# Attach summary/text to the article metadata. The left join keeps every
# metadata row even when no full-context entry exists for that article_id.
final_df = article_df.merge(full_context_df, on='article_id', how='left')


In [14]:
# Persist the merged table as CSV; index=False leaves the DataFrame index
# out of the file.
final_df.to_csv(f'{DATA_DIR}/articles.csv',index=False)