In [1]:
from dotenv import load_dotenv
from loguru import logger
import sys

# Put the parent directory on the import path so the project packages imported
# below can be found. The ".." is relative to the kernel's working directory
# (presumably the notebook's own folder — confirm if run from elsewhere).
sys.path.append("..")

from news_extraction_pipeline.config import AINewsConfig
from news_extraction_pipeline.pipeline_steps import (
    extract_from_multiple_feed_urls,
    filter_by_date_threshold,
    filter_by_keywords,
    convert_datetime_columns_to_str
)
from utils.io_utils import store_df_to_excel

# Read variables from a .env file into the process environment.
# NOTE(review): presumably AINewsConfig picks up its settings from there — confirm.
load_dotenv()

True

In [2]:
# Single settings object for the run: the cells below read feed URLs, column
# names, keyword lists, the date string format and the Excel output location from it.
news_config = AINewsConfig()

In [4]:
# A site category page, passed to the extractor alongside the configured feeds.
# NOTE(review): presumably this is not a valid RSS/Atom feed and is here to
# exercise how extraction copes with a bad source — confirm.
bad_url = "https://techcrunch.com/category/artificial-intelligence/"

In [5]:
logger.info("Starting AI news retrieval process...")

# Sources for this run: the suspect URL plus two feeds from the config.
# This explicit list is the only one the extraction uses. To run against every
# configured feed instead, collect the values of the `*_FEED_URL` fields from
# `news_config.model_dump()`.
news_sources = [
    bad_url,
    news_config.MIT_NEWS_FEED_URL,
    news_config.AI_NEWS_FEED_URL,
]

# One combined DataFrame with the articles gathered from all sources.
articles = extract_from_multiple_feed_urls(news_sources)

[32m2025-10-08 23:46:05.933[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStarting AI news retrieval process...[0m
[32m2025-10-08 23:46:05.935[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m82[0m - [1mSetting current feed url to https://techcrunch.com/category/artificial-intelligence/[0m
[32m2025-10-08 23:46:05.938[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m82[0m - [1mSetting current feed url to https://news.mit.edu/rss/feed[0m
[32m2025-10-08 23:46:05.940[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m82[0m - [1mSetting current feed url to https://www.artificialintelligence-news.com/artificial-intelligence-news/feed/[0m
[32m2025-10-08 23:46:05.941[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.selectors.ext

In [5]:
# Structural summary of the extracted frame: row count, columns, non-null counts and dtypes.
articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   title         62 non-null     object             
 1   news_link     62 non-null     object             
 2   image_link    57 non-null     object             
 3   publish_date  62 non-null     datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 2.4+ KB


In [6]:
# Spot-check up to 10 random rows.
# - `random_state` pins the draw so Restart & Run All shows the same rows.
# - `min(...)` keeps the cell from raising when fewer than 10 articles were
#   extracted (sampling without replacement cannot exceed the row count).
articles.sample(n=min(10, len(articles)), random_state=42)

Unnamed: 0,title,news_link,image_link,publish_date
16,AI maps how a new antibiotic targets gut bacteria,https://news.mit.edu/2025/ai-maps-how-new-anti...,https://news.mit.edu/sites/default/files/style...,2025-10-03 21:00:00+00:00
45,MIT’s work with Idaho National Laboratory adva...,https://news.mit.edu/2025/mit-work-with-idaho-...,https://news.mit.edu/sites/default/files/style...,2025-09-23 13:00:00+00:00
19,Martin Trust Center for MIT Entrepreneurship w...,https://news.mit.edu/2025/martin-trust-center-...,https://news.mit.edu/sites/default/files/style...,2025-10-02 19:55:00+00:00
33,A beacon of light,https://news.mit.edu/2025/beacon-of-light-geol...,https://news.mit.edu/sites/default/files/style...,2025-09-29 20:00:00+00:00
35,How the brain splits up vision without you eve...,https://news.mit.edu/2025/how-brain-splits-vis...,https://news.mit.edu/sites/default/files/style...,2025-09-26 19:50:00+00:00
2,"Riccardo Comin, two MIT alumni named 2025 Moor...",https://news.mit.edu/2025/riccardo-comin-mit-a...,https://news.mit.edu/sites/default/files/style...,2025-10-08 19:00:00+00:00
3,How AI is changing the way we travel,https://www.artificialintelligence-news.com/ne...,https://www.artificialintelligence-news.com/wp...,2025-10-07 11:00:00+00:00
6,China Mobile Shanghai launches industry-first ...,https://www.artificialintelligence-news.com/ne...,https://www.artificialintelligence-news.com/wp...,2025-10-03 09:00:00+00:00
4,Using generative AI to diversify virtual train...,https://news.mit.edu/2025/using-generative-ai-...,https://news.mit.edu/sites/default/files/style...,2025-10-08 17:45:00+00:00
21,A simple formula could guide the design of fas...,https://news.mit.edu/2025/simple-formula-could...,https://news.mit.edu/sites/default/files/style...,2025-10-02 18:00:00+00:00


In [6]:
# Recency filter: keep articles published within the last 10 days, judged on
# the date column named in the config.
filtered_by_date = filter_by_date_threshold(
    max_days_old=10,
    filter_column=news_config.DATE_COLUMN,
    df=articles,
)


[32m2025-10-08 23:47:18.702[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_date_threshold[0m:[36m162[0m - [1mFiltering articles published within the last 10 days.[0m
[32m2025-10-08 23:47:18.705[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_date_threshold[0m:[36m175[0m - [1mDate filtering complete. 47 articles published within the allowed range.[0m


In [7]:
# Keyword filter on the recent articles, using the case-sensitive and
# case-insensitive keyword lists from the config.
# NOTE(review): the exact matching rule lives in filter_by_keywords — presumably
# a row is kept when any keyword appears in the filtered column; confirm.
filtered_by_kw = filter_by_keywords(
    case_insen_search_kw=news_config.CASE_INSEN_SEARCH_KW,
    case_sen_search_kw=news_config.CASE_SEN_SEARCH_KW,
    filter_column=news_config.COLUMN_TO_FILTER_BY_KW,
    df=filtered_by_date,
)

[32m2025-10-08 23:47:21.154[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m107[0m - [34m[1mFiltering articles by the next parameters...[0m
[32m2025-10-08 23:47:21.155[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m108[0m - [34m[1mcase_sen_search_kw =[' AI ', 'AI ', 'AI ', 'A.I.', ' AI-', 'AI-'][0m
[32m2025-10-08 23:47:21.156[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m109[0m - [34m[1mcase_insen_search_kw =['Artificial Intelligence', 'Machine Learning', 'Deep Learning', 'Neural Networks', 'NLP', 'Computer Vision', 'Data Science', 'Gemini', 'Bard', 'ChatGPT', 'GPT-4', 'DALL-E', 'MidJourney', 'Stable Diffusion', 'Claude', 'LLaMA', 'Whisper'][0m
[32m2025-10-08 23:47:21.157[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:

In [9]:
# Render datetime columns as strings in the configured format so the frame
# holds plain text values before it is exported.
final_df = convert_datetime_columns_to_str(
    string_format=news_config.DATE_STRING_FORMAT,
    df=filtered_by_kw,
)

# Confirm the result: after conversion every column should show as `object`.
final_df.info()

[32m2025-10-08 23:47:33.663[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m214[0m - [1mConverting datetime columns to string format: %Y-%m-%dT%H:%M:%SZ[0m
[32m2025-10-08 23:47:33.664[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m215[0m - [34m[1mDetected datetime columns: ['publish_date'][0m
[32m2025-10-08 23:47:33.665[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m220[0m - [1mDatetime conversion complete.[0m


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         14 non-null     object
 1   news_link     14 non-null     object
 2   image_link    14 non-null     object
 3   publish_date  14 non-null     object
dtypes: object(4)
memory usage: 580.0+ bytes


In [10]:
# Persist the final frame to the Excel workbook named in the config, passing
# the configured sheet and table names through to the writer.
store_df_to_excel(
    table_name=news_config.EXCEL_TABLE_NAME,
    sheet_name=news_config.EXCEL_SHEET_NAME,
    local_file_path=news_config.FILE_PATH,
    df=final_df,
)