In [1]:
from dotenv import load_dotenv
from loguru import logger
import sys

# Add the parent directory to the import path before importing the project package.
# NOTE(review): assumes `news_extraction_pipeline` lives one level above this
# notebook's working directory — confirm.
sys.path.append("..")

from news_extraction_pipeline.config import AINewsConfig
from news_extraction_pipeline.schemas import PipelineArgs
# The individual pipeline stages, each called in its own cell below.
from news_extraction_pipeline.pipeline_steps import (
    extract_from_multiple_feed_urls,
    filter_by_date_threshold,
    filter_by_keywords,
    convert_datetime_columns_to_str
)

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

True

In [2]:
# Both objects are built with no arguments. Later cells read feed URLs, column
# names and the date string format from `news_config`, and the age limit and
# keyword lists from `pipeline_args`.
news_config = AINewsConfig()
pipeline_args = PipelineArgs()

In [3]:
# NOTE(review): looks like a regular category page rather than an RSS feed,
# presumably included to see how extraction copes with an unusable source — confirm.
bad_url = "https://techcrunch.com/category/artificial-intelligence/"

In [4]:
logger.info("Starting AI news retrieval process...")

# Explicit source list for this run: `bad_url` first, then the two feeds taken
# from the config.
# NOTE(review): presumably this verifies that one bad source does not abort the
# whole extraction — confirm.
# To run against every configured feed instead, collect the fields of
# `news_config.model_dump()` whose name ends in "_FEED_URL".
news_sources = [bad_url, news_config.MIT_NEWS_FEED_URL, news_config.AI_NEWS_FEED_URL]

articles = extract_from_multiple_feed_urls(news_sources)

[32m2025-10-09 18:31:57.335[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStarting AI news retrieval process...[0m
[32m2025-10-09 18:31:57.345[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://techcrunch.com/category/artificial-intelligence/[0m
[32m2025-10-09 18:31:57.351[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://news.mit.edu/rss/feed[0m
[32m2025-10-09 18:31:57.351[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://www.artificialintelligence-news.com/artificial-intelligence-news/feed/[0m
[32m2025-10-09 18:31:57.356[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.selectors.ext

In [5]:
# Summary of the extracted frame: row count, columns, non-null counts and dtypes.
articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   title         62 non-null     object             
 1   news_link     62 non-null     object             
 2   image_link    62 non-null     object             
 3   publish_date  62 non-null     datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 2.4+ KB


In [6]:
# Preview a random handful of rows. The fixed seed keeps the preview stable
# across re-runs, and capping the size avoids a ValueError when fewer than
# 10 articles were extracted.
articles.sample(n=min(10, len(articles)), random_state=42)

Unnamed: 0,title,news_link,image_link,publish_date
46,MIT’s work with Idaho National Laboratory adva...,https://news.mit.edu/2025/mit-work-with-idaho-...,https://news.mit.edu/sites/default/files/style...,2025-09-23 13:00:00+00:00
3,"Riccardo Comin, two MIT alumni named 2025 Moor...",https://news.mit.edu/2025/riccardo-comin-mit-a...,https://news.mit.edu/sites/default/files/style...,2025-10-08 19:00:00+00:00
39,AI system learns from many types of scientific...,https://news.mit.edu/2025/ai-system-learns-man...,https://news.mit.edu/sites/default/files/style...,2025-09-25 15:00:00+00:00
20,Martin Trust Center for MIT Entrepreneurship w...,https://news.mit.edu/2025/martin-trust-center-...,https://news.mit.edu/sites/default/files/style...,2025-10-02 19:55:00+00:00
17,AI maps how a new antibiotic targets gut bacteria,https://news.mit.edu/2025/ai-maps-how-new-anti...,https://news.mit.edu/sites/default/files/style...,2025-10-03 21:00:00+00:00
9,Laurent Demanet appointed co-director of MIT C...,https://news.mit.edu/2025/laurent-demanet-co-d...,https://news.mit.edu/sites/default/files/style...,2025-10-07 21:00:00+00:00
5,Using generative AI to diversify virtual train...,https://news.mit.edu/2025/using-generative-ai-...,https://news.mit.edu/sites/default/files/style...,2025-10-08 17:45:00+00:00
2,MIT Schwarzman College of Computing and MBZUAI...,https://news.mit.edu/2025/mit-schwarzman-colle...,https://news.mit.edu/sites/default/files/style...,2025-10-08 19:10:00+00:00
7,Uncovering new physics in metals manufacturing,https://news.mit.edu/2025/uncovering-new-physi...,https://news.mit.edu/sites/default/files/style...,2025-10-08 09:00:00+00:00
15,Report: Sustainability in supply chains is sti...,https://news.mit.edu/2025/report-sustainabilit...,https://news.mit.edu/sites/default/files/style...,2025-10-06 12:00:00+00:00


In [7]:
# Date filter: restrict `articles` using the configured date column and the
# maximum article age from the pipeline arguments.
filtered_by_date = filter_by_date_threshold(
    df=articles,
    filter_column=news_config.DATE_COLUMN,
    max_days_old=pipeline_args.max_days_old,
)


[32m2025-10-09 18:32:31.364[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_date_threshold[0m:[36m162[0m - [1mFiltering articles published within the last 2 days.[0m
[32m2025-10-09 18:32:31.369[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_date_threshold[0m:[36m175[0m - [1mDate filtering complete. 15 articles published within the allowed range.[0m


In [8]:
# Keyword filter on the date-filtered frame, using both the case-sensitive and
# the case-insensitive keyword lists from the pipeline arguments.
filtered_by_kw = filter_by_keywords(
    df=filtered_by_date,
    filter_column=news_config.COLUMN_TO_FILTER_BY_KW,
    case_sen_search_kw=pipeline_args.case_sen_search_kw,
    case_insen_search_kw=pipeline_args.case_insen_search_kw,
)

[32m2025-10-09 18:32:31.384[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m107[0m - [34m[1mFiltering articles by the next parameters...[0m
[32m2025-10-09 18:32:31.386[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m108[0m - [34m[1mcase_sen_search_kw =[' AI ', 'AI ', 'AI ', 'A.I.', ' AI-', 'AI-'][0m
[32m2025-10-09 18:32:31.387[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m109[0m - [34m[1mcase_insen_search_kw =['Artificial Intelligence', 'Machine Learning', 'Deep Learning', 'Neural Networks', 'NLP', 'Computer Vision', 'Data Science', 'Gemini', 'Bard', 'ChatGPT', 'GPT-4', 'DALL-E', 'MidJourney', 'Stable Diffusion', 'Claude', 'Whisper'][0m
[32m2025-10-09 18:32:31.388[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m110[

In [9]:
# Render datetime columns as strings using the configured format, then inspect
# the resulting dtypes.
final_df = convert_datetime_columns_to_str(
    df=filtered_by_kw,
    string_format=news_config.DATE_STRING_FORMAT,
)

final_df.info()

[32m2025-10-09 18:32:31.407[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m214[0m - [1mConverting datetime columns to string format: %Y-%m-%dT%H:%M:%SZ[0m
[32m2025-10-09 18:32:31.407[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m215[0m - [34m[1mDetected datetime columns: ['publish_date'][0m
[32m2025-10-09 18:32:31.407[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m220[0m - [1mDatetime conversion complete.[0m


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         7 non-null      object
 1   news_link     7 non-null      object
 2   image_link    7 non-null      object
 3   publish_date  7 non-null      object
dtypes: object(4)
memory usage: 356.0+ bytes


## Full pipeline in a single call

In [None]:
# NOTE(review): repeats the sys.path setup from the first cell, presumably so this
# section can be run on its own from a fresh kernel — confirm.
import sys

sys.path.append("..")

from news_extraction_pipeline.pipeline import main

# Run the packaged pipeline entry point with no arguments and keep its result.
final_articles = main()

[32m2025-10-09 18:37:29.359[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline[0m:[36mmain[0m:[36m34[0m - [1mStarting AI news retrieval process...[0m
[32m2025-10-09 18:37:29.359[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://news.mit.edu/rss/feed[0m
[32m2025-10-09 18:37:29.359[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://www.artificialintelligence-news.com/artificial-intelligence-news/feed/[0m
[32m2025-10-09 18:37:29.359[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.selectors.extractor_selector[0m:[36mget_extractor[0m:[36m73[0m - [1mExtractor 'MITImageExtractor' selected for base URL: https://news.mit.edu[0m
[32m2025-10-09 18:37:29.384[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.selector

In [3]:
# Summary of the frame returned by `main()`: row count, columns and dtypes.
final_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         7 non-null      object
 1   news_link     7 non-null      object
 2   image_link    7 non-null      object
 3   publish_date  7 non-null      object
dtypes: object(4)
memory usage: 356.0+ bytes
