In [1]:
from dotenv import load_dotenv
from loguru import logger
import sys

# Make the parent directory importable so the project packages below resolve.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

from news_extraction_pipeline.config import AINewsConfig
from news_extraction_pipeline.schemas import PipelineArgs
from news_extraction_pipeline.pipeline_steps import (
    extract_from_multiple_feed_urls,
    filter_by_date_threshold,
    filter_by_keywords,
    convert_datetime_columns_to_str,
    store_in_database
)

# Load environment variables from a .env file (python-dotenv).
# As the last expression of the cell, the call's return value is displayed as the cell output.
load_dotenv()

E0000 00:00:1761151182.222679   23735 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


True

In [2]:
# Build both settings objects without passing any explicit overrides.
# news_config supplies the feed URLs, column names and date string format used below;
# pipeline_args supplies max_days_old and the keyword lists used by the filters.
news_config = AINewsConfig()
pipeline_args = PipelineArgs()

In [3]:
# A TechCrunch category page, passed to the extractor alongside two configured feeds below.
# NOTE(review): presumably included to check how extraction copes with a URL that is
# not an RSS feed — confirm the intent.
bad_url = "https://techcrunch.com/category/artificial-intelligence/"

In [4]:
logger.info("Starting AI news retrieval process...")

# Every feed URL declared in the config (fields whose name ends with "_FEED_URL").
# Kept under its own name for reference: previously this list was assigned to
# `news_sources` and then immediately overwritten, which made the cell misleading.
news_config_dict = news_config.model_dump()
configured_feed_urls = [val for key, val in news_config_dict.items() if key.endswith("_FEED_URL")]

# Sources actually used in this run: `bad_url` first, followed by two configured feeds.
news_sources = [bad_url, news_config.MIT_NEWS_FEED_URL, news_config.AI_NEWS_FEED_URL]

# Extract the articles from all selected sources into a single DataFrame.
articles = extract_from_multiple_feed_urls(news_sources)

[32m2025-10-22 16:39:42.291[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStarting AI news retrieval process...[0m
[32m2025-10-22 16:39:42.293[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://techcrunch.com/category/artificial-intelligence/[0m
[32m2025-10-22 16:39:42.296[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://news.mit.edu/rss/feed[0m
[32m2025-10-22 16:39:42.298[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://www.artificialintelligence-news.com/artificial-intelligence-news/feed/[0m
[32m2025-10-22 16:39:42.300[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractor_sel

In [5]:
# Summarise the extracted frame: index range, columns, non-null counts and dtypes.
articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   title         62 non-null     object             
 1   news_link     62 non-null     object             
 2   image_link    62 non-null     object             
 3   publish_date  62 non-null     datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(3)
memory usage: 2.4+ KB


In [6]:
# Peek at up to 10 random rows.
# - Capping n at len(articles) avoids the ValueError that `sample` raises when asked
#   for more rows than exist (sampling without replacement).
# - A fixed random_state keeps the displayed rows stable across re-runs.
articles.sample(n=min(10, len(articles)), random_state=42)

Unnamed: 0,title,news_link,image_link,publish_date
16,Book reviews technologies aiming to remove car...,https://news.mit.edu/2025/book-reviews-tech-ai...,https://news.mit.edu/sites/default/files/style...,2025-10-16 20:35:00+00:00
31,MIT releases financials and endowment figures ...,https://news.mit.edu/2025/mit-releases-financi...,https://news.mit.edu/sites/default/files/style...,2025-10-10 20:00:00+00:00
8,South Korea scraps AI textbook programme,https://www.artificialintelligence-news.com/ne...,https://www.artificialintelligence-news.com/wp...,2025-10-17 12:01:59+00:00
49,System lets people personalize online social s...,https://news.mit.edu/2025/system-lets-people-p...,https://news.mit.edu/sites/default/files/style...,2025-10-01 14:00:00+00:00
38,Laurent Demanet appointed co-director of MIT C...,https://news.mit.edu/2025/laurent-demanet-co-d...,https://news.mit.edu/sites/default/files/style...,2025-10-07 21:00:00+00:00
5,China’s generative AI user base doubles to 515...,https://www.artificialintelligence-news.com/ne...,https://www.artificialintelligence-news.com/wp...,2025-10-21 10:00:00+00:00
24,Engineering next-generation fertilizers,https://news.mit.edu/2025/giorgio-rizzo-engine...,https://news.mit.edu/sites/default/files/style...,2025-10-14 20:50:00+00:00
47,"Palladium filters could enable cheaper, more e...",https://news.mit.edu/2025/palladium-filters-co...,https://news.mit.edu/sites/default/files/style...,2025-10-01 18:00:00+00:00
2,How AI adoption is moving IT operations from r...,https://www.artificialintelligence-news.com/ne...,https://www.artificialintelligence-news.com/wp...,2025-10-21 13:59:14+00:00
17,Breaking the old model of education with MIT O...,https://news.mit.edu/2025/breaking-old-model-e...,https://news.mit.edu/sites/default/files/style...,2025-10-16 19:15:00+00:00


In [7]:
# Keep only the articles published within the configured number of days.
date_column = news_config.DATE_COLUMN
max_age_days = pipeline_args.max_days_old
filtered_by_date = filter_by_date_threshold(
    df=articles,
    filter_column=date_column,
    max_days_old=max_age_days,
)


[32m2025-10-22 16:40:04.508[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_date_threshold[0m:[36m165[0m - [1mFiltering articles published within the last 2 days.[0m
[32m2025-10-22 16:40:04.513[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_date_threshold[0m:[36m178[0m - [1mDate filtering complete. 17 articles published within the allowed range.[0m


In [8]:
# Narrow the date-filtered articles using the configured keyword lists
# (one case-sensitive list and one case-insensitive list).
keyword_filter_options = {
    "filter_column": news_config.COLUMN_TO_FILTER_BY_KW,
    "case_sen_search_kw": pipeline_args.case_sen_search_kw,
    "case_insen_search_kw": pipeline_args.case_insen_search_kw,
}
filtered_by_kw = filter_by_keywords(df=filtered_by_date, **keyword_filter_options)

[32m2025-10-22 16:40:04.525[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m110[0m - [34m[1mFiltering articles by the next parameters...[0m
[32m2025-10-22 16:40:04.526[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m111[0m - [34m[1mcase_sen_search_kw =[' AI ', 'AI ', 'AI ', 'A.I.', ' AI-', 'AI-'][0m
[32m2025-10-22 16:40:04.528[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m112[0m - [34m[1mcase_insen_search_kw =['Artificial Intelligence', 'Machine Learning', 'Deep Learning', 'Neural Networks', 'NLP', 'Computer Vision', 'Data Science', 'Gemini', 'Bard', 'ChatGPT', 'GPT-4', 'DALL-E', 'MidJourney', 'Stable Diffusion', 'Claude', 'Whisper'][0m
[32m2025-10-22 16:40:04.528[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mfilter_by_keywords[0m:[36m113[

Store data in the database

In [9]:
# Persist the keyword-filtered articles.
# NOTE(review): this writes to an external database every time the cell runs —
# check how re-runs / duplicate rows are handled before re-executing.
store_in_database(filtered_by_kw)

[32m2025-10-22 16:40:13.362[0m | [1mINFO    [0m | [36mdatabase.tables.bigquery.news_metadata[0m:[36madd_rows[0m:[36m132[0m - [1mInserting 7 new rows into BigQuery table news_metadata[0m
[32m2025-10-22 16:40:14.107[0m | [1mINFO    [0m | [36mutils.gcp.bigquery[0m:[36minsert_rows[0m:[36m269[0m - [1mRows inserted into news_metadata.[0m


This step prepares the data to be returned by the API by converting the datetime columns to strings.

In [9]:
# Convert the datetime columns to strings using the configured format.
date_string_format = news_config.DATE_STRING_FORMAT
final_df = convert_datetime_columns_to_str(df=filtered_by_kw, string_format=date_string_format)

# Show the resulting columns and dtypes.
final_df.info()

[32m2025-10-09 20:09:14.407[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m214[0m - [1mConverting datetime columns to string format: %Y-%m-%dT%H:%M:%SZ[0m
[32m2025-10-09 20:09:14.414[0m | [34m[1mDEBUG   [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m215[0m - [34m[1mDetected datetime columns: ['publish_date'][0m
[32m2025-10-09 20:09:14.416[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline_steps[0m:[36mconvert_datetime_columns_to_str[0m:[36m220[0m - [1mDatetime conversion complete.[0m


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         7 non-null      object
 1   news_link     7 non-null      object
 2   image_link    7 non-null      object
 3   publish_date  7 non-null      object
dtypes: object(4)
memory usage: 356.0+ bytes


## Full pipeline in a single call

In [10]:
# The path setup is repeated here so this cell does not rely on the first cell
# having been run. NOTE(review): confirm that `main` needs no other prior setup.
import sys

# Guarded: only add the parent directory if it is not already on sys.path,
# so re-running (or running after the first cell) does not add duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

from news_extraction_pipeline.pipeline import main

# Run the whole pipeline through its single entry point and keep the returned frame.
final_articles = main()

[32m2025-10-22 17:11:11.434[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.pipeline[0m:[36mmain[0m:[36m35[0m - [1mStarting AI news retrieval process...[0m
[32m2025-10-22 17:11:11.438[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://news.mit.edu/rss/feed[0m
[32m2025-10-22 17:11:11.443[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractor_selectors.extractor_selector[0m:[36mget_extractor[0m:[36m73[0m - [1mExtractor 'MITImageExtractor' selected for base URL: https://news.mit.edu[0m
[32m2025-10-22 17:11:11.443[0m | [1mINFO    [0m | [36mnews_extraction_pipeline.extractors.news.news_extractors[0m:[36mcurrent_feed_url[0m:[36m64[0m - [1mSetting current feed url to https://www.artificialintelligence-news.com/artificial-intelligence-news/feed/[0m
[32m2025-10-22 17:11:11.450[0m | [1mINFO    [0m | [36mnews_extraction_pipelin

In [11]:
# Summarise the frame returned by `main()`: index range, columns, non-null counts and dtypes.
final_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         7 non-null      object
 1   news_link     7 non-null      object
 2   image_link    7 non-null      object
 3   publish_date  7 non-null      object
dtypes: object(4)
memory usage: 356.0+ bytes
