In [6]:
import feedparser
import requests
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict, Set
import re
import time

class KeywordFilteredRSSParser:
    def __init__(self, rss_url: str = "https://www.cell.com/neuron/inpress.rss"):
        self.rss_url = rss_url
        self.articles_file = "nature_neuroscience_articles.json"
        self.relevant_articles_file = "relevant_articles.json"
        
        # Default keywords - you can modify these
        self.keywords = {
            'imaging': ['eeg', 'fmri', 'imaging', 'mri', 'pet scan', 'neuroimaging', 
                       'functional magnetic resonance', 'electroencephalography', 
                       'magnetoencephalography', 'meg', 'bold signal', 'diffusion tensor',
                       'dti', 'functional connectivity', 'resting state', 'task-based fmri'],
            'techniques': ['electrophysiology', 'optogenetics', 'calcium imaging',
                          'two-photon', 'confocal', 'microscopy', 'electrode'],
            'analysis': ['machine learning', 'deep learning', 'neural network',
                        'classification', 'decoding', 'connectivity analysis']
        }
        
        # Flatten keywords for easier searching
        self.all_keywords = []
        for category, words in self.keywords.items():
            self.all_keywords.extend(words)
    
    def add_keywords(self, new_keywords: List[str], category: str = 'custom'):
        """Add new keywords to search for"""
        if category not in self.keywords:
            self.keywords[category] = []
        
        self.keywords[category].extend(new_keywords)
        self.all_keywords.extend(new_keywords)
        print(f"Added {len(new_keywords)} keywords to category '{category}'")
    
    def set_keywords(self, keywords_dict: Dict[str, List[str]]):
        """Set custom keywords dictionary"""
        self.keywords = keywords_dict
        self.all_keywords = []
        for category, words in self.keywords.items():
            self.all_keywords.extend(words)
        print(f"Updated keywords. Total: {len(self.all_keywords)} keywords across {len(self.keywords)} categories")
    
    def check_relevance(self, article: Dict) -> Dict:
        """
        Check if article is relevant based on keywords
        Returns dict with relevance info
        """
        text_to_search = f"{article['title']} {article['summary']}".lower()
        
        found_keywords = []
        keyword_categories = []
        
        # Check each category
        for category, words in self.keywords.items():
            category_matches = []
            for keyword in words:
                if keyword.lower() in text_to_search:
                    category_matches.append(keyword)
                    if keyword not in found_keywords:
                        found_keywords.append(keyword)
            
            if category_matches:
                keyword_categories.append({
                    'category': category,
                    'matches': category_matches
                })
        
        relevance_score = len(found_keywords)
        is_relevant = relevance_score > 0
        
        return {
            'is_relevant': is_relevant,
            'relevance_score': relevance_score,
            'found_keywords': found_keywords,
            'keyword_categories': keyword_categories
        }
    
    def fetch_and_parse_rss(self) -> List[Dict]:
        """Download the feed at ``self.rss_url`` and convert its entries to dicts.

        Each article dict carries ``title``, ``link``, ``date``, ``summary``,
        ``authors``, ``doi`` and ``fetched_on``, plus the fields returned by
        ``check_relevance``.

        Returns:
            One dict per feed entry, or an empty list when the feed could not
            be fetched or parsed (the reason is printed, never raised).
        """
        try:
            feed = feedparser.parse(self.rss_url)

            # feedparser signals unreachable or malformed feeds through the
            # ``bozo`` flag instead of raising, so surface the cause when
            # nothing usable was parsed.
            if getattr(feed, 'bozo', False) and not feed.entries:
                reason = getattr(feed, 'bozo_exception', 'feed could not be parsed')
                print(f"Error fetching RSS feed: {reason}")
                return []

            fetched_on = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            articles = []

            for entry in feed.entries:
                # Some feeds carry only an update/dc:date stamp, which
                # feedparser is documented to expose as 'updated'; fall back
                # to it so the article date is not left blank.
                raw_date = entry.get('published') or entry.get('updated') or ''

                article = {
                    # .get() keeps one entry without a title or link from
                    # aborting the whole fetch.
                    'title': self.clean_title(entry.get('title', '')),
                    'link': entry.get('link', ''),
                    'date': self.parse_date(raw_date),
                    'summary': self.clean_summary(entry.get('summary', '')),
                    'authors': self.extract_authors(entry),
                    'doi': self.extract_doi(entry),
                    'fetched_on': fetched_on
                }

                # Attach is_relevant / relevance_score / matched keywords.
                article.update(self.check_relevance(article))
                articles.append(article)

            return articles

        except Exception as e:
            # Boundary of the network/parsing step: report and degrade to "no
            # articles" so callers can carry on.
            print(f"Error fetching RSS feed: {e}")
            return []
    
    def clean_title(self, title: str) -> str:
        """Remove CDATA tags and clean up the title"""
        if title:
            title = title.replace('<![CDATA[', '').replace(']]>', '')
            title = re.sub(r'<[^>]+>', '', title)
        return title.strip()
    
    def clean_summary(self, summary: str) -> str:
        """Clean up the summary text"""
        if summary:
            summary = summary.replace('<![CDATA[', '').replace(']]>', '')
            summary = re.sub(r'<[^>]+>', ' ', summary)
            summary = re.sub(r'\s+', ' ', summary)
        return summary.strip()
    
    def parse_date(self, date_str: str) -> str:
        """Parse and format the publication date"""
        if not date_str:
            return ""
        try:
            parsed_date = datetime.strptime(date_str[:10], '%Y-%m-%d')
            return parsed_date.strftime('%Y-%m-%d')
        except:
            return date_str
    
    def extract_authors(self, entry) -> List[str]:
        """Extract authors from the entry"""
        authors = []
        if hasattr(entry, 'authors'):
            for author in entry.authors:
                if hasattr(author, 'name'):
                    authors.append(author.name)
        return authors
    
    def extract_doi(self, entry) -> str:
        """Extract DOI from the entry"""
        doi = ""
        if hasattr(entry, 'prism_doi'):
            doi = entry.prism_doi
        elif hasattr(entry, 'id'):
            if 'doi:' in entry.id:
                doi = entry.id.replace('doi:', '')
        return doi
    
    def display_articles(self, articles: List[Dict], title: str = "Articles", show_relevance: bool = True):
        """Display articles in a formatted way"""
        print(f"\n{'='*80}")
        print(f"{title} ({len(articles)} articles)")
        print(f"{'='*80}")
        
        for i, article in enumerate(articles, 1):
            print(f"\n{i}. {article['title']}")
            print(f"   Link: {article['link']}")
            print(f"   Date: {article['date']}")
            
            if show_relevance and article.get('is_relevant'):
                print(f"   üéØ RELEVANCE SCORE: {article['relevance_score']}")
                print(f"   üìù KEYWORDS FOUND: {', '.join(article['found_keywords'])}")
                
                # Show which categories matched
                for cat_info in article['keyword_categories']:
                    print(f"   üìÇ {cat_info['category'].upper()}: {', '.join(cat_info['matches'])}")
            
            if article['authors']:
                authors_str = ', '.join(article['authors'][:3])
                if len(article['authors']) > 3:
                    authors_str += f" et al. ({len(article['authors'])} total)"
                print(f"   üë• Authors: {authors_str}")
            
            if article['doi']:
                print(f"   üîó DOI: {article['doi']}")
            
            if article['summary']:
                summary = article['summary'][:300] + "..." if len(article['summary']) > 300 else article['summary']
                print(f"   üìÑ Summary: {summary}")
            
            print("-" * 80)
    
    def filter_relevant_articles(self, articles: List[Dict]) -> List[Dict]:
        """Filter articles to only include relevant ones"""
        relevant = [article for article in articles if article.get('is_relevant', False)]
        
        # Sort by relevance score (highest first)
        relevant.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
        
        return relevant
    
    def save_relevant_articles(self, articles: List[Dict]):
        """Save only relevant articles to a separate file"""
        relevant_articles = self.filter_relevant_articles(articles)
        
        try:
            with open(self.relevant_articles_file, 'w', encoding='utf-8') as f:
                json.dump(relevant_articles, f, indent=2, ensure_ascii=False)
            print(f"üíæ {len(relevant_articles)} relevant articles saved to {self.relevant_articles_file}")
        except Exception as e:
            print(f"Error saving relevant articles: {e}")
    
    def generate_summary_report(self, articles: List[Dict]):
        """Generate a summary report of findings"""
        relevant_articles = self.filter_relevant_articles(articles)
        
        print(f"\n{'='*60}")
        print("üìä SUMMARY REPORT")
        print(f"{'='*60}")
        print(f"Total articles checked: {len(articles)}")
        print(f"Relevant articles found: {len(relevant_articles)}")
        print(f"Relevance rate: {len(relevant_articles)/len(articles)*100:.1f}%" if articles else "0%")
        
        if relevant_articles:
            # Keyword frequency analysis
            keyword_freq = {}
            category_freq = {}
            
            for article in relevant_articles:
                for keyword in article.get('found_keywords', []):
                    keyword_freq[keyword] = keyword_freq.get(keyword, 0) + 1
                
                for cat_info in article.get('keyword_categories', []):
                    cat = cat_info['category']
                    category_freq[cat] = category_freq.get(cat, 0) + 1
            
            print(f"\nüîù Top Keywords Found:")
            sorted_keywords = sorted(keyword_freq.items(), key=lambda x: x[1], reverse=True)
            for keyword, count in sorted_keywords[:10]:
                print(f"   ‚Ä¢ {keyword}: {count} articles")
            
            print(f"\nüìÇ Category Distribution:")
            for category, count in sorted(category_freq.items()):
                print(f"   ‚Ä¢ {category}: {count} articles")
            
            print(f"\n‚≠ê Highest Scoring Articles:")
            top_articles = sorted(relevant_articles, key=lambda x: x.get('relevance_score', 0), reverse=True)[:5]
            for i, article in enumerate(top_articles, 1):
                print(f"   {i}. [{article['relevance_score']} pts] {article['title'][:60]}...")
    
    def check_daily_updates(self):
        """Main method to check for daily updates with keyword filtering"""
        print(f"üîç Checking Nature Neuroscience RSS feed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"üéØ Searching for articles containing keywords from {len(self.keywords)} categories")
        print(f"üìù Total keywords: {len(self.all_keywords)}")
        
        # Fetch and analyze articles
        articles = self.fetch_and_parse_rss()
        
        if not articles:
            print("‚ùå No articles found or error occurred while fetching.")
            return
        
        # Filter relevant articles
        relevant_articles = self.filter_relevant_articles(articles)
        
        print(f"\n‚úÖ Found {len(relevant_articles)} relevant articles out of {len(articles)} total articles")
        
        if relevant_articles:
            self.display_articles(relevant_articles, "üéØ RELEVANT ARTICLES")
            self.save_relevant_articles(articles)
        else:
            print("üòû No relevant articles found with current keywords.")
            print("üí° Consider expanding your keyword list or checking back later.")
        
        # Generate summary report
        self.generate_summary_report(articles)
        
        return relevant_articles

def main():
    """Run one keyword-filtered check of the feed and save a timestamped report.

    Builds a parser with the default feed URL, installs a custom keyword
    table, runs ``check_daily_updates`` and, when relevant articles were
    found, writes them to ``relevant_articles_report_<timestamp>.json``.
    A failure to write the report is printed rather than raised, matching
    how the parser handles its own save errors.
    """
    parser = KeywordFilteredRSSParser()

    # Keyword table used for this run; edit freely.
    custom_keywords = {
        'neuroimaging': ['eeg', 'fmri', 'mri', 'pet', 'neuroimaging', 'bold',
                        'functional magnetic resonance', 'electroencephalography',
                        'magnetoencephalography', 'meg', 'diffusion tensor imaging',
                        'dti', 'resting state', 'task fmri', 'connectivity'],
        'brain_stimulation': ['tms', 'transcranial magnetic stimulation',
                             'transcranial direct current stimulation', 'tdcs',
                             'deep brain stimulation', 'dbs', 'optogenetics'],
        'analysis_methods': ['machine learning', 'deep learning', 'artificial intelligence',
                           'neural decoding', 'classification', 'regression',
                           'connectivity analysis', 'network analysis', 'graph theory'],
        'techniques': ['electrophysiology', 'single cell', 'calcium imaging',
                      'two-photon microscopy', 'patch clamp', 'microelectrode']
    }

    parser.set_keywords(custom_keywords)

    # Extra terms can also be appended to a category, e.g.:
    # parser.add_keywords(['nirs', 'fnirs', 'near infrared'], 'neuroimaging')

    relevant_articles = parser.check_daily_updates()

    # Nothing relevant (or the fetch failed): no report to write.
    if not relevant_articles:
        return

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_file = f"relevant_articles_report_{timestamp}.json"
    try:
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(relevant_articles, f, indent=2, ensure_ascii=False)
    except OSError as e:
        print(f"Error saving report: {e}")
        return

    print(f"\n💾 Detailed report saved to {report_file}")

# Script entry point: call main() only when this module is the program being
# run (its __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Updated keywords. Total: 37 keywords across 4 categories
🔍 Checking Nature Neuroscience RSS feed at 2025-07-16 10:42:19
🎯 Searching for articles containing keywords from 4 categories
📝 Total keywords: 37

✅ Found 3 relevant articles out of 39 total articles

🎯 RELEVANT ARTICLES (3 articles)

1. How the brain shifts between external and internal attention
   Link: https://www.cell.com/neuron/fulltext/S0896-6273(25)00471-4?rss=yes
   Date: 
   🎯 RELEVANCE SCORE: 1
   📝 KEYWORDS FOUND: pet
   📂 NEUROIMAGING: pet
   👥 Authors: Anna C. Nobre, Daniela Gresch
   📄 Summary: Nobre and Gresch call for an upgrade of attention research by considering how the brain shifts its focus between contents in the external sensory stream and internal memory representations. They highlight competing hypotheses, review the few experimental attempts and findings, propose candidate neur...
--------------------------------------------------------------------------------

2. Grouping 