In [1]:
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter # Though not directly used in the provided functions, it's good for keyword counting if you extend
from sklearn.feature_extraction.text import TfidfVectorizer
import string # For punctuation, though spaCy handles a lot of this
import pandas as pd # For creating and displaying DataFrames with NLP results

# Confirm the import cell ran and stamp the run with a timezone-aware
# (Asia/Kolkata) current time obtained through pandas.
print(
    "NLP libraries imported.",
    f"Current time: {pd.Timestamp.now(tz='Asia/Kolkata')}",
    sep="\n",
)

NLP libraries imported.
Current time: 2025-05-23 09:17:32.196486+05:30


In [2]:
# --- spaCy pipeline ---
# Prerequisite (run once in a terminal): python -m spacy download en_core_web_sm
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Downstream cells treat a falsy `nlp` as "spaCy unavailable".
    nlp = None
    print("\n".join((
        "spaCy model 'en_core_web_sm' not found.",
        "Please run the following in your terminal (with your virtual environment activated):",
        "python -m spacy download en_core_web_sm",
        "Then, restart this Jupyter kernel and re-run this cell.",
    )))
else:
    print("spaCy model 'en_core_web_sm' loaded successfully.")

# --- VADER sentiment analyzer ---
try:
    vader_analyzer = SentimentIntensityAnalyzer()
except Exception as init_error:
    # Downstream cells treat a falsy `vader_analyzer` as "VADER unavailable".
    vader_analyzer = None
    print(f"Error initializing VADER: {init_error}")
else:
    print("VADER sentiment analyzer initialized successfully.")

# --- Readiness summary ---
print(
    "Both spaCy and VADER are ready."
    if nlp and vader_analyzer
    else "One or more NLP tools could not be initialized. Please check messages above."
)

spaCy model 'en_core_web_sm' loaded successfully.
VADER sentiment analyzer initialized successfully.
Both spaCy and VADER are ready.


In [3]:
def get_sentiment(text):
    """
    Score the sentiment of ``text`` with the module-level VADER analyzer.

    Returns a dict with two keys: 'score' (VADER's compound score) and
    'label' ('positive', 'negative' or 'neutral'). Empty text, non-string
    input, or an unavailable analyzer yields a neutral result with score 0.0.
    """
    # The text is validated first, so invalid input never touches the analyzer.
    usable = bool(text) and isinstance(text, str) and bool(vader_analyzer)
    if not usable:
        return {'score': 0.0, 'label': 'neutral'}

    compound = vader_analyzer.polarity_scores(text)['compound']

    # Compound scores strictly between -0.05 and 0.05 count as neutral.
    if compound <= -0.05:
        sentiment_label = 'negative'
    elif compound >= 0.05:
        sentiment_label = 'positive'
    else:
        sentiment_label = 'neutral'

    return {'score': compound, 'label': sentiment_label}

print("get_sentiment function defined.")

# --- Smoke test for get_sentiment ---
print("\n--- Testing get_sentiment function ---")
test_sentiments = [
    "This is a great and wonderful development for renewable energy!", # Expected positive
    "The progress is terribly slow and disappointing for the sector.",   # Expected negative
    "The report was published today.",                                  # Expected neutral
    "Solar power is good, but wind energy can sometimes be unreliable.", # Mixed, VADER will give a compound
    "", # Empty string
    None # None input
]

if not vader_analyzer:
    # Without VADER, get_sentiment only returns its neutral default.
    print("VADER analyzer not initialized. Skipping get_sentiment tests.")
else:
    for number, sentence in enumerate(test_sentiments, start=1):
        sentiment_result = get_sentiment(sentence)
        print(f"Sentence {number}: '{sentence}'")
        print(f"  Sentiment: Score={sentiment_result['score']:.3f}, Label='{sentiment_result['label']}'")

get_sentiment function defined.

--- Testing get_sentiment function ---
Sentence 1: 'This is a great and wonderful development for renewable energy!'
  Sentiment: Score=0.880, Label='positive'
Sentence 2: 'The progress is terribly slow and disappointing for the sector.'
  Sentiment: Score=-0.612, Label='negative'
Sentence 3: 'The report was published today.'
  Sentiment: Score=0.000, Label='neutral'
Sentence 4: 'Solar power is good, but wind energy can sometimes be unreliable.'
  Sentiment: Score=0.557, Label='positive'
Sentence 5: ''
  Sentiment: Score=0.000, Label='neutral'
Sentence 6: 'None'
  Sentiment: Score=0.000, Label='neutral'


In [4]:
def get_entities(text):
    """
    Run spaCy named-entity recognition over ``text``.

    Returns a dict mapping each entity's whitespace-stripped text to its
    label; when the same entity text occurs more than once, the label of the
    last occurrence is kept. Empty text, non-string input, or an unloaded
    spaCy model yields an empty dict.
    """
    if not (text and isinstance(text, str) and nlp):
        return {}

    # Cap the input at nlp.max_length characters before running the pipeline.
    parsed = nlp(text[:nlp.max_length])

    return {entity.text.strip(): entity.label_ for entity in parsed.ents}

print("get_entities function defined.")

# --- Smoke test for get_entities ---
print("\n--- Testing get_entities function ---")
test_entities_text = [
    "Apple Inc. is looking at buying U.K. startup for $1 billion in London.",
    "Dr. Emily Carter from Princeton University published a paper on solar energy in Germany.",
    "The recent G7 summit discussed advancements in renewable technology in Tokyo last week.",
    "Tesla and SpaceX, companies led by Elon Musk, are pushing boundaries.",
    "" # Empty string
]

if not nlp:
    # Without a spaCy model, get_entities only returns an empty dict.
    print("spaCy nlp model not initialized. Skipping get_entities tests.")
else:
    for number, sentence in enumerate(test_entities_text, start=1):
        entities_result = get_entities(sentence)
        print(f"Sentence {number}: '{sentence}'")
        print(f"  Entities: {entities_result}")

get_entities function defined.

--- Testing get_entities function ---
Sentence 1: 'Apple Inc. is looking at buying U.K. startup for $1 billion in London.'
  Entities: {'Apple Inc.': 'ORG', 'U.K.': 'GPE', '$1 billion': 'MONEY', 'London': 'GPE'}
Sentence 2: 'Dr. Emily Carter from Princeton University published a paper on solar energy in Germany.'
  Entities: {'Emily Carter': 'PERSON', 'Princeton University': 'ORG', 'Germany': 'GPE'}
Sentence 3: 'The recent G7 summit discussed advancements in renewable technology in Tokyo last week.'
  Entities: {'G7': 'PRODUCT', 'Tokyo': 'GPE', 'last week': 'DATE'}
Sentence 4: 'Tesla and SpaceX, companies led by Elon Musk, are pushing boundaries.'
  Entities: {'Tesla': 'ORG', 'Elon Musk': 'PERSON'}
Sentence 5: ''
  Entities: {}


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# spaCy's stop words list can be quite comprehensive.
# Ensure spaCy is loaded if you plan to use its stop words directly here,
# otherwise, TfidfVectorizer has its own 'english' stop word list.
# For simplicity, if nlp (spaCy model) is loaded, we will use its stop words.
# Otherwise, TfidfVectorizer will use its default English stop words.

def get_keywords_tfidf(text_list, num_keywords=10):
    """
    Extract the top TF-IDF keywords for the first document in ``text_list``.

    Args:
        text_list: List of document strings (typically one article's content
            wrapped in a one-element list). A bare string is accepted and
            treated as a one-document list. Empty strings and None entries
            are skipped; any other non-string or whitespace-only entry makes
            the whole input invalid.
        num_keywords: Maximum number of keywords to return. Zero or a
            negative value yields an empty dict; None returns every keyword.

    Returns:
        dict mapping keyword (unigram or bigram) to its TF-IDF score for the
        first non-empty document, ordered by descending score. Returns {} for
        invalid input, for an empty vocabulary (e.g. text made only of stop
        words), or when the vectorizer fails unexpectedly (a RuntimeWarning
        is emitted in that last case).
    """
    # A bare string would otherwise be iterated character by character,
    # which can never produce a usable vocabulary.
    if isinstance(text_list, str):
        text_list = [text_list]

    if not text_list:
        return {}

    # A negative count would slice from the end of the ranking, so reject
    # non-positive counts up front.
    if num_keywords is not None and num_keywords <= 0:
        return {}

    # Falsy entries ('' / None) are skipped; everything that remains must be
    # a string with non-whitespace content, otherwise the input is rejected.
    documents = [entry for entry in text_list if entry]
    if not documents or not all(isinstance(doc, str) and doc.strip() for doc in documents):
        return {}

    try:
        # Prefer spaCy's stop-word list when the model is loaded; otherwise
        # fall back to TfidfVectorizer's built-in 'english' list.
        current_stop_words = 'english'
        if nlp and hasattr(nlp.Defaults, 'stop_words'):
            current_stop_words = list(nlp.Defaults.stop_words)

        # token_pattern keeps hyphenated words together and requires tokens
        # of at least 2 characters.
        vectorizer = TfidfVectorizer(
            stop_words=current_stop_words,
            max_features=1000,         # Limit the vocabulary size
            ngram_range=(1, 2),        # Consider unigrams and bigrams
            token_pattern=r'(?u)\b\w[\w-]*\w\b|\b\w\w+\b'
        )

        tfidf_matrix = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # Only the first document is reported; any further documents still
        # take part in fitting the vectorizer.
        doc_vector = tfidf_matrix[0]

        # nonzero()[1] lists the column indices of the features present in the document.
        tfidf_scores_dict = {
            feature_names[col_idx]: doc_vector[0, col_idx]
            for col_idx in doc_vector.nonzero()[1]
        }

        sorted_keywords = sorted(tfidf_scores_dict.items(), key=lambda item: item[1], reverse=True)
        return dict(sorted_keywords[:num_keywords])

    except ValueError:
        # Expected when the vocabulary is empty after stop-word removal
        # (e.g. very short text made only of stop words).
        return {}
    except Exception as exc:
        # Stay best-effort, but surface the failure instead of hiding it
        # (e.g. `nlp` undefined because the setup cell was not run).
        import warnings
        warnings.warn(
            f"get_keywords_tfidf failed unexpectedly: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return {}

print("get_keywords_tfidf function defined.")

# --- Test cases for get_keywords_tfidf ---
print("\n--- Testing get_keywords_tfidf function ---")
test_keyword_texts = [
    ["Renewable energy sources like solar power and wind turbines are crucial for combating climate change. Investment in green technology is increasing."],
    ["The report details financial spending and budget allocations for the upcoming fiscal year. It's a very dry document about economics."],
    ["This is a very short sentence."],
    ["         "], # Text with only whitespace
    ["the a an of"] # Text with only common stop words
]

# get_keywords_tfidf chooses its own stop-word list: spaCy's when `nlp` is
# loaded, TfidfVectorizer's built-in 'english' list otherwise. The global
# `nlp` therefore never needs to be toggled here; only the label differs.
if nlp:
    result_label = "  Keywords (TF-IDF)"
else:
    print("spaCy nlp model not loaded. TF-IDF will use default stop words. Results might differ slightly or fail if spaCy stop words were strictly intended.")
    result_label = "  Keywords (TF-IDF) (using default 'english' stop words)"

for i, text_as_list in enumerate(test_keyword_texts):
    keywords_result = get_keywords_tfidf(text_as_list)
    # Join the list for printing, since the function takes a list of strings.
    print(f"Text {i+1}: '{' '.join(text_as_list if text_as_list else [''])}'")
    print(f"{result_label}: {keywords_result}")

get_keywords_tfidf function defined.

--- Testing get_keywords_tfidf function ---
Text 1: 'Renewable energy sources like solar power and wind turbines are crucial for combating climate change. Investment in green technology is increasing.'
  Keywords (TF-IDF): {'technology increasing': 0.1796053020267749, 'green technology': 0.1796053020267749, 'investment green': 0.1796053020267749, 'change investment': 0.1796053020267749, 'climate change': 0.1796053020267749, 'combating climate': 0.1796053020267749, 'crucial combating': 0.1796053020267749, 'turbines crucial': 0.1796053020267749, 'wind turbines': 0.1796053020267749, 'power wind': 0.1796053020267749}
Text 2: 'The report details financial spending and budget allocations for the upcoming fiscal year. It's a very dry document about economics.'
  Keywords (TF-IDF): {'document economics': 0.20851441405707477, 'dry document': 0.20851441405707477, 'year dry': 0.20851441405707477, 'fiscal year': 0.20851441405707477, 'upcoming fiscal': 0.208514



In [6]:
def process_text_content(text_content):
    """
    Run sentiment, entity and keyword extraction over one piece of text.

    Returns a 3-tuple ``(sentiment_dict, keywords_dict, entities_dict)``.
    Blank or non-string input returns the neutral/empty defaults
    ``({'score': 0.0, 'label': 'neutral'}, {}, {})``; so does a missing
    spaCy model or VADER analyzer, after printing an error message.
    """
    # Built fresh on every call so callers can mutate what they receive.
    default_result = ({'score': 0.0, 'label': 'neutral'}, {}, {})

    has_text = isinstance(text_content, str) and bool(text_content.strip())
    if not has_text:
        return default_result

    if not (nlp and vader_analyzer):
        print("Error: NLP models (spaCy/VADER) not loaded in process_text_content.")
        return default_result

    sentiment = get_sentiment(text_content)
    entities = get_entities(text_content)
    # get_keywords_tfidf takes a list of documents, so the single text is wrapped.
    keywords = get_keywords_tfidf([text_content])

    return sentiment, keywords, entities

print("process_text_content function defined.")

# --- Smoke test for process_text_content ---
print("\n--- Testing process_text_content function ---")

# One passage that mixes organisations, people, places, figures and opinion words
comprehensive_text = ("TechGlobal Corp. announced a groundbreaking solar panel with 50% increased efficiency. "
                      "This innovation, developed in their Berlin labs by Dr. Eva Rostova, "
                      "is expected to significantly impact the renewable energy market in Europe. "
                      "Analysts are very optimistic about this fantastic news, despite some initial high costs.")

empty_text_test = "   " # Whitespace-only input

if not (nlp and vader_analyzer):
    print("spaCy nlp model or VADER analyzer not loaded. Skipping process_text_content tests.")
else:
    # Case 1: realistic text
    print(f"\nProcessing Comprehensive Text: '{comprehensive_text}'")
    s_comp, k_comp, e_comp = process_text_content(comprehensive_text)
    print(f"  Sentiment: {s_comp}")
    print(f"  Keywords: {k_comp}")
    print(f"  Entities: {e_comp}")

    # Case 2: whitespace-only text
    print(f"\nProcessing Empty Text: '{empty_text_test}'")
    s_empty, k_empty, e_empty = process_text_content(empty_text_test)
    print(f"  Sentiment (empty): {s_empty}")
    print(f"  Keywords (empty): {k_empty}")
    print(f"  Entities (empty): {e_empty}")

    # Case 3: None instead of a string
    print("\nProcessing None input:")
    s_none, k_none, e_none = process_text_content(None)
    print(f"  Sentiment (None): {s_none}")
    print(f"  Keywords (None): {k_none}")
    print(f"  Entities (None): {e_none}")

process_text_content function defined.

--- Testing process_text_content function ---

Processing Comprehensive Text: 'TechGlobal Corp. announced a groundbreaking solar panel with 50% increased efficiency. This innovation, developed in their Berlin labs by Dr. Eva Rostova, is expected to significantly impact the renewable energy market in Europe. Analysts are very optimistic about this fantastic news, despite some initial high costs.'
  Sentiment: {'score': 0.9259, 'label': 'positive'}
  Keywords: {'high costs': 0.12803687993289598, 'initial high': 0.12803687993289598, 'despite initial': 0.12803687993289598, 'news despite': 0.12803687993289598, 'fantastic news': 0.12803687993289598, 'optimistic fantastic': 0.12803687993289598, 'analysts optimistic': 0.12803687993289598, 'europe analysts': 0.12803687993289598, 'market europe': 0.12803687993289598, 'energy market': 0.12803687993289598}
  Entities: {'TechGlobal Corp.': 'ORG', '50%': 'PERCENT', 'Berlin': 'GPE', 'Eva Rostova': 'PERSON', 'Eu

In [7]:
# This cell is optional.
# It runs the full NLP pipeline on a few real articles, loaded from the CSV
# saved in Notebook 2 (expected in the current working directory).
try:
    scraped_df_for_nlp_test = pd.read_csv("scraped_articles_combined_notebook.csv")
except FileNotFoundError:
    print("CSV file 'scraped_articles_combined_notebook.csv' not found. Skipping this optional test.")
    scraped_df_for_nlp_test = pd.DataFrame() # Create empty DataFrame to avoid errors

if not scraped_df_for_nlp_test.empty and nlp and vader_analyzer:
    print("\n--- Processing a Sample of Actual Scraped Articles (from CSV) ---")
    # Take a small sample: the first 3 articles that actually have content.
    # astype(str) keeps the .str accessor usable even when pandas read the
    # column as non-string (e.g. every value missing); notna() still masks
    # out the missing rows.
    content_column = scraped_df_for_nlp_test['full_content']
    has_content = content_column.notna() & (content_column.astype(str).str.strip() != '')
    sample_for_processing = scraped_df_for_nlp_test[has_content].head(3)

    if not sample_for_processing.empty:
        for index, row in sample_for_processing.iterrows():
            print(f"\nProcessing Article URL: {row['url']}")
            # A missing title is read by pandas as NaN (a float), which cannot be sliced.
            title = row['title'] if isinstance(row['title'], str) else '(no title)'
            print(f"Title: {title[:80]}...")

            content_to_process = row['full_content']
            sentiment_result, keywords_result, entities_result = process_text_content(content_to_process)

            print(f"  Sentiment: {sentiment_result}")
            print(f"  Keywords: {keywords_result}")
            print(f"  Entities: {entities_result}")
    else:
        print("No suitable articles with content found in the loaded CSV for processing.")
elif not nlp or not vader_analyzer:
    print("NLP tools (spaCy/VADER) not loaded. Skipping processing of scraped articles.")
else:
    print("Skipping optional processing of scraped articles from CSV as no data was loaded.")

# --- Fallback: Process a predefined test article if no CSV is loaded ---
# if not ('scraped_df_for_nlp_test' in locals() and not scraped_df_for_nlp_test.empty) and nlp and vader_analyzer:
#     print("\n--- Processing a Predefined Test Article (as CSV was not loaded/empty) ---")
#     test_article_content_real = """
#     Major advancements in geothermal energy extraction were reported today from a research facility in Iceland.
#     The new technique, pioneered by Dr. Aris Thorne of GeoDynamics Inc., promises to double efficiency.
#     This could make geothermal power a more viable option for countries like Japan and the United States.
#     The financial markets reacted positively, with GeoDynamics Inc. (GEO) shares surging by 15% on the New York Stock Exchange.
#     Environmental groups have lauded this as a significant step towards sustainable energy independence.
#     The project received initial funding of $50 million from the Global Environment Fund.
#     """
#     sentiment_result, keywords_result, entities_result = process_text_content(test_article_content_real)
#     print(f"Test Article Content: '{test_article_content_real[:100]}...'")
#     print(f"  Sentiment: {sentiment_result}")
#     print(f"  Keywords: {keywords_result}")
#     print(f"  Entities: {entities_result}")


--- Processing a Sample of Actual Scraped Articles (from CSV) ---

Processing Article URL: https://www.theguardian.com/environment/2025/may/22/revealed-uranium-from-uk-nuclear-fuel-factory-dumped-into-protected-ribble-estuary
Title: Revealed: three tonnes of uranium legally dumped in protected English estuary in...




  Sentiment: {'score': 0.9928, 'label': 'positive'}
  Keywords: {'environment': 0.2885342830935783, 'uranium': 0.25246749770688104, 'fuels': 0.18033392693348646, 'springfields': 0.18033392693348646, 'discharges': 0.18033392693348646, 'environment agency': 0.16230053424013782, 'said': 0.16230053424013782, 'agency': 0.16230053424013782, 'springfields fuels': 0.14426714154678916, 'radioactivity': 0.14426714154678916}
  Entities: {'The Environment Agency': 'ORG', 'three tonnes': 'QUANTITY', 'England': 'GPE', 'the past nine years': 'DATE', 'Guardian': 'ORG', 'the Ends Report': 'ORG', 'Preston': 'ORG', 'between 2015 and 2024': 'DATE', '2015': 'DATE', '703kg': 'QUANTITY', 'Lea Town': 'PERSON', 'roughly five miles': 'QUANTITY', 'several million': 'CARDINAL', '11': 'CARDINAL', 'about 800': 'CARDINAL', 'SPA': 'ORG', 'Ramsar': 'PERSON', 'November 2024': 'DATE', '2023': 'DATE', 'Springfields Fuels': 'PERSON', 'approximately 4%': 'PERCENT', 'Dr Ian Fairlile': 'ORG', 'UK': 'GPE', '2009': 'DATE', 'th