In [14]:
import newspaper
import feedparser
import pandas as pd
from textblob import TextBlob
from textblob import download_corpora



In [4]:
def scrape_news_from_feed(feed_url):
    """Download and parse every article linked from a news feed.

    Args:
        feed_url: Location of the feed, passed straight to ``feedparser.parse``.

    Returns:
        A list with one dict per successfully processed article. Each dict
        has the keys ``'title'``, ``'author'`` (the value of
        ``article.authors``), ``'publish_date'`` and ``'content'`` (the
        value of ``article.text``).

    Feed entries without a link, and articles whose download or parse raises
    ``newspaper.ArticleException``, are skipped so that a single bad article
    does not discard everything collected so far.
    """
    articles = []
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        # Not every feed entry is guaranteed to carry a link.
        link = entry.get('link')
        if not link:
            continue
        # create a newspaper article object
        article = newspaper.Article(link)
        try:
            # download and parse the article
            article.download()
            article.parse()
        except newspaper.ArticleException as exc:
            # Report and move on instead of aborting the whole scrape.
            print(f"Skipping {link}: {exc}")
            continue
        # extract relevant information
        articles.append({
            'title': article.title,
            'author': article.authors,
            'publish_date': article.publish_date,
            'content': article.text
        })
    return articles

In [5]:
# Feed to scrape; every linked article is fetched by scrape_news_from_feed.
feed_url = "http://feeds.bbci.co.uk/news/rss.xml"
articles = scrape_news_from_feed(feed_url=feed_url)

In [18]:
# Build a DataFrame from the scraped records: one row per article,
# one column per dict key.
df = pd.DataFrame(data=articles)

In [19]:
# DataFrame.info() prints its report itself and returns None, so it must not
# be passed to print() -- doing so emits a stray "None" in front of the
# describe() table.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         38 non-null     object
 1   author        38 non-null     object
 2   publish_date  0 non-null      object
 3   content       38 non-null     object
dtypes: object(4)
memory usage: 1.3+ KB
None                                                     title author publish_date  \
count                                                  38     38            0   
unique                                                 37      3            0   
top     Husband of Thai woman found dead in Yorkshire ...     []          NaN   
freq                                                    2     36          NaN   

                                                  content  
count                                                  38  
unique                                                 37  
top     Husband

In [7]:
# Persist the scraped articles as an Excel workbook (row index omitted).
excel_file_name = "news_articles.xlsx"
df.to_excel(excel_writer=excel_file_name, index=False)
print(f"Data saved to {excel_file_name}")

Data saved to news_articles.xlsx


In [22]:
# Keep only the text columns, drop articles without usable content, and
# remove exact duplicates.
df_cleaned = (
    df[['title', 'content']]
    .dropna(subset=['content'])
    # A failed extraction yields an empty string rather than NaN, so blank
    # content has to be filtered explicitly.
    .loc[lambda frame: frame['content'].str.strip().ne('')]
    .drop_duplicates()
    # Explicit copy: new columns are assigned to this frame later, which
    # would otherwise raise SettingWithCopyWarning on a derived slice.
    .copy()
)

In [24]:
# DataFrame.info() prints its report itself and returns None, so it must not
# be passed to print() -- doing so emits a stray "None" in front of the
# describe() table.
df_cleaned.info()
print(df_cleaned.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 37 entries, 0 to 37
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    37 non-null     object
 1   content  37 non-null     object
dtypes: object(2)
memory usage: 888.0+ bytes
None                                                     title  \
count                                                  37   
unique                                                 37   
top     Displaced Palestinians wait for Israel to allo...   
freq                                                    1   

                                                  content  
count                                                  37  
unique                                                 37  
top     Palestinians' return on hold as Israel accuses...  
freq                                                    1  


In [25]:
# Sentiment scoring helper
def get_sentiment(text):
    """Return the TextBlob polarity of ``text``.

    The score ranges from -1 (negative) to 1 (positive).
    """
    return TextBlob(text).sentiment.polarity

# Score every article body with get_sentiment.
df_cleaned['sentiment_score'] = df_cleaned['content'].apply(get_sentiment)

# Derive a label from the sign of the score: > 0 positive, < 0 negative,
# exactly 0 neutral.
df_cleaned['sentiment_label'] = df_cleaned['sentiment_score'].apply(
    lambda score: 'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
)
# Show the frame that actually received the new columns; the raw `df`
# does not contain the sentiment results.
print(df_cleaned)

                                                title  \
0   Displaced Palestinians wait for Israel to allo...   
1   'My beauty, you're home': Israeli women soldie...   
2   Holocaust survivors fear Europe is forgetting ...   
3   Leo Ross stabbing: Boy, 14, charged with Hall ...   
4   Rachel Reeves: Why is she suddenly in a rush, ...   
5   I spent 30 years trying to unlock the secret t...   
6   The women who brought a Tinder predator to jus...   
7   The Traitors finale most-watched live episode ...   
8   'I don't want to buy a £4 coffee just so I can...   
9        WH Smith in talks to sell high street stores   
10  Trump purges at least a dozen inspectors gener...   
11  Neil Gaiman's publisher cancels future works a...   
12  The Night We Met, Evergreen and more 2010s son...   
13  Newspaper headlines: Reeves' war on 'blockers'...   
14  Meta wants X-style community notes to replace ...   
15  Mexico prepares to welcome back migrants from ...   
16       The Traitors UK: Twist

In [27]:
# Function to extract top keywords
def get_top_keywords(text, top_n=3):
    """Return the noun phrases of ``text`` with the strongest sentiment.

    Args:
        text: The text to analyse.
        top_n: Maximum number of phrases to return (default 3).

    Returns:
        A list of at most ``top_n`` distinct noun phrases, ordered by the
        absolute value of their TextBlob polarity, strongest first. Phrases
        with equal strength keep their order of first appearance.
    """
    # The extracted phrase list may contain the same phrase several times;
    # dict.fromkeys removes repeats while preserving first-appearance order so
    # one phrase cannot occupy several of the top slots.
    unique_phrases = dict.fromkeys(TextBlob(text).noun_phrases)
    # Pair each phrase with its own polarity score.
    scored = [(phrase, TextBlob(phrase).sentiment.polarity) for phrase in unique_phrases]
    # Rank by polarity magnitude (strongly negative counts as much as
    # strongly positive) and keep the first top_n.
    ranked = sorted(scored, key=lambda pair: abs(pair[1]), reverse=True)
    return [phrase for phrase, _ in ranked[:top_n]]
# Compute the keyword list for each article body and store it as a new column.
keywords_per_article = df_cleaned['content'].apply(get_top_keywords)
df_cleaned['top_keywords'] = keywords_per_article

print(df_cleaned)

                                                title  \
0   Displaced Palestinians wait for Israel to allo...   
1   'My beauty, you're home': Israeli women soldie...   
2   Holocaust survivors fear Europe is forgetting ...   
3   Leo Ross stabbing: Boy, 14, charged with Hall ...   
4   Rachel Reeves: Why is she suddenly in a rush, ...   
5   I spent 30 years trying to unlock the secret t...   
6   The women who brought a Tinder predator to jus...   
7   The Traitors finale most-watched live episode ...   
8   'I don't want to buy a £4 coffee just so I can...   
9        WH Smith in talks to sell high street stores   
10  Trump purges at least a dozen inspectors gener...   
11  Neil Gaiman's publisher cancels future works a...   
12  The Night We Met, Evergreen and more 2010s son...   
13  Newspaper headlines: Reeves' war on 'blockers'...   
14  Meta wants X-style community notes to replace ...   
15  Mexico prepares to welcome back migrants from ...   
16       The Traitors UK: Twist

In [28]:
# Write the analysis DataFrame to an Excel workbook (row index omitted).
excel_file_name = "results.xlsx"
df_cleaned.to_excel(excel_writer=excel_file_name, index=False)
print(f"Data saved to {excel_file_name}")

Data saved to results.xlsx
