In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the news website (replace with a real URL if possible)
url = "https://example.com/news"

# Sample headlines used as a fallback when the page cannot be fetched or
# contains no <h2> elements, so the rest of the notebook still has data to
# work with while `url` is a placeholder.
sample_headlines = [
    "Government Announces New Climate Change Policy",
    "Local Schools Embrace Remote Learning Technology",
    "Tech Giants Report Record Profits Amid Economic Downturn",
    "Scientists Discover New Species in the Amazon Rainforest",
    "City Council Approves Plan to Revitalize Downtown Area",
    "Healthcare Workers Demand Better Working Conditions",
    "Breakthrough in Cancer Research Offers New Hope",
    "Sports Team Wins Championship After Decade-Long Drought",
    "Experts Warn of Rising Sea Levels Along Coastal Cities",
    "Controversial Bill Passes in the Senate by Narrow Margin",
    "Study Shows Link Between Diet and Mental Health",
    "Housing Prices Continue to Soar in Major Cities",
    "Startup Disrupts Industry with Innovative New Product",
    "Wildfire Spreads Across Thousands of Acres",
    "Local Artists Hold Exhibition on Climate Awareness",
    "Unemployment Rates Drop as Economy Recovers",
    "Cybersecurity Firm Reports Increase in Data Breaches",
    "New Initiative Aims to Improve Literacy Rates",
    "University Launches AI Research Center",
    "Public Protests Against Government Surveillance Measures"
]

# Request and parse the webpage. The timeout stops the cell from hanging on an
# unresponsive host, and raise_for_status() turns HTTP error pages into
# exceptions instead of parsing them as if they were news.
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract and store headlines in a list, skipping empty <h2> elements
    headlines = [h2.text.strip() for h2 in soup.find_all('h2')]
    headlines = [headline for headline in headlines if headline]
except requests.RequestException as exc:
    print(f"Could not fetch {url}: {exc}")
    headlines = []

# Only fall back to the sample data when scraping produced nothing; previously
# the scraped headlines were always overwritten by the sample list.
if not headlines:
    headlines = sample_headlines

print("Headlines:", headlines)

In [None]:
from transformers import pipeline
import pandas as pd

# Build the sentiment-analysis pipeline (no model specified, so the library default is used)
sentiment_analyzer = pipeline("sentiment-analysis")

# Score every headline, keeping only the first prediction returned for each one
top_predictions = [sentiment_analyzer(text)[0] for text in headlines]

# Pair each headline with its predicted label and confidence score
sentiment_results = [
    {
        'Headline': text,
        'Sentiment': prediction['label'],
        'Score': prediction['score'],
    }
    for text, prediction in zip(headlines, top_predictions)
]

# Tabulate the records for easy viewing and manipulation
sentiment_df = pd.DataFrame(sentiment_results)
sentiment_df.head()

In [None]:
import matplotlib.pyplot as plt

# Count the number of headlines in each sentiment category
sentiment_counts = sentiment_df['Sentiment'].value_counts()

# Tie each colour to a sentiment label instead of to the bar position:
# value_counts() orders categories by frequency, so positional colours would
# swap between runs whenever the majority sentiment changes.
# NOTE(review): the label names assume the analyzer emits POSITIVE/NEGATIVE —
# confirm against the model in use; any other label is drawn in light gray.
label_colors = {'POSITIVE': 'skyblue', 'NEGATIVE': 'salmon'}
bar_colors = [label_colors.get(label, 'lightgray') for label in sentiment_counts.index]

# Plot sentiment distribution as a bar chart
plt.figure(figsize=(8, 6))
plt.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors)
plt.xlabel("Sentiment")
plt.ylabel("Number of Headlines")
plt.title("Sentiment Distribution of News Headlines")
plt.show()

In [None]:
from collections import Counter
import re

# Words too common to be informative; they are dropped before counting
stop_words = {'the', 'is', 'on', 'and', 'a', 'of', 'to', 'in', 'for'}

# Lower-case every \w+ token from every headline, then filter out the stop words
lowered_tokens = (
    token.lower()
    for headline in headlines
    for token in re.findall(r'\w+', headline)
)
words = [token for token in lowered_tokens if token not in stop_words]

# Tally occurrences and keep the ten most frequent words
word_counts = Counter(words)
common_words = word_counts.most_common(10)
print("Top 10 Common Words:", common_words)

# Put the (word, count) pairs into a two-column table for plotting
words_df = pd.DataFrame(common_words, columns=['Word', 'Frequency'])

# Draw the word frequencies as a bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(words_df['Word'], words_df['Frequency'], color='lightgreen')
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
ax.set_title("Top 10 Common Words in Headlines")
plt.show()

In [None]:
### Conclusion
In this notebook, we explored:
- How to scrape news headlines using Python and BeautifulSoup.
- How to use a pre-trained AI model to perform sentiment analysis on news headlines.
- How to visualize trends and patterns in sentiment and keyword frequency.

These techniques provide powerful ways for journalists to analyze news coverage and public sentiment. 
We hope you feel inspired to continue exploring Python and AI for data journalism.

### Test

In [None]:
import json
import pandas as pd

# Load JSON data in chunks
def load_json_in_chunks(file_path, chunk_size=1000):
    """Yield the articles stored in a JSON file as successive lists.

    The whole file is parsed into memory first; only the iteration over the
    parsed articles is chunked.

    Args:
        file_path: Path of a UTF-8 encoded JSON file. If the top-level value
            is an object, its values are taken as the articles; if it is an
            array, its items are taken as the articles.
        chunk_size: Maximum number of articles per yielded list. Must be
            positive.

    Yields:
        Lists of at most ``chunk_size`` articles, in file order.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # Reject bad sizes up front: range() would raise an obscure error for 0
    # and silently produce nothing for a negative step.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Accept a top-level array as well as the original object-of-articles layout.
    articles = data if isinstance(data, list) else list(data.values())

    for start in range(0, len(articles), chunk_size):
        yield articles[start:start + chunk_size]

# Define file path (replace with your file path in Colab)
file_path = 'data_2009.json'

# Load a sample chunk to inspect structure. The [] default means a dataset
# with no articles produces an empty table here instead of a bare
# StopIteration from next().
sample_chunk = next(load_json_in_chunks(file_path), [])
df_sample = pd.DataFrame(sample_chunk)
df_sample.head(3)

In [None]:
# Function to extract key fields into a DataFrame
def extract_fields(article_chunk):
    """Build a DataFrame holding the key fields of each article in a chunk.

    Args:
        article_chunk: Iterable of dict-like articles (anything with ``.get``).

    Returns:
        A DataFrame with one row per article and the columns ``title``,
        ``description``, ``text``, ``keywords``, ``author`` and ``date``, in
        that order. A field missing from an article is stored as ``None``;
        keys other than these six are ignored. An empty chunk yields an empty
        DataFrame that still has all six columns.
    """
    fields = ("title", "description", "text", "keywords", "author", "date")
    rows = [{field: article.get(field) for field in fields} for article in article_chunk]
    # Passing the columns explicitly keeps the schema stable for an empty
    # chunk, so later column lookups do not fail with a KeyError.
    return pd.DataFrame(rows, columns=list(fields))

# Flatten the sample chunk into a table of the key article fields and preview it
df_articles = extract_fields(article_chunk=sample_chunk)
df_articles.head(n=5)


In [None]:
# Initialize the sentiment analysis pipeline with a German model.
# This rebinds `sentiment_analyzer`, replacing the default pipeline created
# earlier in the notebook.
sentiment_analyzer = pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert")


def _description_sentiment(description):
    """Return the predicted sentiment label for one description, or None if it is missing."""
    if not pd.notnull(description):
        return None
    # Truncate over-long descriptions so they do not exceed the model's input
    # limit and raise inside the pipeline.
    # NOTE(review): 512 is the usual BERT sequence limit — confirm for this checkpoint.
    prediction = sentiment_analyzer(description, truncation=True, max_length=512)[0]
    return prediction['label']


# Apply sentiment analysis to each description
df_articles['sentiment'] = df_articles['description'].apply(_description_sentiment)

# View sentiment analysis results
df_articles[['title', 'description', 'sentiment']].head()