In [34]:
import re
import requests
from bs4 import BeautifulSoup
from time import sleep

In [35]:
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'https://www.google.com',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'
}

In [36]:
def get_article(card):
    """Extract article information from the raw html card."""
    # Extract the headline
    headline_tag = card.find('h4', class_='s-title')
    headline = headline_tag.text.strip() if headline_tag else 'No Headline Found'

    # Extract the source
    source_tag = card.find('span', class_='s-source')
    source = source_tag.text.strip() if source_tag else 'No Source Found'

    # Extract the time
    time_tag = card.find('span', class_='s-time')
    time = time_tag.text.strip().replace('·', '').strip() if time_tag else 'No Time Found'

    # Extract the description
    description_tag = card.find('p', class_='s-desc')
    description = description_tag.text.strip() if description_tag else 'No Description Available'

    # Extract and clean the link
    link_tag = card.find('a')
    raw_link = link_tag.get('href') if link_tag else 'No Link Available'
    # Assuming the link is already in a usable format, but if not, include additional cleaning logic here.

    return {
        'headline': headline, 
        'source': source, 
        'time': time, 
        'description': description, 
        'link': raw_link
    }


In [54]:
def get_the_news(search, x):
    template = 'https://news.search.yahoo.com/search?p={}'

    # template = 'https://finance.yahoo.com/quote/{}/news'
    url = template.format(search)
    articles = []
    links = set()
    pages_fetched = 0  # Counter for pages fetched
    
    while pages_fetched < x:  # Limit the number of pages to fetch
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            cards = soup.find_all('div', class_='NewsArticle')  # Ensure class_ is used correctly
            
            # If no cards are found, possibly an indication we're being blocked or wrong page structure
            if not cards:
                print("No articles found on the page, trying again in 5 seconds...")
                print(url)
                sleep(5)
                continue  # This continues the loop, retrying the same URL
            
            for card in cards:
                article = get_article(card)
                link = article['link']  # Corrected line to use 'link' key instead of -1 index
                if link not in links:
                    links.add(link)
                    articles.append(article)  # Now article is a dictionary
                    
            try:
                next_page = soup.find('a', class_='next')  # Update this if necessary to match the actual class
                if next_page:
                    url = next_page.get('href')
                pages_fetched += 1  # Increment the counter after successfully fetching a page
                sleep(1)  # Respectful delay between requests
            except AttributeError:
                print("No more pages to fetch.")
                break  # Break the loop if there's no 'next' page or we've reached the limit
        else:
            print(f"Failed to fetch page: Status code {response.status_code}")
            break  # Exit loop on bad status code
            
    return articles


In [69]:
# Example usage
stock_ticker = 'viasat'
news_articles = get_the_news(stock_ticker, 1)


In [70]:
def fetch_article_content(url):
    """Fetch and return the content of an article given its URL, handling JavaScript redirects."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Check if the response contains a JavaScript redirect
            if 'window.location.replace' in response.text:
                soup = BeautifulSoup(response.text, 'html.parser')
                script_tag = soup.find('script')
                if script_tag:
                    # Extract URL from the JavaScript line
                    new_url = script_tag.string.split('"')[1]  # This may need adjustment based on the exact JS syntax
                    # Make a new request to the extracted URL
                    return fetch_article_content(new_url)  # Recursive call with the new URL
            else:
                soup = BeautifulSoup(response.text, 'html.parser')
                paragraphs = soup.find_all('p')
                content = ' '.join(paragraph.text for paragraph in paragraphs)
                return content
        else:
            return "Failed to fetch article content."
    except Exception as e:
        return f"Error fetching article content: {e}"


In [71]:
# Assuming `news_articles` is a list of dictionaries, each containing article info.
for article in news_articles:
    article_content = fetch_article_content(article['link'])
    article['content'] = article_content

In [58]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import torch

# Load the FinBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('path/to/finbert')
tokenizer = BertTokenizer.from_pretrained('path/to/finbert')

def predict_sentiment(text):
    """Predict sentiment of the given text using FinBERT."""
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=1)
    
    # Assuming the model outputs three sentiments: positive, negative, neutral
    sentiment_scores = predictions.tolist()[0]  # Convert to list
    sentiments = ['positive', 'negative', 'neutral']
    # Choose the sentiment with the highest score
    sentiment = sentiments[sentiment_scores.index(max(sentiment_scores))]
    
    return sentiment

# Assuming `news_articles` is a list of dictionaries with article content
for article in news_articles:
    article['sentiment'] = predict_sentiment(article['content'])


ModuleNotFoundError: No module named 'transformers'

In [72]:
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# Assuming `news_articles` is a list of dictionaries, each containing article info and content
for article in news_articles:
    # This assumes that the article's content is under the 'content' key
    content = article.get('content', '')
    
    # Get the sentiment scores for the article content
    sentiment_scores = sia.polarity_scores(content)
    
    # VADER returns a dictionary with 4 scores: neg, neu, pos, and compound.
    # The 'compound' score is a normalized, weighted composite score.
    
    # Store the sentiment result in the dictionary
    article['sentiment'] = sentiment_scores['compound']


In [73]:
import pandas as pd

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(news_articles)

# Display the table
df

Unnamed: 0,headline,source,time,description,link,content,sentiment
0,16 Largest Satellite Companies In The World,Insider Monkey via Yahoo Finance,5 hours ago,"In this article, we will look into the 16 larg...",https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,"In this article, we will look into the 16 larg...",0.9971
1,American Airlines Fliers Can Soon Redeem Miles...,Travel + Leisure,2 days ago,"Most of American’s narrowbody aircraft, such a...",https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,"\nThis summer, the airline will amp up its Wi-...",0.986
2,13 High Growth Value Stocks to Invest in Accor...,Insider Monkey via Yahoo Finance,3 days ago,"In this piece, we will take a look at the 13 h...",https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,"In this piece, we will take a look at the 13 h...",0.9998
3,Major US airline offers new freebie to on dome...,The US Sun,5 days ago,AN American airline has added a freebie to its...,https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,AN American airline has added a freebie to its...,0.9963
4,"Best Internet Providers in Lubbock, TX",USA Today,2 days ago,Lubbock homes can choose a plan to fit their n...,https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,Home Internet Published 10:11 am ET Mar 29 Edi...,0.9932
5,American Airlines enhances inflight connectivi...,Breaking Travel News,7 days ago,AAdvantage® members will soon be able to use m...,https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,Failed to fetch article content.,-0.5106
6,Officials plan for new age of cyber threats to...,Politico via Yahoo News,6 days ago,Moscow has already proven its satellite disrup...,https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,Russia’s push to put an anti-satellite nuclear...,-0.9918
7,Satellite firms cautiously optimistic as DoD b...,SpaceNews,3 days ago,The Pentagon’s proposed budget for fiscal year...,https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,\n\t\t\t\t\tCovering the business and politics...,0.9986
8,Three airlines are dropping Bay Area routes to...,San Francisco Chronicle,2 days ago,JetBlue axes SFO to LAX service; Delta and Ame...,https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,Failed to fetch article content.,-0.5106
9,"Best Internet Providers in Minneapolis, MN",USA Today,2 days ago,Get home Wi-Fi in Minneapolis for as low as $9...,https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbE...,Home Internet Published 8:44 am ET Mar 29 Edit...,0.9995


In [79]:
df['link'][6]

'https://r.search.yahoo.com/_ylt=AwrijSg6DwpmbEMfuk7QtDMD;_ylu=Y29sbwNiZjEEcG9zAzcEdnRpZAMEc2VjA3Ny/RV=2/RE=1711964090/RO=10/RU=https%3a%2f%2fwww.yahoo.com%2fnews%2fofficials-plan-age-cyber-threats-193000136.html%3ffr%3dsycsrp_catchall/RK=2/RS=rpKnynZfNcW71QwbH5YU.x4jLIw-'

In [78]:
df['content'][6]

