In [5]:
import yfinance as yf
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
import time
import random
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tabulate import tabulate
import matplotlib.pyplot as plt
from scipy import stats

# Download necessary NLTK data
# (runs at import time; quiet=True is passed straight to the NLTK downloader)
nltk.download('vader_lexicon', quiet=True)

# Initialize the NLTK sentiment analyzer
# Shared module-level instance; extract_article_info() calls
# sia.polarity_scores() on each matching article.
sia = SentimentIntensityAnalyzer()

class TooManyRedirectsRetry(Retry):
    """Retry policy mounted on the sessions built by create_session().

    ``Retry.increment`` must hand back a new ``Retry`` instance or raise
    ``MaxRetryError``; urllib3 immediately calls methods such as ``sleep()``
    on whatever is returned. Returning a bare integer for non-redirect
    failures therefore crashes with ``AttributeError`` on the first
    retryable error, so every case is delegated to the base implementation.
    """

    def increment(self, method=None, url=None, response=None, error=None, _pool=None, _stacktrace=None):
        """Record one failed attempt and return the follow-up ``Retry`` state.

        Args:
            method: HTTP method of the failed request.
            url: URL of the failed request.
            response: Response object when the server answered, else None.
            error: Exception raised by the attempt, else None.
            _pool: Connection pool that issued the request.
            _stacktrace: Traceback associated with ``error``.

        Returns:
            Whatever the base ``Retry.increment`` returns for these arguments.

        Raises:
            Whatever the base ``Retry.increment`` raises (e.g. when the retry
            budget is exhausted).
        """
        return super().increment(
            method=method,
            url=url,
            response=response,
            error=error,
            _pool=_pool,
            _stacktrace=_stacktrace,
        )

def create_session():
    """Return a requests session whose HTTP and HTTPS traffic share one retrying adapter.

    The adapter is configured with a TooManyRedirectsRetry policy: at most
    5 retries, a 0.1 backoff factor, and forced retries on 500/502/503/504.
    """
    retry_policy = TooManyRedirectsRetry(
        total=5,
        backoff_factor=0.1,
        status_forcelist=[500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        http_session.mount(scheme, retrying_adapter)
    return http_session

def is_indian_stock(symbol):
    """Return True when *symbol* already ends with an Indian exchange suffix.

    The check is a case-insensitive *suffix* match against ``.NS``, ``.BO``
    and ``.BSE``. A plain substring test would also accept symbols that
    merely contain one of those fragments (for example ``X.NSDQ``), which
    would stop callers from appending the exchange suffix they need.

    Args:
        symbol: Ticker symbol, with or without an exchange suffix.

    Returns:
        bool: True if the symbol ends with one of the known suffixes.
    """
    indian_suffixes = ('.NS', '.BO', '.BSE')
    # str.endswith accepts a tuple, so one call covers every suffix.
    return symbol.upper().endswith(indian_suffixes)

def fetch_financial_data(symbol):
    """Fetch headline financial figures for *symbol* via yfinance.

    ``.NS`` is appended when the symbol carries no Indian exchange suffix.
    Statement values are read from the first column of the ``financials``
    and ``balance_sheet`` frames (presumably the most recent reporting
    period -- confirm against the yfinance documentation).

    Missing, ``None`` or NaN values are reported as 0 instead of raising,
    since ``int(float('nan'))`` raises ``ValueError``.

    Args:
        symbol: Ticker symbol, with or without an exchange suffix.

    Returns:
        dict with the keys ``revenue``, ``net_income``, ``total_assets``,
        ``total_liabilities``, ``market_price`` and ``outstanding_shares``.
    """
    if not is_indian_stock(symbol):
        symbol += '.NS'
    stock = yf.Ticker(symbol)

    def first_column(frame):
        # Read each statement frame once and fall back to an empty, explicitly
        # typed Series (a bare pd.Series() warns on older pandas versions).
        if frame is None or frame.empty:
            return pd.Series(dtype='float64')
        return frame.iloc[:, 0]

    def as_int(series, label):
        # Statement lines can be absent or NaN; treat both as "no data" (0).
        value = series.get(label, 0)
        if value is None or pd.isna(value):
            return 0
        return int(value)

    # Fetch financial data
    info = stock.info or {}
    financials = first_column(stock.financials)
    balance_sheet = first_column(stock.balance_sheet)

    data = {
        'revenue': as_int(financials, 'Total Revenue'),
        'net_income': as_int(financials, 'Net Income'),
        'total_assets': as_int(balance_sheet, 'Total Assets'),
        'total_liabilities': as_int(balance_sheet, 'Total Liabilities Net Minority Interest'),
        # `or 0` also covers keys that are present but hold None.
        'market_price': info.get('currentPrice') or 0,
        'outstanding_shares': info.get('sharesOutstanding') or 0,
    }
    return data

def perform_analytics(data):
    """Derive basic ratios from the dict produced by fetch_financial_data().

    A one-row DataFrame is built from *data* and the following columns are
    added only when their inputs allow it:

    * ``profit_margin`` (percent) -- when revenue is non-zero.
    * ``equity`` -- when both total assets and total liabilities are non-zero.
    * ``debt_to_equity_ratio`` -- additionally requires non-zero equity, so
      the column never holds ``inf`` from a division by zero.
    * ``eps`` -- when outstanding shares is non-zero.
    * ``p_e_ratio`` -- when ``eps`` exists and is non-zero. It is computed
      from the already-rounded EPS value.

    Args:
        data: Mapping with the keys ``revenue``, ``net_income``,
            ``total_assets``, ``total_liabilities``, ``market_price`` and
            ``outstanding_shares``.

    Returns:
        pandas.DataFrame: one row holding the inputs plus the derived columns.
    """
    df = pd.DataFrame([data])
    if df['revenue'].iloc[0] != 0:
        df['profit_margin'] = (df['net_income'] / df['revenue'] * 100).round(2)
    if df['total_assets'].iloc[0] != 0 and df['total_liabilities'].iloc[0] != 0:
        df['equity'] = df['total_assets'] - df['total_liabilities']
        # Assets equal to liabilities means zero equity; skip the ratio
        # rather than emit an infinite value.
        if df['equity'].iloc[0] != 0:
            df['debt_to_equity_ratio'] = (df['total_liabilities'] / df['equity']).round(2)
    if df['outstanding_shares'].iloc[0] != 0:
        df['eps'] = (df['net_income'] / df['outstanding_shares']).round(2)
    if 'eps' in df.columns and df['eps'].iloc[0] != 0:
        df['p_e_ratio'] = (df['market_price'] / df['eps']).round(2)
    return df

def scrape_moneycontrol(ticker, max_pages=None):
    """Collect Moneycontrol market-news articles whose headline mentions *ticker*.

    Listing pages are walked in order starting at page 1. For every
    ``li.clearfix`` entry the first link is followed and handed to
    extract_article_info(), which returns None for articles that do not
    match. Scraping stops when a page has no entries, when a request or
    parsing error occurs, or when ``max_pages`` pages have been processed.

    Args:
        ticker: Ticker symbol; anything after the first ``.`` is dropped
            before matching against headlines.
        max_pages: Optional upper bound on the number of listing pages to
            visit. ``None`` (the default) keeps the unbounded behaviour.

    Returns:
        list[dict]: article dicts as produced by extract_article_info().
    """
    search_ticker = ticker.split('.')[0]
    base_url = "https://www.moneycontrol.com/news/business/markets/"
    page = 1
    all_articles = []

    # The context manager closes the session's pooled connections even when
    # the loop exits through an error path.
    with create_session() as session:
        while max_pages is None or page <= max_pages:
            try:
                url = f"{base_url}page-{page}/"
                response = session.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                articles = soup.find_all('li', class_='clearfix')
                if not articles:
                    print(f"No more articles found on page {page}. Stopping.")
                    break
                for article in articles:
                    article_link = article.find('a')
                    if article_link and 'href' in article_link.attrs:
                        article_url = article_link['href']
                        article_info = extract_article_info(session, article_url, search_ticker)
                        if article_info:
                            all_articles.append(article_info)
                            print(f"Found matching article: {article_info['headline']}")
                print(f"Scraped page {page}")
                page += 1
                time.sleep(random.uniform(1, 3))  # Add a random delay between requests
            except requests.RequestException as e:
                print(f"Error on page {page}: {str(e)}")
                break
            except Exception as e:
                # Deliberate best-effort boundary: report and keep what was collected.
                print(f"Unexpected error on page {page}: {str(e)}")
                break

    print(f"Total matching articles found: {len(all_articles)}")
    return all_articles

def extract_article_info(session, article_url, ticker):
    """Download one article page and return its details if the headline mentions *ticker*.

    Args:
        session: requests session used to fetch the page.
        article_url: URL of the article.
        ticker: Text that must appear (case-insensitively) in the headline.

    Returns:
        dict with ``headline``, ``publication_date`` (datetime), ``author``,
        ``content``, ``url`` and ``sentiment_score`` (VADER compound score of
        headline plus content), or None when the headline does not match or
        any error occurs while fetching/parsing.
    """
    # Date layouts tried in order; the current time is used if none fits.
    date_formats = (
        "%B %d, %Y / %I:%M %p IST",
        "%B %d, %Y %I:%M %p IST",
    )

    def text_or_na(node):
        # Stripped text of a found element, or the "N/A" placeholder.
        return node.text.strip() if node else "N/A"

    try:
        page = session.get(article_url, timeout=10, allow_redirects=True)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')

        headline = text_or_na(soup.find('h1', class_='article_title'))
        # Articles whose headline does not mention the ticker are skipped.
        if ticker.lower() not in headline.lower():
            return None

        raw_date = text_or_na(soup.find('div', class_='article_schedule'))
        for date_format in date_formats:
            try:
                pub_date = datetime.strptime(raw_date, date_format)
                break
            except ValueError:
                continue
        else:
            pub_date = datetime.now()  # Use current date if parsing fails

        author = text_or_na(soup.find('div', class_='article_author'))

        body = soup.find('div', class_='content_wrapper')
        if body:
            content = ' '.join(paragraph.text for paragraph in body.find_all('p'))
        else:
            content = "N/A"

        # Sentiment is scored over headline and body together.
        compound = sia.polarity_scores(headline + ' ' + content)['compound']

        return {
            'headline': headline,
            'publication_date': pub_date,
            'author': author,
            'content': content,
            'url': article_url,
            'sentiment_score': compound,
        }
    except requests.TooManyRedirects:
        print(f"Too many redirects for URL: {article_url}")
        return None
    except requests.RequestException as e:
        print(f"Error extracting info from {article_url}: {str(e)}")
        return None
    except Exception as e:
        print(f"Unexpected error extracting info from {article_url}: {str(e)}")
        return None

def get_date_input(prompt):
    """Prompt on stdin until the user enters a valid ``YYYY-MM-DD`` date.

    Args:
        prompt: Text shown before the ``(YYYY-MM-DD): `` format hint.

    Returns:
        datetime: the parsed date (time components are zero).
    """
    full_prompt = prompt + " (YYYY-MM-DD): "
    parsed = None
    # Keep asking until strptime accepts the entry.
    while parsed is None:
        try:
            parsed = datetime.strptime(input(full_prompt), "%Y-%m-%d")
        except ValueError:
            print("Invalid date format. Please use YYYY-MM-DD.")
    return parsed

def fetch_stock_data_for_dates(symbol, start_date, end_date):
    """Fetch price history for *symbol* between two dates via yfinance.

    Args:
        symbol: Ticker symbol passed to ``yf.Ticker``.
        start_date: Passed as ``start`` to ``Ticker.history``.
        end_date: Passed as ``end`` to ``Ticker.history``.

    Returns:
        dict: the history frame converted with ``to_dict('index')`` (one
        entry per index value), or ``{'error': message}`` when the frame is
        empty or any exception is raised.
    """
    try:
        history = yf.Ticker(symbol).history(start=start_date, end=end_date)
        if not history.empty:
            return history.to_dict('index')
        return {'error': 'No data available for the specified dates'}
    except Exception as exc:
        # Errors are reported through the return value, not raised.
        return {'error': f'An error occurred: {str(exc)}'}

def calculate_correlations(stock_data):
    """Return the pairwise correlation matrix of the OHLCV columns.

    Args:
        stock_data: Mapping of row key -> mapping of column name -> value;
            it must contain ``Open``, ``High``, ``Low``, ``Close`` and
            ``Volume`` for the column selection to succeed.

    Returns:
        pandas.DataFrame: result of ``DataFrame.corr()`` on those five columns.
    """
    price_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    # Outer keys become columns on construction; transpose so they are rows.
    per_day = pd.DataFrame(stock_data).transpose()
    return per_day[price_columns].corr()

def create_regression_graph(articles, stock_data, company_symbol):
    """Plot article sentiment against closing price and save it as a PNG.

    Each article is paired with the closing price of its publication date.
    When fewer than 10 pairs are found, 50 synthetic points are appended
    (random sentiment with a loosely correlated price) so that a regression
    can still be drawn.

    NOTE(review): with synthetic points mixed in, the plot and its
    R-squared value do not describe real data -- confirm this is intended.

    Args:
        articles: Iterable of dicts with ``publication_date`` (datetime) and
            ``sentiment_score``.
        stock_data: Mapping of date-like key -> mapping containing ``Close``,
            in the shape returned by fetch_stock_data_for_dates().
        company_symbol: Used in the plot title and the output file name.

    Returns:
        None. Writes ``{company_symbol}_regression_graph.png``.
    """
    # Article dates are plain ``datetime.date`` objects, while the price keys
    # come from a DataFrame index (timestamp objects, possibly tz-aware).
    # Those never compare equal, so index the closes by calendar date first.
    close_by_date = {}
    for key, values in stock_data.items():
        day = key.date() if isinstance(key, datetime) else key
        close_by_date[day] = values['Close']

    # Prepare data for regression
    sentiment_scores = []
    stock_prices = []
    for article in articles:
        article_date = article['publication_date'].date()
        if article_date in close_by_date:
            sentiment_scores.append(article['sentiment_score'])
            stock_prices.append(close_by_date[article_date])

    # If we don't have enough data, generate synthetic data
    if len(sentiment_scores) < 10:
        print("Generating synthetic data for regression graph...")
        # Generate random sentiment scores
        synthetic_sentiment = np.random.uniform(-1, 1, 50)

        # Generate corresponding stock prices with some correlation to sentiment and added noise
        closes = list(close_by_date.values())
        base_price = np.mean(closes)
        price_std = np.std(closes)
        synthetic_prices = base_price + synthetic_sentiment * price_std * 0.3 + np.random.normal(0, price_std * 0.2, 50)

        sentiment_scores.extend(synthetic_sentiment)
        stock_prices.extend(synthetic_prices)

    # Perform linear regression
    slope, intercept, r_value, p_value, std_err = stats.linregress(sentiment_scores, stock_prices)

    # Create the plot
    plt.figure(figsize=(12, 6))
    try:
        plt.scatter(sentiment_scores, stock_prices, alpha=0.5)

        # Calculate and plot regression line
        x_range = np.linspace(min(sentiment_scores), max(sentiment_scores), 100)
        y_pred = slope * x_range + intercept
        plt.plot(x_range, y_pred, color='r', label='Regression line')

        plt.xlabel('News Sentiment Score')
        plt.ylabel('Stock Price')
        plt.title(f'News Sentiment vs Stock Price for {company_symbol}')
        plt.legend()

        # Add annotation with R-squared value
        plt.annotate(f'R-squared = {r_value**2:.2f}', xy=(0.05, 0.95), xycoords='axes fraction')

        # Save the plot
        plt.savefig(f'{company_symbol}_regression_graph.png')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close()

    print(f"Regression graph saved as {company_symbol}_regression_graph.png")

def save_to_csv(financial_data, analytics_df, stock_data, articles, company_symbol, correlations):
    """Write all analysis results to one timestamped, sectioned CSV file.

    The file is named ``{company_symbol}_analysis_{YYYYmmdd_HHMMSS}.csv`` in
    the current directory and contains, in order, the sections
    "Financial Data", "Analytics", "Stock Data", "Correlations" and
    "Recent News Articles", separated by blank rows.

    Args:
        financial_data: Mapping of metric name -> value.
        analytics_df: DataFrame whose header and rows are written verbatim.
        stock_data: Mapping of date-like key -> mapping with ``Open``,
            ``High``, ``Low``, ``Close`` and ``Volume``. The
            ``{'error': ...}`` dict returned by fetch_stock_data_for_dates()
            is accepted too; the section then has a header but no rows.
        articles: Iterable of article dicts from extract_article_info().
        company_symbol: Used as the file-name prefix.
        correlations: DataFrame written as a labelled matrix.

    Returns:
        None. Prints the name of the written file.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{company_symbol}_analysis_{timestamp}.csv"

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)

        writer.writerow(["Financial Data"])
        for key, value in financial_data.items():
            writer.writerow([key, value])
        writer.writerow([])

        writer.writerow(["Analytics"])
        writer.writerow(analytics_df.columns.tolist())
        writer.writerows(analytics_df.values.tolist())
        writer.writerow([])

        writer.writerow(["Stock Data"])
        writer.writerow(["Date", "Open", "High", "Low", "Close", "Volume"])
        # An error result has a string key and no price fields; writing it as
        # a price row would raise, so only real price data is emitted.
        if 'error' not in stock_data:
            for date, values in stock_data.items():
                writer.writerow([date.strftime("%Y-%m-%d"), values['Open'], values['High'], values['Low'], values['Close'], values['Volume']])
        writer.writerow([])

        writer.writerow(["Correlations"])
        writer.writerow([""] + list(correlations.columns))
        for index, row in correlations.iterrows():
            writer.writerow([index] + list(row))
        writer.writerow([])

        writer.writerow(["Recent News Articles"])
        writer.writerow(["Headline", "Date", "Author", "Sentiment Score", "URL"])
        for article in articles:
            writer.writerow([article['headline'], article['publication_date'].strftime("%Y-%m-%d %H:%M:%S"),
                             article['author'], article['sentiment_score'], article['url']])

    print(f"Data saved to {filename}")

def main():
    """Run the interactive analysis: financials, price history, news, graph, CSV.

    Steps, in order: read a ticker from stdin, fetch and print financial
    data and derived analytics, read a date range, fetch price history,
    print correlations, scrape matching news articles, draw the regression
    graph and write everything to a CSV file.

    When no price history is available the error is printed and the run
    continues without correlations or a regression graph; the CSV is still
    written with empty stock and correlation sections. Any other exception
    is caught at this top-level boundary and reported on stdout.
    """
    company_symbol = input("Enter the ticker symbol of the company (e.g., TCS, RELIANCE, SBIN): ").strip().upper()

    try:
        financial_data = fetch_financial_data(company_symbol)
        analytics_df = perform_analytics(financial_data)

        print(f"\nExtracted Financial Data for {company_symbol}:")
        for key, value in financial_data.items():
            print(f"{key}: {value}")

        print("\nAnalytics:")
        print(analytics_df.to_string(index=False))

        start_date = get_date_input("Enter the start date")
        end_date = get_date_input("Enter the end date")

        while start_date > end_date:
            print("Error: Start date must be before or equal to the end date. Please try again.")
            start_date = get_date_input("Enter the start date")
            end_date = get_date_input("Enter the end date")

        # One day is added so the day the user entered falls inside the
        # requested window.
        end_date = end_date + timedelta(days=1)

        print(f"\nFetching stock data from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

        symbol_with_exchange = company_symbol if is_indian_stock(company_symbol) else f"{company_symbol}.NS"
        stock_data = fetch_stock_data_for_dates(symbol_with_exchange, start_date, end_date)

        has_stock_data = 'error' not in stock_data
        if has_stock_data:
            print("Stock data retrieved successfully:")
            for date, values in stock_data.items():
                print(f"{date.strftime('%Y-%m-%d')}: Close price: {values['Close']:.2f}")

            correlations = calculate_correlations(stock_data)
            print("\nCorrelations:")
            print(correlations)
        else:
            print(f"Error: {stock_data['error']}")
            # The error dict is not price data: feeding it to the correlation
            # or CSV code would raise and abort the whole run. Continue with
            # empty placeholders instead.
            stock_data = {}
            correlations = pd.DataFrame()

        print("\nScraping news articles...")
        articles = scrape_moneycontrol(company_symbol)

        if articles:
            print("\nRecent News Articles:")
            table_data = [
                [article['headline'], article['publication_date'].strftime("%Y-%m-%d %H:%M:%S"),
                 article['author'], article['sentiment_score'], article['url']]
                for article in articles
            ]
            headers = ["Headline", "Date", "Author", "Sentiment Score", "URL"]
            print(tabulate(table_data, headers=headers, tablefmt="grid"))
        else:
            print("No relevant news articles found.")

        if has_stock_data:
            create_regression_graph(articles, stock_data, company_symbol)

        save_to_csv(financial_data, analytics_df, stock_data, articles, company_symbol, correlations)

        print("\nAnalysis complete. Check the generated CSV file and regression graph.")

    except Exception as e:
        # Top-level boundary of an interactive script: report and exit quietly.
        print(f"An error occurred: {str(e)}")

# Start the interactive analysis only when this file is executed directly.
if __name__ == "__main__":
    main()

Enter the ticker symbol of the company (e.g., TCS, RELIANCE, SBIN):  ADANIENT



Extracted Financial Data for ADANIENT:
revenue: 964209800000
net_income: 32404800000
total_assets: 1607318500000
total_liabilities: 1165455600000
market_price: 3180.6
outstanding_shares: 1140000000

Analytics:
     revenue  net_income  total_assets  total_liabilities  market_price  outstanding_shares  profit_margin       equity  debt_to_equity_ratio   eps  p_e_ratio
964209800000 32404800000 1607318500000      1165455600000        3180.6          1140000000           3.36 441862900000                  2.64 28.43     111.87


Enter the start date (YYYY-MM-DD):  2024-02-01
Enter the end date (YYYY-MM-DD):  2024-03-01



Fetching stock data from 2024-02-01 to 2024-03-02
Stock data retrieved successfully:
2024-02-01: Close price: 3152.23
2024-02-02: Close price: 3156.18
2024-02-05: Close price: 3172.17
2024-02-06: Close price: 3202.46
2024-02-07: Close price: 3228.55
2024-02-08: Close price: 3167.32
2024-02-09: Close price: 3213.90
2024-02-12: Close price: 3168.47
2024-02-13: Close price: 3177.17
2024-02-14: Close price: 3205.91
2024-02-15: Close price: 3192.51
2024-02-16: Close price: 3222.30
2024-02-19: Close price: 3257.49
2024-02-20: Close price: 3227.30
2024-02-21: Close price: 3221.90
2024-02-22: Close price: 3261.73
2024-02-23: Close price: 3271.98
2024-02-26: Close price: 3326.41
2024-02-27: Close price: 3300.97
2024-02-28: Close price: 3217.70
2024-02-29: Close price: 3284.08
2024-03-01: Close price: 3317.41

Correlations:
            Open      High       Low     Close    Volume
Open    1.000000  0.839183  0.831306  0.697752 -0.550456
High    0.839183  1.000000  0.910020  0.872695 -0.341440
Lo