In [67]:
import csv
import os
import pickle
import sqlite3
import time
from collections import deque
from contextlib import closing
from functools import lru_cache
from io import BytesIO
from urllib.parse import urldefrag, urljoin, urlparse

import numpy as np
import pytesseract
import requests
import scrapy
import torch
import xgboost as xgb
from PIL import Image
from scrapy.http import HtmlResponse
from scrapy.selector import Selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

In [56]:
def read_seed_urls_from_csv(csv_file):
    """Read seed URLs from the first column of a CSV file.

    Args:
        csv_file: Path of the CSV file; the URL is expected in the first
            column of every row.

    Returns:
        list[str]: First-column values with surrounding whitespace removed,
        in file order. Empty rows and rows whose first cell is blank are
        skipped, so the crawler is never handed an empty URL.
    """
    # newline='' hands newline handling to the csv module, which its
    # documentation requires for correct parsing of quoted fields.
    with open(csv_file, 'r', newline='') as file:
        first_cells = (row[0].strip() for row in csv.reader(file) if row)
        return [url for url in first_cells if url]

In [51]:
def extract_information(web_page_content, base_url=None, request_timeout=10):
    """Extract article metadata, body text, figure OCR text and table cells from HTML.

    Args:
        web_page_content: HTML source of the page, as a string.
        base_url: Optional URL of the page. When given, figure image URLs are
            resolved against it so relative ``src`` values can be downloaded.
        request_timeout: Seconds to wait for each figure image download.

    Returns:
        dict with the keys:
            'title', 'publication_date', 'author': str or None when the
                element is missing from the page.
            'content': str, the text nodes of all <p> elements inside
                div.article-content, stripped and joined with single spaces
                ('' when there are none). It is a single string because the
                consumers in this notebook lower-case it, tokenize it and
                bind it to a TEXT column.
            'figure_texts': list[str], one OCR result per figure image that
                could be downloaded and read.
            'table_data': list[list[str]], the <td> text nodes of each row of
                table.data-table.
    """
    selector = Selector(text=web_page_content)

    title = selector.xpath("//title/text()").get()
    publication_date = selector.xpath("//meta[@name='publication_date']/@content").get()
    author = selector.xpath("//meta[@name='author']/@content").get()

    fragments = selector.xpath("//div[@class='article-content']//p//text()").getall()
    content = ' '.join(fragment.strip() for fragment in fragments if fragment.strip())

    # OCR is best effort: a figure that cannot be fetched or decoded is
    # reported and skipped so one bad image does not discard the whole page.
    figure_texts = []
    for figure in selector.xpath("//div[@class='figure']"):
        image_url = figure.xpath(".//@src").get()
        if not image_url:
            continue
        if base_url:
            image_url = urljoin(base_url, image_url)
        try:
            # The timeout keeps a stalled image host from blocking the caller forever.
            image_response = requests.get(image_url, timeout=request_timeout)
            if image_response.status_code == 200:
                with Image.open(BytesIO(image_response.content)) as image:
                    figure_texts.append(pytesseract.image_to_string(image))
        except Exception as e:
            print(f"Error processing image: {image_url}")
            print(e)

    table_data = [
        row.xpath(".//td//text()").getall()
        for row in selector.xpath("//table[@class='data-table']//tr")
    ]

    return {
        'title': title,
        'publication_date': publication_date,
        'author': author,
        'content': content,
        'figure_texts': figure_texts,
        'table_data': table_data,
    }


In [52]:
def relevance_estimation(extracted_data, keywords=None):
    """Score a page as relevant (1) or not (0) with a keyword heuristic.

    The page is relevant when its content contains at least one keyword as a
    case-insensitive substring.

    This function previously also encoded the text with BERT, fitted a new
    XGBoost classifier on that single sample (labelled with the keyword
    score) and predicted the same sample again. A model trained on one sample
    can only reproduce its label, so the combined ``max`` was always the
    keyword score, while the fit could raise when the only label was 1. That
    step is therefore omitted; the returned score is unchanged for every
    input on which the old code completed.

    Args:
        extracted_data: Mapping with a 'content' entry holding the page text,
            either as one string or as an iterable of string fragments.
            A missing or None entry is treated as empty text.
        keywords: Optional iterable of keywords overriding the default
            finance vocabulary.

    Returns:
        int: 1 if any keyword occurs in the content, otherwise 0. Always a
        plain Python int so it can be bound directly as a SQLite parameter.
    """
    if keywords is None:
        keywords = ('stock', 'market', 'finance', 'investment', 'earnings',
                    'dividend', 'shareholder', 'market trend')

    content = extracted_data.get('content') or ''
    if not isinstance(content, str):
        # Tolerate the raw list of text fragments that selector.getall() yields.
        content = ' '.join(part for part in content if isinstance(part, str))

    haystack = content.lower()
    return int(any(keyword.lower() in haystack for keyword in keywords))

In [53]:
@lru_cache(maxsize=1)
def _load_bert(model_name):
    """Load the tokenizer and model for `model_name`, cached so repeated calls reuse them."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    return tokenizer, model


def extract_bert_embeddings(texts, model_name='bert-base-uncased'):
    """Encode text into mean-pooled BERT embeddings.

    Args:
        texts: A single string, or a sequence of strings to encode as a batch.
        model_name: Name of the pretrained BERT checkpoint to use.

    Returns:
        numpy.ndarray: For a single string, a 1-D vector of length
        hidden_size. For a sequence of n strings, an array of shape
        (n, hidden_size); this includes n == 1 (previously squeezed to 1-D,
        which made the result unusable as a feature matrix) and n == 0.
    """
    tokenizer, model = _load_bert(model_name)

    single = isinstance(texts, str)
    batch = [texts] if single else list(texts)
    if not batch:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    # Average only over real tokens. With padding=True shorter texts are
    # padded to the longest one in the batch; a plain mean over all positions
    # would mix padding states in and make an embedding depend on which other
    # texts happened to share its batch.
    hidden = outputs.last_hidden_state
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = ((hidden * mask).sum(dim=1) / token_counts).numpy()

    return embeddings[0] if single else embeddings

def document_similarity(embeddings1, embeddings2):
    """Return the cosine similarity between two document embeddings.

    Args:
        embeddings1: Embedding as a 1-D vector, or a 2-D array whose first
            row is the embedding to compare.
        embeddings2: Same as `embeddings1`, for the second document.

    Returns:
        float: Cosine similarity of the two vectors; 0.0 when either vector
        has zero length.

    Both 1-D and 2-D inputs are accepted because extract_bert_embeddings
    returns a 1-D vector for a single text, which a matrix-only similarity
    routine would reject. For 2-D inputs the first rows are compared, which
    is the same element the previous implementation returned.
    """
    first = np.atleast_2d(np.asarray(embeddings1, dtype=float))[0]
    second = np.atleast_2d(np.asarray(embeddings2, dtype=float))[0]

    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    if norm_product == 0.0:
        return 0.0
    return float(np.dot(first, second) / norm_product)

In [35]:
def store_data(extracted_data, relevance_score, db_path='financial_news.db'):
    """Append one crawled page and its relevance score to the SQLite store.

    Args:
        extracted_data: Mapping with the keys 'title', 'publication_date',
            'author', 'content', 'figure_texts' (list of str) and
            'table_data' (list of rows, each a list of str). 'content' may be
            a string, None, or a list of text fragments, which is joined with
            single spaces.
        relevance_score: Score to store. numpy scalars are converted to the
            matching Python number first; sqlite3 would otherwise store them
            as BLOBs that never match ``relevance_score=1``.
        db_path: Path of the SQLite database file.

    The `financial_news` table is created on first use. The insert is
    committed on success and rolled back on error, and the connection is
    closed in both cases.
    """
    content = extracted_data['content']
    if content is not None and not isinstance(content, str):
        content = ' '.join(
            part.strip() for part in content if isinstance(part, str) and part.strip()
        )

    if hasattr(relevance_score, 'item'):
        relevance_score = relevance_score.item()

    record = (
        extracted_data['title'],
        extracted_data['publication_date'],
        extracted_data['author'],
        content,
        '\n'.join(extracted_data['figure_texts']),
        '\n'.join(','.join(row) for row in extracted_data['table_data']),
        relevance_score,
    )

    # closing() guarantees the connection is released; the inner `with conn`
    # wraps the statements in a transaction (commit / rollback).
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS financial_news (
                    title TEXT,
                    publication_date TEXT,
                    author TEXT,
                    content TEXT,
                    figure_texts TEXT,
                    table_data TEXT,
                    relevance_score INTEGER
                )
            ''')
            conn.execute('''
                INSERT INTO financial_news (title, publication_date, author, content, figure_texts, table_data, relevance_score)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', record)

In [36]:
def extract_additional_urls(web_page_content, base_url=''):
    """Collect the http(s) links found in anchor tags of a page.

    Args:
        web_page_content: HTML source of the page.
        base_url: Optional URL of the page. When given, relative links are
            resolved against it; when empty (the default), relative links are
            discarded because they cannot be turned into absolute URLs.

    Returns:
        list[str]: Absolute http/https URLs in document order, with
        surrounding whitespace and ``#fragment`` parts removed and duplicates
        dropped.
    """
    response = HtmlResponse(url='', body=web_page_content, encoding='utf-8')

    seen = set()
    additional_urls = []
    for href in response.css('a::attr(href)').getall():
        candidate = href.strip()
        try:
            if base_url:
                candidate = urljoin(base_url, candidate)
            # A fragment only addresses a position inside the same document.
            candidate, _fragment = urldefrag(candidate)
            scheme = urlparse(candidate).scheme
        except ValueError:
            # urllib raises ValueError for malformed URLs (e.g. a bad IPv6 host).
            continue
        if scheme not in ('http', 'https') or candidate in seen:
            continue
        seen.add(candidate)
        additional_urls.append(candidate)

    return additional_urls

In [58]:
def _load_relevant_contents(db_path='financial_news.db'):
    """Return the content of every stored document with relevance_score 1 ([] before the table exists)."""
    with closing(sqlite3.connect(db_path)) as conn:
        try:
            rows = conn.execute(
                'SELECT content FROM financial_news WHERE relevance_score=1'
            ).fetchall()
        except sqlite3.OperationalError as exc:
            # On the very first crawl nothing has been stored yet, so the
            # table is missing; any other database error is a real failure.
            if 'no such table' not in str(exc):
                raise
            return []
    return [row[0] for row in rows if row[0]]


def autonomous_crawler(seed_urls, max_pages=None, request_timeout=10, similarity_threshold=0.8):
    """Crawl breadth-first from the seed URLs, scoring and storing every page fetched.

    For each page that answers with HTTP 200 the crawler extracts its data,
    estimates relevance, resets the relevance to 0 when the page is a
    near-duplicate of an already stored relevant document, stores the result
    and enqueues the links found on the page.

    Args:
        seed_urls: Iterable of URLs to start from.
        max_pages: Optional upper bound on the number of URLs requested;
            None (the default) crawls until the queue is empty.
        request_timeout: Seconds to wait for each page download.
        similarity_threshold: Cosine similarity above which a page counts as
            a duplicate of a stored relevant document.

    Notes:
        * A URL is marked visited before it is requested, so a URL that fails
          is not requested again each time another page links to it.
        * Embeddings of stored relevant documents are computed once per crawl
          and extended as new relevant pages are stored, instead of
          re-encoding the whole database for every page.
        * The duplicate check only runs for pages scored relevant; for any
          other page it could only replace a 0 with a 0.
        * Errors for one URL are reported and the crawl continues.
    """
    visited_urls = set()
    queue = deque(seed_urls)
    pages_requested = 0
    relevant_embeddings = None  # filled lazily on first use

    while queue and (max_pages is None or pages_requested < max_pages):
        current_url = queue.popleft()
        if current_url in visited_urls:
            continue
        visited_urls.add(current_url)
        pages_requested += 1

        try:
            response = requests.get(current_url, timeout=request_timeout)
            if response.status_code != 200:
                continue

            extracted_data = extract_information(response.text)

            # Downstream steps need the article text as one string; accept a
            # list of fragments as well and normalise it here.
            content = extracted_data.get('content') or ''
            if not isinstance(content, str):
                content = ' '.join(
                    part.strip() for part in content if isinstance(part, str) and part.strip()
                )
            extracted_data['content'] = content

            relevance_score = relevance_estimation(extracted_data)

            content_embedding = None
            if relevance_score and content:
                if relevant_embeddings is None:
                    relevant_embeddings = [
                        np.atleast_2d(extract_bert_embeddings(stored_content))
                        for stored_content in _load_relevant_contents()
                    ]
                content_embedding = np.atleast_2d(extract_bert_embeddings(content))
                # any() over an empty list is False, so an empty database
                # simply means "no duplicate" instead of an error.
                if any(
                    document_similarity(content_embedding, stored_embedding) > similarity_threshold
                    for stored_embedding in relevant_embeddings
                ):
                    relevance_score = 0

            store_data(extracted_data, relevance_score)

            if relevance_score == 1 and content_embedding is not None:
                relevant_embeddings.append(content_embedding)

            queue.extend(extract_additional_urls(response.text))

        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(f"Error crawling URL: {current_url}")
            print(e)

In [66]:
def build_model_for_relevance_ranking(db_path='financial_news.db', batch_size=32):
    """Train a random-forest regressor that predicts relevance from BERT embeddings.

    Args:
        db_path: Path of the SQLite database holding the `financial_news` table.
        batch_size: Number of documents embedded per call, so the whole
            corpus is never pushed through BERT as one batch.

    Returns:
        RandomForestRegressor: Model fitted on 80% of the stored documents.
        The mean squared error on the remaining 20% is printed.

    Raises:
        ValueError: If `batch_size` is smaller than 1, or if fewer than two
            usable documents are stored (a train/test split needs at least
            one sample on each side).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    with closing(sqlite3.connect(db_path)) as conn:
        try:
            rows = conn.execute(
                'SELECT content, relevance_score FROM financial_news'
            ).fetchall()
        except sqlite3.OperationalError as exc:
            # Nothing has been stored yet; report it as "not enough data" below.
            if 'no such table' not in str(exc):
                raise
            rows = []

    # Rows without text or without a score cannot be used for training.
    samples = [(content, score) for content, score in rows if content and score is not None]
    if len(samples) < 2:
        raise ValueError(
            f"Need at least 2 stored documents to train the relevance model, found {len(samples)}"
        )

    texts = [content for content, _ in samples]
    relevance_scores = [score for _, score in samples]

    # atleast_2d keeps a batch that came back as a single vector stackable.
    embeddings = np.vstack([
        np.atleast_2d(extract_bert_embeddings(texts[start:start + batch_size]))
        for start in range(0, len(texts), batch_size)
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, relevance_scores, test_size=0.2, random_state=42
    )

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    return rf_model

In [68]:
def continuously_extend_model(csv_file="Dataset/input_urls.csv",
                              model_path='relevance_ranking_model.pkl',
                              interval_seconds=3600,
                              max_iterations=None):
    """Repeatedly crawl, retrain the relevance model and save it to disk.

    Each cycle reads the seed URLs, runs the crawler, rebuilds the ranking
    model and pickles it to `model_path`.

    Args:
        csv_file: CSV file holding the seed URLs.
        model_path: Destination of the pickled model.
        interval_seconds: Pause between two cycles.
        max_iterations: Number of cycles to run; None (the default) loops
            until the process is interrupted.

    A cycle that fails is reported and the loop carries on with the next
    cycle, so one bad crawl or an under-filled database does not stop the
    service. The model is written to a temporary file and then moved into
    place, so a crash while writing never leaves a truncated model behind.
    """
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        iteration += 1
        try:
            seed_urls = read_seed_urls_from_csv(csv_file)
            autonomous_crawler(seed_urls)

            rf_model = build_model_for_relevance_ranking()

            temp_path = f"{model_path}.tmp"
            with open(temp_path, 'wb') as model_file:
                pickle.dump(rf_model, model_file)
            os.replace(temp_path, model_path)
        except Exception as e:
            print(f"Error extending model (iteration {iteration})")
            print(e)

        # No point in sleeping after the final requested cycle.
        if max_iterations is not None and iteration >= max_iterations:
            break
        time.sleep(interval_seconds)

In [None]:
if __name__ == "__main__":
    # Entry point: start the crawl / retrain / save cycle. With its default
    # arguments continuously_extend_model() loops until the process is
    # interrupted, so nothing placed after this call will run.
    continuously_extend_model()