In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to load the preprocessed data from a file
def load_preprocessed_data(filename):
    """Read ``filename`` as UTF-8 text and return its parsed JSON content.

    Errors raised by ``open`` or by the JSON parser propagate to the caller.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Function to create a TF-IDF vectorizer
def create_tfidf_vectorizer(texts):
    """Fit a new ``TfidfVectorizer`` on ``texts``.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted vectorizer and the
        result of its ``fit_transform`` call on ``texts``.
    """
    fitted = TfidfVectorizer()
    return fitted, fitted.fit_transform(texts)

# Function to get the most relevant articles based on the query
def get_most_relevant_articles(query, tfidf_vectorizer, tfidf_matrix, articles, top_n=3):
    """Return the ``top_n`` articles whose TF-IDF rows score highest for ``query``.

    Args:
        query: Query text; it is passed to ``tfidf_vectorizer.transform``.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Sparse document-term matrix. Row ``i`` is assumed to
            describe ``articles[i]``, because row indices are used to index
            ``articles``.
        articles: Sequence of articles aligned with the matrix rows.
        top_n: Maximum number of articles to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of at most ``top_n`` articles, highest score first.
    """
    if top_n <= 0:
        # A ``[-0:]`` slice would select every row instead of none.
        return []

    query_vec = tfidf_vectorizer.transform([query])

    # Use the sparse-aware ``@`` operator; ``np.dot`` does not understand
    # scipy sparse containers and only works through an object fallback.
    # NOTE(review): the score is a plain dot product, which equals cosine
    # similarity only when both operands are L2-normalised (presumably the
    # vectorizer's default) - confirm if the vectorizer settings change.
    scores = (tfidf_matrix @ query_vec.T).toarray().ravel()

    # argsort is ascending: keep the last ``top_n`` indices, then reverse.
    ranked_indices = scores.argsort()[-top_n:][::-1]
    return [articles[idx] for idx in ranked_indices]

# Function to combine summaries of multiple articles
def combine_summaries(articles):
    """Join the ``summary.text_en`` field of every article with single spaces.

    Articles are concatenated in the order given; an empty input yields an
    empty string.
    """
    return " ".join(entry['summary']['text_en'] for entry in articles)

# Main QA function
def wiki_qa_system(query, preprocessed_data, top_n=3):
    """Answer ``query`` with the best-matching articles from ``preprocessed_data``.

    Args:
        query: Query text to match against the articles.
        preprocessed_data: Mapping of topic name to a dict whose
            ``'articles'`` entry is a list of article dicts. Only articles
            carrying a ``'preprocessed_summary'`` key are considered.
        top_n: Number of articles requested from the ranking step.

    Returns:
        A dict with ``'combined_summary'`` (the selected articles' English
        summaries joined together) and ``'answers'`` (one dict per selected
        article holding its ``'title'``, ``'summary'`` and ``'url'``).

    Raises:
        ValueError: If no usable article is found in ``preprocessed_data``.
    """
    all_articles = _collect_articles(preprocessed_data)
    if not all_articles:
        raise ValueError("No valid articles found in the preprocessed data.")

    # Matching runs on the preprocessed summaries, not the display text.
    articles_summaries = [article['preprocessed_summary'] for article in all_articles]
    tfidf_vectorizer, tfidf_matrix = create_tfidf_vectorizer(articles_summaries)

    most_relevant_articles = get_most_relevant_articles(
        query, tfidf_vectorizer, tfidf_matrix, all_articles, top_n
    )

    combined_summary = combine_summaries(most_relevant_articles)

    answers = [
        {
            'title': article['title'],
            'summary': article['summary']['text_en'],
            'url': article['url'],
        }
        for article in most_relevant_articles
    ]

    return {
        'combined_summary': combined_summary,
        'answers': answers,
    }


def _collect_articles(preprocessed_data):
    """Flatten the usable articles of every topic into one list."""
    collected = []
    for topic, data in preprocessed_data.items():
        # Check the container type first: a null, numeric or string topic
        # value would otherwise make the membership test raise TypeError
        # (or do a substring match) instead of being reported.
        if not isinstance(data, dict) or not isinstance(data.get('articles'), list):
            print(f"Unexpected format for topic: {topic}")
            continue
        collected.extend(
            article
            for article in data['articles']
            if isinstance(article, dict) and 'preprocessed_summary' in article
        )
    return collected

# Example usage
if __name__ == "__main__":
    # JSON file holding the preprocessed articles
    preprocessed_filename = 'preprocessed_data.json'

    try:
        preprocessed_data = load_preprocessed_data(preprocessed_filename)

        # Debug aid: confirm the load, then show only the first 500
        # characters of the pretty-printed data.
        print("Preprocessed data loaded successfully.", "Sample data structure:", sep="\n")
        print(json.dumps(preprocessed_data, indent=2)[:500])

        # Ask the user for a query and run it through the QA pipeline
        query = input("Please enter your query: ")
        result = wiki_qa_system(query, preprocessed_data, top_n=3)

        # Report the merged summary followed by one entry per article
        print("\nCombined Summary:", result['combined_summary'], sep="\n")
        print("\nRelevant Articles:")
        for answer in result['answers']:
            # The trailing empty item produces the blank separator line.
            print(
                f"Title: {answer['title']}",
                f"Summary: {answer['summary']}",
                f"URL: {answer['url']}",
                "",
                sep="\n",
            )

    # JSONDecodeError subclasses ValueError, so it must be handled first.
    except json.JSONDecodeError as exc:
        print(f"Error decoding JSON: {exc}")
    except ValueError as exc:
        print(f"Error: {exc}")
    except Exception as exc:
        print(f"An unexpected error occurred: {exc}")
