In [4]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Function to preprocess text
def preprocess_text(text):
    """
    Preprocess text by tokenizing, removing stopwords, and lemmatizing.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    
    # Tokenize text
    words = word_tokenize(text.lower())
    
    # Remove stopwords and lemmatize
    filtered_words = [
        lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words
    ]
    return " ".join(filtered_words)

# Function to load the preprocessed data from a file
def load_preprocessed_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to create a TF-IDF vectorizer
def create_tfidf_vectorizer(texts):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    return vectorizer, tfidf_matrix

# Function to get the most relevant articles based on the query
def get_most_relevant_articles(query, tfidf_vectorizer, tfidf_matrix, articles, top_n=3):
    query_vec = tfidf_vectorizer.transform([query])  # Transform the query using the vectorizer
    cosine_similarities = np.dot(tfidf_matrix, query_vec.T).toarray()  # Compute cosine similarities
    
    # Get the indices of the top N most relevant articles
    best_match_indices = cosine_similarities.flatten().argsort()[-top_n:][::-1]
    return [
        articles[idx] for idx in best_match_indices 
        if idx < len(articles)  # Ensure index is within bounds
    ]

# Function to combine summaries of multiple articles
def combine_summaries(articles):
    combined_summary = " ".join([
        article['summary']['text_en'] if 'summary' in article and 'text_en' in article['summary'] else "No Summary"
        for article in articles
    ])
    return combined_summary

# Main QA function
def wiki_qa_system(query, preprocessed_data, top_n=3):
    all_articles = []
    
    # Flatten the articles from all topics
    for topic, data in preprocessed_data.items():
        if isinstance(data, dict) and 'articles' in data and isinstance(data['articles'], list):
            for article in data['articles']:
                if isinstance(article, dict) and 'preprocessed_summary' in article:
                    all_articles.append(article)
        else:
            print(f"Unexpected format for topic: {topic}. Skipping...")

    # Ensure all_articles is populated
    if not all_articles:
        raise ValueError("No valid articles found in the preprocessed data.")

    # Use preprocessed summaries for matching
    articles_summaries = [
        preprocess_text(article['preprocessed_summary']) for article in all_articles
        if 'preprocessed_summary' in article
    ]

    if not articles_summaries:
        raise ValueError("No preprocessed summaries found for matching.")

    tfidf_vectorizer, tfidf_matrix = create_tfidf_vectorizer(articles_summaries)

    # Get the top N most relevant articles based on the query
    most_relevant_articles = get_most_relevant_articles(query, tfidf_vectorizer, tfidf_matrix, all_articles, top_n)

    # Combine the summaries of the most relevant articles
    combined_summary = combine_summaries(most_relevant_articles)

    # Return the combined summary and the relevant article titles and URLs
    answers = [{
        'title': article.get('title', "No Title"),
        'summary': article['summary']['text_en'] if 'summary' in article and 'text_en' in article['summary'] else "No Summary",
        'url': article.get('url', "No URL")
    } for article in most_relevant_articles]

    return {
        'combined_summary': combined_summary,
        'answers': answers
    }

# Example usage
if __name__ == "__main__":
    preprocessed_filename = 'preprocessed_data.json'  # Your preprocessed data file
    
    try:
        # Load preprocessed data
        preprocessed_data = load_preprocessed_data(preprocessed_filename)
        
        # Debugging: Check structure
        print("Preprocessed data loaded successfully.")
        print("Sample data structure:")
        print(list(preprocessed_data.items())[:1])  # Print one topic for debugging

        # Accept user input for the query
        query = input("Please enter your query: ")

        # Get the result from the wiki QA system
        result = wiki_qa_system(query, preprocessed_data, top_n=3)

        # Display results
        print("\nCombined Summary:")
        print(result['combined_summary'])
        print("\nRelevant Articles:")
        for answer in result['answers']:
            print(f"Title: {answer['title']}")
            print(f"Summary: {answer['summary']}")
            print(f"URL: {answer['url']}")
            print()

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


[nltk_data] Downloading package punkt to /Users/anirudh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anirudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/anirudh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Please enter your query: How does global warming impact crop production?
Unexpected format for topic: combined_keywords. Skipping...

Combined Summary:
A cash crop also called profit crop is an agricultural crop which is grown to sell for profit It is typically purchased by parties separate from a farm The term is used to differentiate marketed crops from staple crop subsistence crop in subsistence agriculture which are those fed to the producers own livestock or grown as food for the producers familyIn earlier times cash crops were usually only a small but vital part of a farms total yield while today especially in developed countries and among smallholders almost all crops are mainly grown for revenue In the least developed countries cash crops are usually crops which attract demand in more developed nations and hence have some export valuePrices for major cash crops are set in international trade markets with global scope with some local variation termed as basis based on freight co