In [1]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources needed for tokenization, stop-word filtering
# and lemmatization. Already-present packages are reported as up-to-date.
nltk.download("punkt")
# NLTK 3.9+ makes word_tokenize load the "punkt_tab" tables instead of the
# pickled "punkt" models; fetching both keeps old and new NLTK releases working.
nltk.download("punkt_tab")
nltk.download("stopwords")
# Kept last so the cell's displayed value is still this call's return value.
nltk.download("wordnet")

  from pandas.core import (
[nltk_data] Downloading package punkt to /Users/anirudh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anirudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/anirudh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Download the NLTK resources used by preprocess_text (tokenizer models,
# English stop words, WordNet for lemmatization).
nltk.download("punkt")
# NLTK 3.9+ makes word_tokenize load the "punkt_tab" tables instead of the
# pickled "punkt" models; fetching both keeps old and new NLTK releases working.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

# Load the T5 model for summarization
t5_model_name = "t5-base"  # You can also use "t5-large" for better performance
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

# Reload the zero-shot classification model and tokenizer from a local
# directory (it must already exist; it is not created in this notebook section).
load_directory = "./saved_zero_shot_model"
tokenizer = AutoTokenizer.from_pretrained(load_directory)
model = AutoModelForSequenceClassification.from_pretrained(load_directory)

# Create a classification pipeline with the loaded model.
# No `device` is passed, so the pipeline runs on CPU.
pipe = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

# Candidate labels for zero-shot classification. wiki_qa_system matches these
# against the top-level keys of the preprocessed data.
topics = [
    "Health",
    "Environment",
    "Technology",
    "Economy",
    "Entertainment",
    "Sports",
    "Politics",
    "Education",
    "Travel",
    "Food",
]

# Hypothesis template; "{}" is replaced by each candidate topic.
hypothesis_template = "This query is related to {}."

# Improved topic classification function
def classify_multi_topics(query, pipe, topics, threshold=0.1, top_n=5, template=None):
    """
    Classifies the query into multiple topics using zero-shot classification.

    Args:
        query: Text to classify.
        pipe: Callable invoked as ``pipe(query, topics, hypothesis_template=...)``
            that returns a mapping with parallel ``"labels"`` and ``"scores"``
            sequences.
        topics: Candidate topic labels.
        threshold: A topic is kept only when its score is strictly greater
            than this value.
        top_n: Number of best-scoring topics returned when no topic passes
            the threshold.
        template: Hypothesis template passed to ``pipe``. When ``None`` the
            module-level ``hypothesis_template`` is used.

    Returns:
        A list of ``{"topic": label, "score": score}`` dicts ordered by
        descending score, or an empty list if classification raised.
    """
    try:
        if template is None:
            template = hypothesis_template

        # Perform zero-shot classification
        output = pipe(query, topics, hypothesis_template=template)

        # Rank explicitly instead of trusting the pipe's ordering, so the
        # fallback below really picks the best-scoring topics. sorted() is
        # stable, so already-ordered output is left untouched.
        ranked = sorted(
            zip(output["labels"], output["scores"]),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Debugging: Show all topic scores
        print(f"Query: {query}")
        print("Classification Scores:")
        for label, score in ranked:
            print(f"  - {label}: {score:.2f}")

        # Filter topics based on the threshold
        relevant_topics = [
            {"topic": label, "score": score}
            for label, score in ranked
            if score > threshold
        ]

        # Return top_n topics if none pass the threshold
        if not relevant_topics:
            print("No topics passed the threshold. Returning top N topics instead.")
            relevant_topics = [
                {"topic": label, "score": score}
                for label, score in ranked[:top_n]
            ]

        return relevant_topics
    except Exception as e:
        # Best effort: report the failure and let the caller handle "no topics".
        print(f"Error during classification: {e}")
        return []


# Preprocess text function
def preprocess_text(text):
    """Normalize text for TF-IDF matching.

    Lowercases and tokenizes ``text``, discards tokens that are not purely
    alphanumeric or that are English stop words, lemmatizes the remaining
    tokens, and returns them joined by single spaces.
    """
    english_stops = set(stopwords.words("english"))
    wordnet = WordNetLemmatizer()

    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in english_stops:
            continue
        kept_tokens.append(wordnet.lemmatize(token))

    return " ".join(kept_tokens)

# Load preprocessed data function
def load_preprocessed_data(filename):
    """Read ``filename`` as UTF-8 text and return its parsed JSON content."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Create a TF-IDF vectorizer function
def create_tfidf_vectorizer(texts):
    """Fit a default ``TfidfVectorizer`` on ``texts``.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted vectorizer and the
        matrix produced by ``fit_transform(texts)``.
    """
    fitted = TfidfVectorizer()
    document_matrix = fitted.fit_transform(texts)
    return fitted, document_matrix

# Retrieve most relevant articles with unique URLs
def get_most_relevant_articles(query, tfidf_vectorizer, tfidf_matrix, articles, top_n=3):
    """Return up to ``top_n`` articles most similar to ``query``, one per URL.

    Args:
        query: Query text; it is vectorized with ``tfidf_vectorizer.transform``.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Document-term matrix whose row ``i`` corresponds to
            ``articles[i]``.
        articles: Sequence of article dicts.
        top_n: Maximum number of articles to return.

    Returns:
        Articles ordered by descending similarity score. Articles without a
        truthy ``'url'`` and articles whose URL was already selected are
        skipped.
    """
    query_vec = tfidf_vectorizer.transform([query])

    # Use the matmul operator rather than np.dot: NumPy's dot is not aware of
    # scipy sparse containers, whereas `@` dispatches to the sparse product for
    # both sparse matrices and sparse arrays.
    similarities = tfidf_matrix @ query_vec.T
    if hasattr(similarities, "toarray"):
        similarities = similarities.toarray()
    similarities = np.asarray(similarities).ravel()

    sorted_indices = similarities.argsort()[::-1]

    unique_articles = []
    seen_urls = set()

    for idx in sorted_indices:
        if len(unique_articles) >= top_n:
            break
        article = articles[int(idx)]
        article_url = article.get('url', None)
        if article_url and article_url not in seen_urls:
            unique_articles.append(article)
            seen_urls.add(article_url)

    return unique_articles

# Generate a meaningful summary using T5
def generate_meaningful_summary_t5(combined_text, max_length=800, min_length=100):
    """Summarize ``combined_text`` with the module-level T5 model.

    The text is prefixed with ``"summarize: "``, encoded with truncation at
    1024 tokens, and decoded from the first sequence produced by a 4-beam
    search.

    Args:
        combined_text: Text to summarize.
        max_length: ``max_length`` forwarded to ``t5_model.generate``.
        min_length: ``min_length`` forwarded to ``t5_model.generate``.

    Returns:
        The decoded summary with special tokens removed.
    """
    prompt = "summarize: " + combined_text
    input_ids = t5_tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )

    generation_options = {
        "max_length": max_length,
        "min_length": min_length,
        "length_penalty": 1.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = t5_model.generate(input_ids, **generation_options)

    best_sequence = generated[0]
    return t5_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Combine summaries of the most relevant articles
def combine_summaries(articles):
    """Summarize the English summaries of ``articles`` into one text.

    Only articles carrying a non-empty string at ``article['summary']['text_en']``
    contribute. Articles without one are skipped, so no placeholder text is
    mixed into the summarizer input.

    Args:
        articles: Iterable of article dicts.

    Returns:
        The summary produced by ``generate_meaningful_summary_t5`` for the
        space-joined texts, or the placeholder ``"No Summary"`` when no article
        has usable summary text (the model is not invoked in that case).
    """
    summary_texts = []
    for article in articles:
        summary = article.get('summary') if isinstance(article, dict) else None
        # Guard against a 'summary' that is not a mapping (e.g. a plain string).
        text = summary.get('text_en') if isinstance(summary, dict) else None
        if isinstance(text, str) and text.strip():
            summary_texts.append(text)

    if not summary_texts:
        return "No Summary"

    return generate_meaningful_summary_t5(" ".join(summary_texts))

# Main QA function
def wiki_qa_system(query, preprocessed_data, top_n=3):  # top_n: number of articles to return
    """
    Main Wiki QA system function.

    Classifies the query into topics, gathers the articles stored under those
    topics, ranks them against the query with TF-IDF, and summarizes the best
    matches.

    Args:
        query: The user's question.
        preprocessed_data: Mapping of topic name to a dict holding an
            ``'articles'`` list. Only dict articles with a
            ``'preprocessed_summary'`` entry are considered.
        top_n: Maximum number of articles to return.

    Returns:
        A dict with ``'combined_summary'`` (the generated summary text) and
        ``'answers'`` (a list of ``{'title', 'topic', 'url'}`` dicts).

    Raises:
        ValueError: If no usable article exists for the relevant topics, or if
            every candidate summary is empty after preprocessing.
    """
    relevant_topics_data = classify_multi_topics(query, pipe, topics, threshold=0.1, top_n=5)
    relevant_topics = [t["topic"] for t in relevant_topics_data]
    print(f"Relevant Topics: {relevant_topics}")

    all_articles = []
    for topic, data in preprocessed_data.items():
        if topic in relevant_topics and 'articles' in data and isinstance(data['articles'], list):
            for article in data['articles']:
                if isinstance(article, dict) and 'preprocessed_summary' in article:
                    # Tag a shallow copy so the caller's data is not mutated.
                    all_articles.append({**article, 'topic': topic})

    if not all_articles:
        raise ValueError("No valid articles found for the relevant topics.")

    # One entry per article, in the same order as all_articles, so matrix row i
    # corresponds to all_articles[i].
    articles_summaries = [
        preprocess_text(article['preprocessed_summary']) for article in all_articles
    ]

    if not any(articles_summaries):
        raise ValueError("No preprocessed summaries found for matching.")

    tfidf_vectorizer, tfidf_matrix = create_tfidf_vectorizer(articles_summaries)

    # Normalize the query exactly like the corpus; otherwise inflected query
    # words (e.g. "benefits") never match the lemmatized vocabulary ("benefit").
    normalized_query = preprocess_text(query)
    most_relevant_articles = get_most_relevant_articles(
        normalized_query, tfidf_vectorizer, tfidf_matrix, all_articles, top_n
    )
    combined_summary = combine_summaries(most_relevant_articles)

    answers = [{
        'title': article.get('title', "No Title"),
        'topic': article.get('topic', "No Topic"),
        'url': article.get('url', "No URL")
    } for article in most_relevant_articles]

    return {
        'combined_summary': combined_summary,
        'answers': answers
    }

# Example usage
if __name__ == "__main__":
    # Interactive demo: load the corpus, ask for one query, print the answer.
    preprocessed_filename = 'preprocessed_data.json'
    try:
        preprocessed_data = load_preprocessed_data(preprocessed_filename)
        print("Preprocessed data loaded successfully.")

        query = input("Please enter your query: ")
        result = wiki_qa_system(query, preprocessed_data, top_n=3)

        print("\nSummary:")
        print(result['combined_summary'])

        print("\nRelevant Articles:")
        for answer in result['answers']:
            # One line per field, followed by a blank separator line.
            entry_lines = (
                f"Title: {answer['title']}",
                f"Topic: {answer['topic']}",
                f"URL: {answer['url']}",
                "",
            )
            print("\n".join(entry_lines))
    except json.JSONDecodeError as e:
        # Must precede ValueError: JSONDecodeError is a ValueError subclass.
        print(f"Error decoding JSON: {e}")
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


[nltk_data] Downloading package punkt to /Users/anirudh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anirudh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/anirudh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Preprocessed data loaded successfully.
Please enter your query: Health benefits of swimming?
Query: Health benefits of swimming?
Classification Scores:
  - Health: 0.61
  - Sports: 0.36
  - Education: 0.01
  - Technology: 0.00
  - Environment: 0.00
  - Entertainment: 0.00
  - Food: 0.00
  - Politics: 0.00
  - Travel: 0.00
  - Economy: 0.00
Relevant Topics: ['Health', 'Sports']

Summary:
swimming involves repeated motions known as strokes to propel the body forward while the front crawl is widely regarded as the fastest of the four main strokes . swimmers may find themselves incapacitated by panic and exhaustion both potential causes of death by drowning . health human resources (hrh) is defined as all people engaged in actions whose primary intent is to enhance positive health outcomes according to the world health organizations world health report 2006 . the field of HHR deals with issues such as workforce planning and policy evaluation recruitment

Relevant Articles:
Title: Swimming
