Installing the necessary libraries:

In [1]:

!pip install nltk
!pip install scikit-learn
!pip install spacy
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [71]:
from functools import lru_cache

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import spacy
# Fetch the NLTK data packages this notebook relies on (a no-op when already up to date).
# 'punkt' is presumably what word_tokenize needs; 'stopwords' backs stopwords.words("english").
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): the three packages below are not referenced by any cell visible here —
# confirm they are still needed before keeping the downloads.
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [72]:
@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the English stop words as a frozenset, reading the corpus only once."""
    return frozenset(stopwords.words("english"))


@lru_cache(maxsize=None)
def _porter_stemmer():
    """Return one shared PorterStemmer instead of building a new one per text."""
    return PorterStemmer()


def preprocess(text):
    """Normalise one text for bag-of-words vectorisation.

    The text is lower-cased, tokenised with ``word_tokenize``, stripped of
    English stop words, Porter-stemmed, and re-joined with single spaces.

    Args:
        text: The string to normalise.

    Returns:
        A single space-separated string of stemmed, non-stop-word tokens.

    Raises:
        AttributeError: If ``text`` has no ``lower`` method (for example
            ``None``). The offending value is printed before re-raising.
    """
    try:
        # Lower-case first so a non-string input fails here, with the diagnostic below.
        lowered = text.lower()
        tokens = word_tokenize(lowered)
    except AttributeError:
        print(f"Error processing text: {text}")
        raise
    # The stop-word set and the stemmer are loop-invariant across calls, so they are cached
    # rather than rebuilt (and the corpus file re-read) for every document.
    stop_words = _english_stop_words()
    stem = _porter_stemmer().stem
    return " ".join(stem(word) for word in tokens if word not in stop_words)

def preprocess_documents_and_queries(documents, queries):
    """Run ``preprocess`` over every document and every query.

    Args:
        documents: Iterable of document strings.
        queries: Iterable of query strings.

    Returns:
        A ``(preprocessed_documents, preprocessed_queries)`` tuple of lists,
        each in the same order as its input.

    Raises:
        AttributeError: Re-raised from ``preprocess`` when an item is not a
            string. A message naming the failing collection is printed first.
    """
    # Documents and queries are handled separately so that a bad document is not
    # reported as a query error.
    try:
        preprocessed_documents = [preprocess(doc) for doc in documents]
    except AttributeError:
        # The document list can be large, so it is deliberately not echoed here.
        print("Error processing documents: every document must be a string")
        raise
    try:
        preprocessed_queries = [preprocess(query) for query in queries]
    except AttributeError:
        print(f"Error processing query: {queries}")
        raise
    return preprocessed_documents, preprocessed_queries

In [73]:
@lru_cache(maxsize=None)
def _load_spacy_pipeline(model_name):
    """Load the named spaCy pipeline once and reuse it on later calls."""
    return spacy.load(model_name)


def feature_based_grammar(text, nlp=None):
    """Extract named entities and part-of-speech tags from ``text``.

    Args:
        text: The string to analyse.
        nlp: Optional callable that turns ``text`` into a parsed document
            (for example an already loaded spaCy pipeline). When omitted,
            the ``en_core_web_sm`` pipeline is loaded on first use and cached.

    Returns:
        A ``(named_entities, pos_tags)`` tuple: a list of
        ``(entity text, entity label)`` pairs and a list of
        ``(token text, token tag)`` pairs.
    """
    if nlp is None:
        # Loading the model is expensive; do it once, not on every call.
        nlp = _load_spacy_pipeline("en_core_web_sm")
    doc = nlp(text)
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]
    pos_tags = [(token.text, token.tag_) for token in doc]
    return named_entities, pos_tags

In [74]:
def train_neural_network(X_train, y_train, X_test, y_test,
                         hidden_layer_sizes=(100,), max_iter=10, random_state=None):
    """Fit an ``MLPClassifier`` and score it on a held-out split.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Held-out feature matrix.
        y_test: Held-out labels.
        hidden_layer_sizes: Passed through to ``MLPClassifier``.
        max_iter: Passed through to ``MLPClassifier``. The default of 10 is
            kept from the original code.
            NOTE(review): this is low and training may stop before
            convergence — confirm it is intentional.
        random_state: Passed through to ``MLPClassifier``; set an int to make
            training reproducible.

    Returns:
        A ``(classifier, accuracy)`` tuple: the fitted classifier and its
        ``accuracy_score`` on ``(X_test, y_test)``.
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        random_state=random_state,
    )
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return classifier, accuracy


In [75]:
def classify_query(classifier, vectorizer, query):
    """Predict the label of a single query.

    The query is preprocessed with ``preprocess_documents_and_queries``,
    vectorised with the already fitted ``vectorizer``, and passed to
    ``classifier.predict``.

    Returns:
        The first (and only) element of the prediction array.
    """
    # No documents are supplied; only the query half of the result is used.
    cleaned_query = preprocess_documents_and_queries([], [query])[1][0]
    dense_features = vectorizer.transform([cleaned_query]).toarray()
    return classifier.predict(dense_features)[0]


In [76]:
def get_top_k_documents(classifier, vectorizer, X, category, k, documents=None):
    """Return the ``k`` documents the classifier rates most likely to be ``category``.

    Args:
        classifier: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        vectorizer: Unused; kept so existing call sites continue to work.
        X: Feature matrix with one row per document. It is densified when it
            has a ``toarray`` method.
        category: A label as returned by ``classifier.predict``; it must be
            present in ``classifier.classes_``.
        k: Number of documents to return. Values of zero or below yield ``[]``.
        documents: Sequence aligned with the rows of ``X``. When omitted, the
            notebook-level ``documents`` variable is used, as before.

    Returns:
        Up to ``k`` documents ordered from highest to lowest probability.

    Raises:
        NameError: If ``documents`` is omitted and no notebook-level
            ``documents`` variable exists.
        ValueError: If ``category`` is not one of ``classifier.classes_``.
    """
    if documents is None:
        # Backward compatibility with callers that rely on the global corpus.
        try:
            documents = globals()["documents"]
        except KeyError:
            raise NameError(
                "name 'documents' is not defined; pass documents=... explicitly"
            ) from None
    if k <= 0:
        # A slice of [-0:] would select every row, so handle this explicitly.
        return []
    features = X.toarray() if hasattr(X, "toarray") else X
    probabilities = np.asarray(classifier.predict_proba(features))
    # predict_proba columns follow classifier.classes_, which holds the same label
    # values that predict() returns, so look the category up there.
    category_index = list(classifier.classes_).index(category)
    category_probabilities = probabilities[:, category_index]
    # Stable sort on the negated scores: best first, ties keep document order.
    top_k_indices = np.argsort(-category_probabilities, kind="stable")[:k]
    return [documents[i] for i in top_k_indices]


In [77]:
def document_query_pipeline(documents, queries, k, random_state=None):
    """Train a demo classifier on ``documents`` and retrieve top-``k`` documents per query.

    Steps: preprocess the texts, build bag-of-words features with
    ``CountVectorizer``, assign each document a random dummy label, train via
    ``train_neural_network`` on an 80/20 split, then classify each query and
    collect the documents returned by ``get_top_k_documents``.

    Args:
        documents: Sequence of document strings.
        queries: Iterable of query strings.
        k: Number of documents to retrieve per query.
        random_state: Optional int seed for the dummy labels and the
            train/test split. The classifier's own initialisation is
            controlled inside ``train_neural_network`` and is not seeded here.

    Returns:
        A dict mapping each query string to the list returned by
        ``get_top_k_documents`` for it. Duplicate queries share one entry.
    """
    # Queries are preprocessed here only so that invalid ones fail before training;
    # classify_query preprocesses each query again when it is classified.
    preprocessed_documents, _ = preprocess_documents_and_queries(documents, queries)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(preprocessed_documents)

    # Dummy labels for example purposes: the reported accuracy is therefore
    # not a meaningful measure of model quality.
    categories = ["sports", "politics", "entertainment"]
    rng = np.random.default_rng(random_state)
    y = rng.choice(categories, size=len(preprocessed_documents))

    # `encoder` is local to this function; helper functions cannot see it.
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=random_state
    )

    classifier, accuracy = train_neural_network(X_train, y_train, X_test, y_test)
    print(f"Neural network accuracy: {accuracy:.2f}")

    result = {}
    for query in queries:
        category = classify_query(classifier, vectorizer, query)
        result[query] = get_top_k_documents(classifier, vectorizer, X, category, k)

    return result

   

In [78]:
# Fetch the Reuters corpus that is loaded via nltk.corpus.reuters in a later cell.
nltk.download('reuters')

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [79]:
from nltk.corpus import reuters

# Number of Reuters files to load; named so the corpus size can be tuned in one place.
N_DOCUMENTS = 500

document_ids = reuters.fileids()
# Each document is rebuilt as one string by joining the corpus tokens with spaces.
documents = [' '.join(reuters.words(doc_id)) for doc_id in document_ids[:N_DOCUMENTS]]


In [80]:
# Example free-text queries, one per dummy category named in document_query_pipeline.
queries = [
    "What is the latest news in sports?",
    "Who won the election?",
    "Recent developments in the entertainment industry"
]
