In [1]:
import spacy
from spacy import displacy
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # Tokenize and clean text
    doc = nlp(text)
    cleaned_tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(cleaned_tokens), doc

# Example text
text = "Rahul wakes up early every day. He goes to college in the morning and comes back at 3 pm. At present, Rahul is outside. He has to buy the snacks for all of us."

# Preprocess text
cleaned_text, doc = preprocess_text(text)
print("Cleaned Text:", cleaned_text)

Cleaned Text: rahul wakes early day goes college morning comes 3 pm present rahul outside buy snacks


In [3]:
def identify_tasks(doc):
    tasks = []
    for sent in doc.sents:
        for token in sent:
            # Check for actionable verbs
            if token.pos_ == "VERB" and token.lemma_ in ["buy", "clean", "review", "schedule"]:  # Add more verbs
                task = {"action": token.lemma_, "who": None, "deadline": None}
                # Extract subject (who)
                for child in token.children:
                    if child.dep_ == "nsubj":
                        task["who"] = child.text
                # Extract deadline
                for child in token.children:
                    if child.dep_ == "prep" and child.text == "by":
                        task["deadline"] = next(child.rights).text
                tasks.append(task)
    return tasks

# Identify tasks
tasks = identify_tasks(doc)
print("Identified Tasks:", tasks)

Identified Tasks: [{'action': 'buy', 'who': None, 'deadline': None}]


In [4]:
def categorize_tasks(task_texts):
    # Convert task texts into a matrix of token counts
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(task_texts)
    
    # Apply Latent Dirichlet Allocation (LDA)
    lda = LatentDirichletAllocation(n_components=3, random_state=42)  # Adjust n_components as needed
    lda.fit(X)
    
    # Assign categories to tasks
    categories = lda.transform(X)
    return categories

# Extract task texts
task_texts = [task["action"] for task in tasks]

# Categorize tasks
categories = categorize_tasks(task_texts)
print("Task Categories:", categories)

Task Categories: [[0.33333333 0.33333333 0.33333333]]


In [5]:
def generate_output(tasks, categories):
    output = []
    for task, category in zip(tasks, categories):
        output.append({
            "task": task["action"],
            "who": task["who"],
            "deadline": task["deadline"],
            "category": category.argmax()  # Assign the most probable category
        })
    return output

# Generate output
output = generate_output(tasks, categories)
print("Structured Output:", output)

Structured Output: [{'task': 'buy', 'who': None, 'deadline': None, 'category': 0}]


In [6]:
test_text = "John needs to clean the room by 5 pm today. Sarah has to review the report by tomorrow."
cleaned_text, doc = preprocess_text(test_text)
tasks = identify_tasks(doc)
task_texts = [task["action"] for task in tasks]
categories = categorize_tasks(task_texts)
output = generate_output(tasks, categories)
print("Test Output:", output)

Test Output: [{'task': 'clean', 'who': None, 'deadline': 'pm', 'category': 2}, {'task': 'review', 'who': None, 'deadline': 'tomorrow', 'category': 1}]
