In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
import random

import joblib

## Training Data 

In [2]:
# Example training data (input text and intent labels)
train_texts = [
    # Add Task
    "Add buy groceries to my list",
    "Add finish homework to the to-do list",
    "Create a task to clean the kitchen",
    "Put 'water the plants' on my agenda",
    "Add 'read a book' to my plans",
    "Note down 'submit assignment'",
    "Include 'organize desk' in my tasks",
    "Add workout to my daily routine",
    "Add plan birthday party to the list",
    "Schedule an email follow-up task",

    # Set Priority
    "Set priority to high for cleaning",
    "Make 'pay bills' a top priority",
    "Mark 'study for exam' as important",
    "Set low priority to wash the car",
    "Give medium priority to replying emails",
    "Label 'book tickets' as urgent",
    "Make 'revise notes' a low priority",
    "Change priority to high for project report",
    "Flag 'team meeting' as critical",
    "Adjust priority of 'file taxes' to high",

    # Set Deadline
    "Remind me to call mom by tomorrow",
    "Schedule meeting at 3pm next Friday",
    "Set a deadline for the budget report by Monday",
    "Remind me to send the email tonight",
    "Finish reading by Sunday night",
    "Deadline for submitting forms is next Wednesday",
    "I need to complete this task before Friday",
    "Can you remind me to buy a gift by the weekend?",
    "Set reminder to renew license by next month",
    "Mark 'pay rent' due on the 1st of next month",

    # Delete Task
    "Remove 'buy groceries' from my list",
    "Delete the task to call mom",
    "Get rid of the workout reminder",
    "Take off 'read a book' from today’s tasks",
    "Erase 'clean kitchen' from my schedule",
    "Clear the task about meeting John",
    "Discard the reminder to water the plants",
    "Delete finish homework task",
    "Remove 'email follow-up' from the list",
    "Cancel the plan to visit grandma",

    # Edit Task
    "Edit task 'buy groceries' to 'buy veggies'",
    "Change 'meeting with Sam' to 'meeting with Sarah'",
    "Update deadline for 'project report' to next Friday",
    "Modify 'book flight' task to include return ticket",
    "Change time of call mom to 6pm",
    "Update 'email manager' to 'email HR'",
    "Edit priority of 'taxes' to medium",
    "Update the task to clean the kitchen today",
    "Reschedule 'plan party' to Saturday",
    "Change description of the homework task",
]

train_labels = [
    # Add Task
    "Add Task", "Add Task", "Add Task", "Add Task", "Add Task",
    "Add Task", "Add Task", "Add Task", "Add Task", "Add Task",

    # Set Priority
    "Set Priority", "Set Priority", "Set Priority", "Set Priority", "Set Priority",
    "Set Priority", "Set Priority", "Set Priority", "Set Priority", "Set Priority",

    # Set Deadline
    "Set Deadline", "Set Deadline", "Set Deadline", "Set Deadline", "Set Deadline",
    "Set Deadline", "Set Deadline", "Set Deadline", "Set Deadline", "Set Deadline",

    # Delete Task
    "Delete Task", "Delete Task", "Delete Task", "Delete Task", "Delete Task",
    "Delete Task", "Delete Task", "Delete Task", "Delete Task", "Delete Task",

    # Edit Task
    "Edit Task", "Edit Task", "Edit Task", "Edit Task", "Edit Task",
    "Edit Task", "Edit Task", "Edit Task", "Edit Task", "Edit Task",
]





## Pipeline (vectorization and logistic regression)

In [3]:

# Build a pipeline: TF-IDF vectorizer + logistic regression classifier
intent_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Train the classifier
intent_clf.fit(train_texts, train_labels)

## Test intent 

In [4]:
# Predict intent on a new input
test_input = "Please remove finish homework from my tasks"
predicted_intent = intent_clf.predict([test_input])[0]
print(f"Predicted intent: {predicted_intent}")


Predicted intent: Delete Task


In [5]:

# Test data
test_texts = [
    "Please add 'buy dog food' to my to-do list",
    "Add take out trash as a new task",
    "Put 'practice coding' into my planner",

    "Mark 'finish project' as top priority",
    "Set the priority for 'study math' to low",
    "Make 'clean garage' an urgent task",

    "Remind me to book dentist by next Thursday",
    "Finish assignment before next weekend",
    "Set deadline for 'insurance renewal' to Aug 5",

    "Delete the task about going to the gym",
    "Remove 'text Alice' from my tasks",
    "Please get rid of 'review notes' from my list",

    "Change the priority of 'do taxes' to low",
    "Edit the task to say 'email professor instead'",
    "Update 'go shopping' to 'go grocery shopping'",
]

test_labels = [
    "Add Task", "Add Task", "Add Task",
    "Set Priority", "Set Priority", "Set Priority",
    "Set Deadline", "Set Deadline", "Set Deadline",
    "Delete Task", "Delete Task", "Delete Task",
    "Edit Task", "Edit Task", "Edit Task",
]

# Predict
predicted_labels = intent_clf.predict(test_texts)

# Evaluate
print(classification_report(test_labels, predicted_labels))


              precision    recall  f1-score   support

    Add Task       1.00      1.00      1.00         3
 Delete Task       1.00      1.00      1.00         3
   Edit Task       1.00      0.67      0.80         3
Set Deadline       1.00      1.00      1.00         3
Set Priority       0.75      1.00      0.86         3

    accuracy                           0.93        15
   macro avg       0.95      0.93      0.93        15
weighted avg       0.95      0.93      0.93        15



In [6]:
text = "Label 'book tickets' as urgent"
print(text[19:22])  # Should print: high


' a


## save intent classifier 

In [7]:
# Save model to file
joblib.dump(intent_clf, "models/intent_classifier.pkl")
print("Intent classifier saved as intent_classifier.pkl")



Intent classifier saved as intent_classifier.pkl


to load classifier use:
# Load the trained classifier
import joblib

intent_clf = joblib.load("intent_classifier.pkl")

# Test it
test_text = "Remind me to call Ali tomorrow"
print("Predicted intent:", intent_clf.predict([test_text])[0])


## Entity Classifier 

In [8]:

# Load a blank English pipeline
nlp = spacy.blank("en")

# Add NER pipe if not already there
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Define entity labels
ner.add_label("PRIORITY")
ner.add_label("DEADLINE")
ner.add_label("TASK")
ner.add_label("LOCATION")
ner.add_label("RECURRENCE")
ner.add_label("DURATION")

# Training examples (char indices MUST be correct!)
TRAIN_DATA = [
       # --- Priority Examples ---
    ("Make pay bills a top priority", 
        {"entities": [(5, 14, "TASK"), (17, 20, "PRIORITY")]}),  
    ("Label book tickets as urgent", 
        {"entities": [(6, 18, "TASK"), (22, 28, "PRIORITY")]}),  
    ("Set low priority to wash the car", 
        {"entities": [(4, 7, "PRIORITY"), (20, 32, "TASK")]}),  

    # --- Deadline Examples ---
    ("Remind me to call mom by tomorrow", 
        {"entities": [(13, 21, "TASK"), (25, 33, "DEADLINE")]}),  
    ("Schedule meeting at 3pm next Friday", 
        {"entities": [(0, 16, "TASK"), (20, 35, "DEADLINE")]}),  
    ("Finish reading by Sunday night", 
        {"entities": [(0, 14, "TASK"), (18, 30, "DEADLINE")]}),  
    ("Deadline for submitting forms is next Wednesday", 
        {"entities": [(13, 29, "TASK"), (33, 47, "DEADLINE")]}),  
    ("Call Ali at 7pm", 
        {"entities": [(0, 8, "TASK"),  (12, 15, "DEADLINE")]}),  
    ("Email Professor John by tomorrow morning", 
        {"entities": [(0, 20, "TASK"), (24, 35, "DEADLINE")]}),  

    # --- Location Examples ---
    ("Pick up medicine from pharmacy at 6pm", 
        {"entities": [(0, 16, "TASK"), (22, 30, "LOCATION"), (34, 37, "DEADLINE")]}),  
    ("Meet Ali at Starbucks tomorrow", 
        {"entities": [(0, 8, "TASK"), (12, 21, "LOCATION"), (22, 30, "DEADLINE")]}),  

    # --- Recurrence + Duration ---
    ("Water the plants every Monday", 
        {"entities": [(0, 16, "TASK"), (17, 29, "RECURRENCE")]}),  
    ("Exercise for 45 mins every evening", 
        {"entities": [(0, 8, "TASK"), (13, 20, "DURATION"), (21, 34, "RECURRENCE")]}),  
    ("Study for 30 minutes daily", 
        {"entities": [(0, 5, "TASK"), (10, 20, "DURATION"), (21, 26, "RECURRENCE")]}), 
        
    # --- Simple Add Task ---
    ("Add buy groceries to my list", 
        {"entities": [(4, 17, "TASK")]}),  

    ("Set priority to high for cleaning", 
        {"entities": [(21, 33, "TASK"), (16, 20, "PRIORITY")]}),  

    ("Mark task as urgent", 
        {"entities": [(5, 9, "TASK"), (13, 19, "PRIORITY")]}),  

    # --- With Deadlines ---
    ("Remind me to submit report by tomorrow 5pm", 
        {"entities": [(13, 26, "TASK"), (30, 42, "DEADLINE")]}),  

    ("Schedule meeting at 3pm next Friday", 
        {"entities": [(9, 16, "TASK"), (20, 35, "DEADLINE")]}),  

    ("Finish reading by Sunday night", 
        {"entities": [(7, 14, "TASK"), (18, 30, "DEADLINE")]}),  
        
    ("Call Ali at 7pm", 
        {"entities": [(0, 8, "TASK"), (12, 15, "DEADLINE")]}),  

    ("Email Professor John by tomorrow morning", 
        {"entities": [(0, 20, "TASK"), (24, 40, "DEADLINE")]}),  

    ("Remind Sarah to pay bills tonight", 
        {"entities": [(0, 12, "TASK"), (26, 33, "DEADLINE")]}),  

    # --- With Location ---
    ("Pick up medicine from pharmacy at 6pm", 
        {"entities": [(0, 16, "TASK"), (22, 30, "LOCATION"), (34, 37, "DEADLINE")]}),  

    ("Meet Ali at Starbucks tomorrow", 
        {"entities": [(0, 8, "TASK"), (12, 21, "LOCATION"), (22, 30, "DEADLINE")]}),  

    # --- With Recurrence ---
    ("Water the plants every Monday", 
        {"entities": [(0, 16, "TASK"), (17, 29, "RECURRENCE")]}),  

    ("Send weekly report every Friday morning", 
        {"entities": [(0, 18, "TASK"), (19, 39, "RECURRENCE")]}),  

    ("Remind me to check emails daily at 9am", 
        {"entities": [(13, 25, "TASK"), (26, 31, "RECURRENCE"), (35, 38, "DEADLINE")]}),  

    # --- With Duration ---
    ("Work on project for 2 hours", 
        {"entities": [(0, 15, "TASK"), (20, 27, "DURATION")]}),  

    ("Study for 30 minutes after lunch", 
        {"entities": [(0, 5, "TASK"), (10, 20, "DURATION")]}),  

    ("Exercise for 45 mins every evening", 
        {"entities": [(0, 8, "TASK"), (13, 20, "DURATION"), (21, 34, "RECURRENCE")]}),  

]


In [9]:
TRAIN_DATA += [
    # --- TASK + PRIORITY ---
    ("Add buy groceries to my list", {"entities": [(4, 17, "TASK")]}),
    ("Mark task as urgent", {"entities": [(5, 9, "TASK"), (13, 19, "PRIORITY")]}),
    ("Set priority to high for cleaning", {"entities": [(16, 20, "PRIORITY"), (25, 33, "TASK")]}),
    ("Label book tickets as low priority", {"entities": [(6, 18, "TASK"), (22, 25, "PRIORITY")]}),
    ("Mark complete the assignment as important", {"entities": [(5, 28, "TASK"), (32, 41, "PRIORITY")]}),

    # --- TASK + DEADLINE ---
    ("Remind me to submit report by tomorrow 5pm", {"entities": [(13, 26, "TASK"), (30, 42, "DEADLINE")]}),
    ("Finish reading by Sunday night", {"entities": [(0, 14, "TASK"), (18, 30, "DEADLINE")]}),
    ("Submit assignment by next Friday", {"entities": [(0, 17, "TASK"), (21, 32, "DEADLINE")]}),
    ("Complete project before Monday evening", {"entities": [(0, 16, "TASK"), (24, 38, "DEADLINE")]}),
    ("Check emails after lunch today", {"entities": [(0, 12, "TASK"), (13, 30, "DEADLINE")]}),
    ("Call Umaima at 7pm", {"entities": [(0, 11, "TASK"), (15, 18, "DEADLINE")]}),
    ("Message John about the meeting", {"entities": [(0, 12, "TASK")]}),
    ("Email Professor Khan tomorrow morning", {"entities": [(0, 20, "TASK"), (21, 37, "DEADLINE")]}),
    ("Remind Sarah to send files tonight", {"entities": [(0, 12, "TASK"), (27, 34, "DEADLINE")]}),
    ("Call Dad every Sunday evening", {"entities": [(0, 8, "TASK"), (9, 29, "RECURRENCE")]}),

    # --- TASK + LOCATION ---
    ("Pick up medicine from pharmacy at 6pm", {"entities": [(0, 16, "TASK"), (22, 30, "LOCATION"), (34, 37, "DEADLINE")]}),
    ("Meet Muhammad at Starbucks tomorrow", {"entities": [(0, 13, "TASK"), (17, 26, "LOCATION"), (27, 35, "DEADLINE")]}),
    ("Drop package at the post office", {"entities": [(0, 12, "TASK"), (16, 31, "LOCATION")]}),
    ("Buy coffee from McDonald's in the morning", {"entities": [(0, 10, "TASK"), (16, 26, "LOCATION"), (27, 41, "DEADLINE")]}),
    ("Go jogging in the park at 6am", {"entities": [(0, 10, "TASK"), (14, 22, "LOCATION"), (26, 29, "DEADLINE")]}),

    # --- TASK + RECURRENCE + DURATION ---
    ("Water the plants every Monday", {"entities": [(0, 16, "TASK"), (17, 29, "RECURRENCE")]}),
    ("Take out trash every morning", {"entities": [(0, 14, "TASK"), (15, 28, "RECURRENCE")]}),
    ("Exercise for 45 mins every evening", {"entities": [(0, 8, "TASK"), (13, 20, "DURATION"), (21, 34, "RECURRENCE")]}),
    ("Study for 30 minutes daily", {"entities": [(0, 5, "TASK"), (10, 20, "DURATION"), (21, 26, "RECURRENCE")]}),
    ("Walk the dog for 20 minutes every night", {"entities": [(0, 12, "TASK"), (17, 27, "DURATION"), (28, 39, "RECURRENCE")]}),
]


In [10]:

# Create DocBin for training
db = DocBin()
for text, annot in TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is None:
            print(f"Skipping span: '{text[start:end]}' — not aligned")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)


# Begin training
optimizer = nlp.begin_training()
for i in range(100):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.25, sgd=optimizer, losses=losses)
    print(f"Losses at iteration {i}: {losses}")

# 🔍 Test the trained model
test_text = "Set priority to low and submit report by next Monday"
doc = nlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.label_)


Skipping span: 'tomorrow mo' — not aligned




Losses at iteration 0: {'ner': np.float32(207.89276)}
Losses at iteration 1: {'ner': np.float32(177.68985)}
Losses at iteration 2: {'ner': np.float32(138.72874)}
Losses at iteration 3: {'ner': np.float32(101.85443)}
Losses at iteration 4: {'ner': np.float32(51.36254)}
Losses at iteration 5: {'ner': np.float32(43.804623)}
Losses at iteration 6: {'ner': np.float32(39.510166)}
Losses at iteration 7: {'ner': np.float32(15.684928)}
Losses at iteration 8: {'ner': np.float32(24.777859)}
Losses at iteration 9: {'ner': np.float32(16.540432)}
Losses at iteration 10: {'ner': np.float32(14.705913)}
Losses at iteration 11: {'ner': np.float32(21.059753)}
Losses at iteration 12: {'ner': np.float32(12.10612)}
Losses at iteration 13: {'ner': np.float32(24.996252)}
Losses at iteration 14: {'ner': np.float32(12.697174)}
Losses at iteration 15: {'ner': np.float32(18.69332)}
Losses at iteration 16: {'ner': np.float32(15.811174)}
Losses at iteration 17: {'ner': np.float32(13.012798)}
Losses at iteration 18:

In [11]:
# Test sentences
test_texts = [
    "Remind me to call Ali tomorrow at 5pm",
    "Schedule a meeting at Starbucks every Monday at 10am",
    "Submit assignment by next Friday evening",
    "Go for a 30 minute walk daily",
]

print("\n--- Testing NER Model ---")
for text in test_texts:
    doc = nlp(text)
    print(f"\nInput: {text}")
    for ent in doc.ents:
        print(f"  {ent.text} -> {ent.label_}")



--- Testing NER Model ---

Input: Remind me to call Ali tomorrow at 5pm
  call Ali -> TASK
  5pm -> DEADLINE

Input: Schedule a meeting at Starbucks every Monday at 10am
  Schedule a -> TASK
  Starbucks -> LOCATION
  every Monday -> RECURRENCE
  10am -> DEADLINE

Input: Submit assignment by next Friday evening
  Submit assignment -> TASK
  next Friday -> DEADLINE

Input: Go for a 30 minute walk daily
  Go for -> TASK
  30 minute -> DURATION
  walk daily -> TASK


In [12]:
nlp.to_disk("models/entity_clf")


In [13]:
#text = "Deadline for submitting forms is next Wednesday"
#doc = nlp.make_doc(text)
#for token in doc:
#    print(f"{token.text} -> {token.idx}")