In [2]:
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
load_dotenv()
openai_client = OpenAI()

In [4]:
def get_embeddings(texts):
    response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=texts
    )
    # Extract the embedding vector
    return [item.embedding for item in response.data]

In [5]:
data_df = pd.read_csv('intents.csv', header=0, names=['text', 'intent'])

# Create embeddings array
X = np.array(get_embeddings(data_df['text'].tolist()))
y = data_df['intent'].values

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
import joblib
# Train a simple classifier
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

lr = clf
coef = lr.coef_        # shape: (n_classes, n_features)
intercept = lr.intercept_  # shape: (n_classes,)
classes = lr.classes_

print("Classes:", classes)
print("Coef shape:", coef.shape)
print("Intercept shape:", intercept.shape)

# Save them to disk for Lambda use
np.savez("lr_params.npz", coef=coef, intercept=intercept, classes=classes)

Classes: ['general' 'overview' 'summarize']
Coef shape: (3, 1536)
Intercept shape: (3,)


In [7]:
# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     general       0.81      1.00      0.90        13
    overview       1.00      0.80      0.89        15
   summarize       1.00      1.00      1.00         6

    accuracy                           0.91        34
   macro avg       0.94      0.93      0.93        34
weighted avg       0.93      0.91      0.91        34



In [13]:
# Example queries to test
example_queries = [
    "I don't know where to start on homework 5",
    "Are the grades for homework 2 released yet?",
    "Can you summarize the main Piazza posts recently?",
    "What's the deadline for project 1?",
    "I'm stuck on question 3, help me understand it"
]

# Generate embeddings for these queries
example_embeddings = np.array(get_embeddings(example_queries))

# Make predictions
predicted_intents = clf.predict(example_embeddings)

# Print results
for query, intent in zip(example_queries, predicted_intents):
    print(f"Query: {query}\nPredicted Intent: {intent}\n")

Query: I don't know where to start on homework 5
Predicted Intent: overview

Query: Are the grades for homework 2 released yet?
Predicted Intent: general

Query: Can you summarize the main Piazza posts recently?
Predicted Intent: summarize

Query: What's the deadline for project 1?
Predicted Intent: general

Query: I'm stuck on question 3, help me understand it
Predicted Intent: overview



In [8]:
# Example queries to test
example_queries = [
    "im really stuck on hw3",
    "tell me everything you know about hw1",
    "how do I register for examlet 1?",
    "I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?",
    "recap what I've missed",
    "can I miss lecture?"
]

# Generate embeddings for these queries
example_embeddings = np.array(get_embeddings(example_queries))

# Make predictions
predicted_intents = clf.predict(example_embeddings)

# Print results
for query, intent in zip(example_queries, predicted_intents):
    print(f"Query: {query}\nPredicted Intent: {intent}\n")

Query: im really stuck on hw3
Predicted Intent: A

Query: tell me everything you know about hw1
Predicted Intent: A

Query: how do I register for examlet 1?
Predicted Intent: B

Query: I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?
Predicted Intent: B

Query: recap what I've missed
Predicted Intent: C

Query: can I miss lecture?
Predicted Intent: B



In [9]:
# Example queries
example_queries = [
    "I don't know where to start on homework 5",
    "Are the grades for homework 2 released yet?",
    "Can you summarize the main Piazza posts recently?",
    "What's the deadline for project 1?",
    "I'm stuck on question 3, help me understand it",
    "im really stuck on hw3",
    "tell me everything you know about hw1",
    "how do I register for examlet 1?",
    "I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?",
    "recap what I've missed",
    "can I miss lecture?"
]

# Generate embeddings
example_embeddings = np.array(get_embeddings(example_queries))

# Get predicted probabilities
probs = clf.predict_proba(example_embeddings)

# Get predicted intent and confidence
predicted_intents = clf.classes_[np.argmax(probs, axis=1)]
confidences = np.max(probs, axis=1)

for query, intent, conf in zip(example_queries, predicted_intents, confidences):
    print(f"Query: {query}\nPredicted Intent: {intent} (Confidence: {conf:.2f})\n")

Query: I don't know where to start on homework 5
Predicted Intent: A (Confidence: 0.59)

Query: Are the grades for homework 2 released yet?
Predicted Intent: B (Confidence: 0.73)

Query: Can you summarize the main Piazza posts recently?
Predicted Intent: C (Confidence: 0.73)

Query: What's the deadline for project 1?
Predicted Intent: B (Confidence: 0.71)

Query: I'm stuck on question 3, help me understand it
Predicted Intent: A (Confidence: 0.54)

Query: im really stuck on hw3
Predicted Intent: A (Confidence: 0.53)

Query: tell me everything you know about hw1
Predicted Intent: A (Confidence: 0.45)

Query: how do I register for examlet 1?
Predicted Intent: B (Confidence: 0.80)

Query: I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?
Predicted Intent: B (Confidence: 0.79)

Query: recap what I've missed
Predicted Intent: C (Confidence: 0.72)

Query: can I miss lecture?
Predicted Intent: B (Confidence: 0.71)



In [10]:
# Example queries
example_queries = [
    "I don't know where to start on homework 5",
    "Are the grades for homework 2 released yet?",
    "Can you summarize the main Piazza posts recently?",
    "What's the deadline for project 1?",
    "I'm stuck on hw1 question 3, help me understand it",
    "im really stuck on hw3",
    "tell me everything you know about hw1",
    "how do I register for examlet 1?",
    "I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?",
    "recap what I've missed",
    "can I miss lecture?",
    "is pika 6 released yet?",
    "do I need to do q3 on hw2?",
    "any advice for homework 2?",
    "how do I register for examlet 0?",
    "how many in class exercises are required for full marks?"
]

# Generate embeddings
example_embeddings = np.array(get_embeddings(example_queries))

# Get predicted probabilities
probs = clf.predict_proba(example_embeddings)

# Get predicted intent and confidence
predicted_intents = clf.classes_[np.argmax(probs, axis=1)]
confidences = np.max(probs, axis=1)

intent_map = {
    'A': 'Overview of Assignment',
    'B': 'General Query',
    'C': 'Summary'
}
for query, intent, conf in zip(example_queries, predicted_intents, confidences):
    print(f"Query: {query}\nPredicted Intent: {intent_map[intent]} (Confidence: {conf:.2f})\n")

Query: I don't know where to start on homework 5
Predicted Intent: Overview of Assignment (Confidence: 0.59)

Query: Are the grades for homework 2 released yet?
Predicted Intent: General Query (Confidence: 0.73)

Query: Can you summarize the main Piazza posts recently?
Predicted Intent: Summary (Confidence: 0.73)

Query: What's the deadline for project 1?
Predicted Intent: General Query (Confidence: 0.71)

Query: I'm stuck on hw1 question 3, help me understand it
Predicted Intent: Overview of Assignment (Confidence: 0.65)

Query: im really stuck on hw3
Predicted Intent: Overview of Assignment (Confidence: 0.53)

Query: tell me everything you know about hw1
Predicted Intent: Overview of Assignment (Confidence: 0.45)

Query: how do I register for examlet 1?
Predicted Intent: General Query (Confidence: 0.80)

Query: I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?
Predicted Intent: General Query (Confidence: 0.79)

Query: recap w

In [9]:
# Example queries
example_queries = [
    "what are people saying about pset 5?"
]

# Generate embeddings
example_embeddings = np.array(get_embeddings(example_queries))

# Get predicted probabilities
probs = clf.predict_proba(example_embeddings)

# Get predicted intent and confidence
predicted_intents = clf.classes_[np.argmax(probs, axis=1)]
confidences = np.max(probs, axis=1)

for query, intent, conf in zip(example_queries, predicted_intents, confidences):
    print(f"Query: {query}\nPredicted Intent: {intent} (Confidence: {conf:.2f})\n")

Query: what are people saying about pset 5?
Predicted Intent: general (Confidence: 0.55)

