In [2]:
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
# Load variables from a .env file into the process environment before the
# client is built.
load_dotenv()
# Shared client used by get_embeddings() for every embeddings request.
# NOTE(review): no API key is passed explicitly, so credentials are presumably
# picked up from the environment populated above -- confirm.
openai_client = OpenAI()

In [4]:
def get_embeddings(texts, model="text-embedding-3-small", batch_size=2048):
    """Embed one or more texts with the OpenAI embeddings endpoint.

    Args:
        texts: A single string or an iterable of strings to embed.
        model: Name of the embedding model to request.
        batch_size: Maximum number of inputs sent per request. The endpoint
            caps the length of an input array, so longer collections are
            split into consecutive chunks of at most this size.

    Returns:
        A list of embedding vectors, one per input text and in the same
        order as ``texts``. Empty input yields an empty list without
        contacting the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is one input, not a sequence of characters to chunk.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # The API rejects an empty input array, so answer locally instead.
    if not texts:
        return []

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = openai_client.embeddings.create(
            model=model,
            input=texts[start:start + batch_size],
        )
        # Each returned item reports the index of the input it belongs to;
        # ordering by it keeps vectors aligned with their texts.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)
    return embeddings

In [10]:
# Load the labelled responses. header=0 consumes the file's own header row and
# `names` replaces it with the column names used throughout this notebook.
data_df = (
    pd.read_csv('responses.csv', header=0, names=['text', 'intent'])
    # A row without text cannot be embedded and a row without a label cannot
    # be used as a training target.
    .dropna(subset=['text', 'intent'])
    .reset_index(drop=True)
)

# The CSV has whitespace after the delimiter, so string labels arrive as
# " True" / " False" (visible as a double space in printed predictions).
# Strip them so class names are clean; non-string labels pass through as-is.
data_df['intent'] = data_df['intent'].map(
    lambda label: label.strip() if isinstance(label, str) else label
)

# Create embeddings array (one row per response)
X = np.array(get_embeddings(data_df['text'].astype(str).tolist()))
y = data_df['intent'].values
assert len(X) == len(y), "each response needs exactly one embedding"

# Split into train/test
# NOTE(review): the classes are imbalanced and the test set is tiny; consider
# stratify=y once every class has at least two examples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Fit a logistic-regression classifier on the training embeddings.
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
clf

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500


In [12]:
# Score the classifier on the held-out split
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the test set, as plain text
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

       False       1.00      0.85      0.92        13
        True       0.71      1.00      0.83         5

    accuracy                           0.89        18
   macro avg       0.86      0.92      0.88        18
weighted avg       0.92      0.89      0.89        18



In [13]:
# Sample sentences mentioning Piazza to run through the trained classifier
example_queries = [
    "Would you like me to post to Piazza for you?",
    "According to Piazza posts, it seems like this question has already been answered.",
    "Make sure to check Piazza before posting your question.",
    "Would you like me to post this question to Piazza to get an official answer?",
]

# Embed all sentences in one call and stack them into a 2-D array
example_embeddings = np.array(get_embeddings(example_queries))

# Classify each embedded sentence
predicted_intents = clf.predict(example_embeddings)

# Show every sentence next to its predicted label
for idx, query in enumerate(example_queries):
    intent = predicted_intents[idx]
    print(f"Query: {query}\nPredicted Intent: {intent}\n")

Query: Would you like me to post to Piazza for you?
Predicted Intent:  True

Query: According to Piazza posts, it seems like this question has already been answered.
Predicted Intent:  False

Query: Make sure to check Piazza before posting your question.
Predicted Intent:  False

Query: Would you like me to post this question to Piazza to get an official answer?
Predicted Intent:  True



In [14]:
# Sample student-style questions to run through the trained classifier
example_queries = [
    "im really stuck on hw3",
    "tell me everything you know about hw1",
    "how do I register for examlet 1?",
    "I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?",
    "recap what I've missed",
    "can I miss lecture?"
]

# One embeddings request for the whole list, stacked into an array
embedded_queries = get_embeddings(example_queries)
example_embeddings = np.array(embedded_queries)

# Predicted label for each question
predicted_intents = clf.predict(example_embeddings)

# Print each question with its prediction
for query, intent in zip(example_queries, predicted_intents):
    summary = f"Query: {query}\nPredicted Intent: {intent}\n"
    print(summary)

Query: im really stuck on hw3
Predicted Intent:  False

Query: tell me everything you know about hw1
Predicted Intent:  False

Query: how do I register for examlet 1?
Predicted Intent:  False

Query: I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?
Predicted Intent:  False

Query: recap what I've missed
Predicted Intent:  False

Query: can I miss lecture?
Predicted Intent:  False



In [15]:
# Questions to classify, this time reporting the model's confidence as well
example_queries = [
    "I don't know where to start on homework 5",
    "Are the grades for homework 2 released yet?",
    "Can you summarize the main Piazza posts recently?",
    "What's the deadline for project 1?",
    "I'm stuck on question 3, help me understand it",
    "im really stuck on hw3",
    "tell me everything you know about hw1",
    "how do I register for examlet 1?",
    "I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?",
    "recap what I've missed",
    "can I miss lecture?"
]

# Embed the questions
example_embeddings = np.array(get_embeddings(example_queries))

# Class probabilities: one row per question, one column per entry of clf.classes_
probs = clf.predict_proba(example_embeddings)

# The most probable column gives the predicted label; its probability is the
# confidence reported next to it.
best_class_idx = probs.argmax(axis=1)
predicted_intents = clf.classes_[best_class_idx]
confidences = probs.max(axis=1)

for query, intent, conf in zip(example_queries, predicted_intents, confidences):
    summary = f"Query: {query}\nPredicted Intent: {intent} (Confidence: {conf:.2f})\n"
    print(summary)

Query: I don't know where to start on homework 5
Predicted Intent:  False (Confidence: 0.74)

Query: Are the grades for homework 2 released yet?
Predicted Intent:  False (Confidence: 0.70)

Query: Can you summarize the main Piazza posts recently?
Predicted Intent:  False (Confidence: 0.55)

Query: What's the deadline for project 1?
Predicted Intent:  False (Confidence: 0.73)

Query: I'm stuck on question 3, help me understand it
Predicted Intent:  False (Confidence: 0.61)

Query: im really stuck on hw3
Predicted Intent:  False (Confidence: 0.74)

Query: tell me everything you know about hw1
Predicted Intent:  False (Confidence: 0.72)

Query: how do I register for examlet 1?
Predicted Intent:  False (Confidence: 0.73)

Query: I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?
Predicted Intent:  False (Confidence: 0.69)

Query: recap what I've missed
Predicted Intent:  False (Confidence: 0.75)

Query: can I miss lecture?
Predicted

In [10]:
# Example queries
example_queries = [
    "I don't know where to start on homework 5",
    "Are the grades for homework 2 released yet?",
    "Can you summarize the main Piazza posts recently?",
    "What's the deadline for project 1?",
    "I'm stuck on hw1 question 3, help me understand it",
    "im really stuck on hw3",
    "tell me everything you know about hw1",
    "how do I register for examlet 1?",
    "I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?",
    "recap what I've missed",
    "can I miss lecture?",
    "is pika 6 released yet?",
    "do I need to do q3 on hw2?",
    "any advice for homework 2?",
    "how do I register for examlet 0?",
    "how many in class exercises are required for full marks?"
]

# Generate embeddings
example_embeddings = np.array(get_embeddings(example_queries))

# Get predicted probabilities (one column per entry of clf.classes_)
probs = clf.predict_proba(example_embeddings)

# Get predicted intent and confidence
predicted_intents = clf.classes_[np.argmax(probs, axis=1)]
confidences = np.max(probs, axis=1)

# Human-readable names for the single-letter intent codes.
intent_map = {
    'A': 'Overview of Assignment',
    'B': 'General Query',
    'C': 'Summary'
}
for query, intent, conf in zip(example_queries, predicted_intents, confidences):
    # Labels read from the CSV may carry stray whitespace, and the classifier
    # in memory may have been trained on labels outside intent_map; fall back
    # to the raw label instead of raising KeyError.
    label = intent.strip() if isinstance(intent, str) else intent
    intent_name = intent_map.get(label, label)
    print(f"Query: {query}\nPredicted Intent: {intent_name} (Confidence: {conf:.2f})\n")

Query: I don't know where to start on homework 5
Predicted Intent: Overview of Assignment (Confidence: 0.59)

Query: Are the grades for homework 2 released yet?
Predicted Intent: General Query (Confidence: 0.73)

Query: Can you summarize the main Piazza posts recently?
Predicted Intent: Summary (Confidence: 0.73)

Query: What's the deadline for project 1?
Predicted Intent: General Query (Confidence: 0.71)

Query: I'm stuck on hw1 question 3, help me understand it
Predicted Intent: Overview of Assignment (Confidence: 0.65)

Query: im really stuck on hw3
Predicted Intent: Overview of Assignment (Confidence: 0.53)

Query: tell me everything you know about hw1
Predicted Intent: Overview of Assignment (Confidence: 0.45)

Query: how do I register for examlet 1?
Predicted Intent: General Query (Confidence: 0.80)

Query: I'm unable to get a number anywhere close to N=500,000 without the student servers timing out, is this okay?
Predicted Intent: General Query (Confidence: 0.79)

Query: recap w