In [1]:
# 1. Mount Google Drive to access dataset files (train.csv and test.csv).
# The drive is mounted at /content/drive, the prefix that base_path is built on.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime —
# confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Define the base path to the project folder in Drive (update this as needed for your Drive structure).
# Keep the trailing slash: file names such as "train.csv" are appended to this string directly.
base_path = "/content/drive/My Drive/Colab Notebooks/COMS5790/Final Project/"

In [4]:
# 2. Read the train and test splits from the project folder into DataFrames.
# Both CSV files are expected to hold the short-answer data plus a 'label' column.
import pandas as pd

# Same reader settings for both splits; train.csv is read first, then test.csv.
df_train, df_test = (
    pd.read_csv(base_path + csv_name, encoding='latin-1')
    for csv_name in ("train.csv", "test.csv")
)

# Optional sanity check: preview the first rows
# (columns such as Question, Response, CorrectAnswer, label, ...).
print(df_train.head())

# 3. Define a simple rule-based grading function.
# This function will compare the student's response with the correct answer.
# It uses keyword overlap:
#    - If the response contains ALL keywords from the correct answer, we mark it as Correct.
#    - If the response contains SOME (at least ~half) of the keywords, we mark it as Partially Correct.
#    - Otherwise, mark the response as Incorrect.
import math
import string

# Basic stopword list: common function words that are ignored when computing
# keyword overlap, so that only meaningful keywords are compared.
stopwords = {"the", "a", "an", "of", "to", "and", "is", "are", "in", "for", "on", "this", "that", "it"}

# Translation table that deletes ASCII punctuation. Built once at import time
# instead of twice on every call to grade_response.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _normalize(text: str) -> str:
    """Lowercase `text` and delete ASCII punctuation characters."""
    return text.lower().translate(_PUNCTUATION_TABLE)


def grade_response(correct_answer: str, student_answer: str) -> int:
    """
    Grade a student answer against the reference answer by keyword overlap.

    Both texts are lowercased, stripped of ASCII punctuation, split on
    whitespace and filtered through `stopwords`. The grade depends on how many
    of the reference answer's unique keywords occur in the student answer.

    Args:
        correct_answer: Reference answer text.
        student_answer: Student response text.

    Returns:
        1  (Correct)           if every reference keyword appears in the response,
        0  (Partially Correct) if at least half of them (rounded up) appear,
        -1 (Incorrect)         otherwise.
        If the reference answer has no keywords left after stopword removal,
        returns 1 only when the normalized texts match exactly, else -1.
        If either argument is not a string (e.g. None, or the float NaN that
        pandas uses for an empty cell), returns -1 instead of raising.
    """
    # Missing / non-text input cannot be graded; treat it as Incorrect rather
    # than failing with AttributeError on `.lower()`.
    if not isinstance(correct_answer, str) or not isinstance(student_answer, str):
        return -1

    correct = _normalize(correct_answer)
    response = _normalize(student_answer)

    # Sets: each keyword counts once regardless of how often it is repeated.
    correct_keywords = {w for w in correct.split() if w not in stopwords}
    response_keywords = {w for w in response.split() if w not in stopwords}

    if not correct_keywords:
        # Reference answer is empty or consists only of stopwords: fall back to
        # an exact comparison of the normalized texts (already lowercased).
        return 1 if response.strip() == correct.strip() else -1

    total_keywords = len(correct_keywords)
    overlap_count = len(correct_keywords & response_keywords)

    if overlap_count == total_keywords:
        return 1  # Correct: all reference keywords are present

    # At least half of the keywords, rounded up. Because total_keywords >= 1
    # here, the threshold is always >= 1, so zero overlap can never pass it.
    partial_threshold = math.ceil(total_keywords * 0.5)
    if overlap_count >= partial_threshold:
        return 0  # Partially Correct

    return -1  # Incorrect: too few (or none) of the keywords matched

# 4. Generate predictions on the test set using the rule-based function
y_true = df_test['label'].tolist()        # True labels from the dataset (-1, 0, or 1)

# Walk the two text columns in lockstep rather than with iterrows(), which
# builds a full Series per row just to read two fields.
# A row whose reference answer OR student response is missing (NaN/None) is
# predicted as -1 (Incorrect) instead of being passed to grade_response, which
# expects text. Any other non-string cell value is graded by its text form.
y_pred = [
    -1 if pd.isna(correct_ans) or pd.isna(student_ans)
    else grade_response(str(correct_ans), str(student_ans))
    for correct_ans, student_ans in zip(df_test['CorrectAnswer'], df_test['Response'])
]

# 5. Evaluate the model's performance using common metrics: Accuracy, Precision, Recall, F1-score, and Confusion Matrix.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Map the original labels to 0,1,2 for consistency (Incorrect->0, Partially Correct->1, Correct->2).
# A label outside {-1, 0, 1} raises KeyError here, which surfaces bad data early.
label_mapping = {-1: 0, 0: 1, 1: 2}
y_true_ids = [label_mapping[lbl] for lbl in y_true]
y_pred_ids = [label_mapping[lbl] for lbl in y_pred]

# Class ids and their display names, in matching order. Passing the ids
# explicitly keeps every class as a row/column even when one of them never
# occurs in y_true or y_pred; without `labels`, target_names could be paired
# with the wrong classes (or rejected) and the confusion matrix would shrink.
class_ids = sorted(label_mapping.values())
target_names = ["Incorrect", "Partially Correct", "Correct"]

# Calculate accuracy
accuracy = accuracy_score(y_true_ids, y_pred_ids)
print(f"Test Accuracy: {accuracy:.3f}")

# Print detailed classification report (precision, recall, F1 for each class)
print("Classification Report:")
print(classification_report(
    y_true_ids,
    y_pred_ids,
    labels=class_ids,
    target_names=target_names,
    digits=2,
    zero_division=0,
))

# Print the confusion matrix (rows = true class, columns = predicted class, ordered as class_ids)
print("Confusion Matrix:")
print(confusion_matrix(y_true_ids, y_pred_ids, labels=class_ids))


   Experiment    Topic   ID                        Question  \
0           1  Physics  104  How thin can a fiber optic be?   
1           1  Physics  126  How thin can a fiber optic be?   
2           1  Physics  130  How thin can a fiber optic be?   
3           1  Physics  131  How thin can a fiber optic be?   
4           1  Physics  156  How thin can a fiber optic be?   

                                 Response            CorrectAnswer  label  
0                        a strand of hair  As thin as a human hair      1  
1                  Really thin and small   As thin as a human hair     -1  
2                 as thin as a human hair  As thin as a human hair      1  
3  Very thin smaller than a pice of hair   As thin as a human hair      1  
4     Less than the width of a human hair  As thin as a human hair      1  
Test Accuracy: 0.700
Classification Report:
                   precision    recall  f1-score   support

        Incorrect       0.73      0.95      0.83     16614
Pa