In [2]:
!pip install pandas numpy scikit-learn nltk spacy
!python -m spacy download en_core_web_sm


Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting spacy
  Using cached spacy-3.8.11-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting click (from nltk)
  Using cached click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.15-cp313-cp313-win_amd64.whl.metadata (2.3 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.13-cp313-cp313-win_amd64.whl.metadata (9.9 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.12-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
Collecting thinc<8.4.0,>=8.

In [3]:
import pandas as pd

# Sample Q&A dataset, written one row per question so each expected answer
# sits next to the student answer it will be compared with.
qa_columns = ("Question", "Expected_Answer", "Student_Answer")
qa_rows = [
    (
        "What are the benefits of exercise?",
        "Exercise improves cardiovascular health, strengthens muscles, and boosts mood.",
        "Exercise strengthens muscles and improves mood.",
    ),
    (
        "Explain the water cycle.",
        "The water cycle involves evaporation, condensation, precipitation, and collection.",
        "Water evaporates and then rains back.",
    ),
]

# Pivot the rows into the column -> list mapping handed to pandas.
data = {name: list(values) for name, values in zip(qa_columns, zip(*qa_rows))}

df = pd.DataFrame(data)
df


Unnamed: 0,Question,Expected_Answer,Student_Answer
0,What are the benefits of exercise?,"Exercise improves cardiovascular health, stren...",Exercise strengthens muscles and improves mood.
1,Explain the water cycle.,"The water cycle involves evaporation, condensa...",Water evaporates and then rains back.


In [4]:
import spacy

# Name of the spaCy pipeline package loaded for tokenisation and lemmatisation.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

def extract_keywords(text):
    """Return the lemmas of the content words in ``text``.

    The text is lower-cased and passed through the module-level spaCy
    pipeline ``nlp``. Tokens that are purely alphabetic and are not stop
    words are kept, each represented by its lemma. Document order and
    repeated words are preserved.

    Args:
        text: The answer text. Any value that is not a ``str`` (for example
            ``None`` or the ``NaN`` pandas stores for a missing cell) is
            treated as an unanswered question.

    Returns:
        list: Lemma strings in document order; an empty list when ``text``
        is not a string.
    """
    # A missing answer has no `.lower()`; report "no keywords" instead of
    # aborting the whole column-wise apply with an AttributeError.
    if not isinstance(text, str):
        return []

    doc = nlp(text.lower())
    return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

# Run the same extractor over both answer columns so the resulting keyword
# lists are produced the same way for expected and student answers.
for answer_col, keyword_col in (
    ("Expected_Answer", "Expected_Keywords"),
    ("Student_Answer", "Student_Keywords"),
):
    df[keyword_col] = df[answer_col].apply(extract_keywords)
df


Unnamed: 0,Question,Expected_Answer,Student_Answer,Expected_Keywords,Student_Keywords
0,What are the benefits of exercise?,"Exercise improves cardiovascular health, stren...",Exercise strengthens muscles and improves mood.,"[exercise, improve, cardiovascular, health, st...","[exercise, strengthen, muscle, improve, mood]"
1,Explain the water cycle.,"The water cycle involves evaporation, condensa...",Water evaporates and then rains back.,"[water, cycle, involve, evaporation, condensat...","[water, evaporate, rain]"


In [5]:
def detect_gap(expected, student):
    """List the expected keywords that the student's answer does not contain.

    Args:
        expected: Iterable of keyword strings from the reference answer, or
            ``None``.
        student: Iterable of keyword strings from the student's answer, or
            ``None``.

    Returns:
        list: Keywords present in ``expected`` but absent from ``student``,
        in the order they first appear in ``expected``. Each missing keyword
        is reported once, even if the reference answer repeats it.
    """
    # Set lookup keeps the membership test O(1) per keyword instead of
    # rescanning the student's list for every expected word.
    known = set(student or ())
    # dict.fromkeys drops repeated keywords while keeping first-seen order.
    return [word for word in dict.fromkeys(expected or ()) if word not in known]

def _row_gap(row):
    """Compute the missing keywords for one row of the Q&A frame."""
    return detect_gap(row["Expected_Keywords"], row["Student_Keywords"])


# axis=1 hands each row to the helper, yielding one gap list per question.
df["Knowledge_Gap"] = df.apply(_row_gap, axis=1)
df


Unnamed: 0,Question,Expected_Answer,Student_Answer,Expected_Keywords,Student_Keywords,Knowledge_Gap
0,What are the benefits of exercise?,"Exercise improves cardiovascular health, stren...",Exercise strengthens muscles and improves mood.,"[exercise, improve, cardiovascular, health, st...","[exercise, strengthen, muscle, improve, mood]","[cardiovascular, health, boost]"
1,Explain the water cycle.,"The water cycle involves evaporation, condensa...",Water evaporates and then rains back.,"[water, cycle, involve, evaporation, condensat...","[water, evaporate, rain]","[cycle, involve, evaporation, condensation, pr..."
