In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack, csr_matrix
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from imblearn.over_sampling import RandomOverSampler
import contractions 
import unicodedata
import lightgbm as lgb

### Preprocessing Strategy 
Ultimate goal: predict (1) whether there is a misconception and (2) if there is a misconception, what kind? 

Preprocessing steps:

- Clean up `StudentExplanation` like we did in *01_data_wrangling_eda.ipynb*
- Clean up `Misconception` like we did in *01_data_wrangling_eda.ipynb*
- Use answer key dictionary with question_id and the correct answer, then use this to make a new feature, `is_correct` (I'm assuming a teacher would have an answer key to check whether the MC answer is correct, and we don't need ML for this part)
- Collapse `Category` into Correct, Misconception, and Neither. We don't need the True/False label because we don't need ML to tell us if the MC answer is correct. 


In [2]:
# Load the datasets
directory = "map-charting-student-math-misunderstandings"
train = pd.read_csv(f"{directory}/train.csv")
# test = pd.read_csv(f"{directory}/test.csv") # this csv only has 3 rows and 2 of the 15 questions


#### Preprocessing StudentExplanation

In [3]:
# Text preprocessing function
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # All lower case
    lower_text = text.lower() 

    # No accented characters
    no_accents = unicodedata.normalize('NFKD', lower_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Expand contractions
    expanded = contractions.fix(no_accents)

    # Remove repeated words
    no_repeats = re.sub(r'\b(\w+)( \1\b)+', r'\1', expanded, flags=re.IGNORECASE)

    # Remove digits and special characters
    no_special_char = re.sub(r'[^a-zA-Z\s]', '', no_repeats)


    # Remove stop words
    words = no_special_char.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    return ' '.join(filtered_words)

train['PreprocessedExplanation'] = train['StudentExplanation'].apply(preprocess)

In [4]:
# Standardize labels & remove special characters
train.Misconception = train.Misconception.str.lower() # all lower case
train.Misconception = train.Misconception.str.replace('-','') # remove hyphens
train.Misconception = train.Misconception.str.replace('_','') # remove underscores
train.Misconception = train.Misconception.str.replace(' ','') # remove spaces
train.Misconception.unique()

array([nan, 'incomplete', 'wnb', 'swapdividend', 'mult', 'flipchange',
       'irrelevant', 'wrongfraction', 'additive', 'notvariable',
       'addingterms', 'inverseoperation', 'inversion', 'duplication',
       'wrongoperation', 'wholenumberslarger', 'longerisbigger',
       'ignoreszeroes', 'shorterisbigger', 'addingacross',
       'denominatoronlychange', 'incorrectequivalentfractionaddition',
       'division', 'subtraction', 'unknowable', 'definition', 'interior',
       'positive', 'tacking', 'wrongterm', 'firstterm', 'baserate',
       'multiplyingby4', 'certainty', 'scale'], dtype=object)

In [5]:
# Fill NA values in Misconception
train['Misconception'] = train['Misconception'].fillna('none')


In [6]:
# Logically combine similar misconception categories 
wholenumber_miscs = ['wholenumberbias', 'wholenumberslarger', 'longerisbigger', 'shorterisbigger','wnb']
fraction_miscs = ['wrongfraction','denominatoronlychange','incorrectequivalentfractionaddition','swapdividend']
operation_miscs = ['wrongoperation','inverseoperation','inversion','division']
multiplication_miscs = ['multiplyingby4','mult']
addition_miscs = ['additive','addingterms','addingacross']
variable_miscs = ['notvariable','wrongterm','firstterm','tacking']
definition_miscs = ['definition','baserate','scale','certainty','interior','unknowable']
procedure_miscs = ['duplication','irrelevant','flipchange','ignoreszeroes']


# Replace 
train['Misconception'] = train['Misconception'].replace(wholenumber_miscs, 'wholenumber')
train['Misconception'] = train['Misconception'].replace(fraction_miscs, 'fraction')
train['Misconception'] = train['Misconception'].replace(operation_miscs, 'operation')
train['Misconception'] = train['Misconception'].replace(multiplication_miscs, 'multiplication')
train['Misconception'] = train['Misconception'].replace(addition_miscs, 'addition')
train['Misconception'] = train['Misconception'].replace(variable_miscs, 'variable')
train['Misconception'] = train['Misconception'].replace(definition_miscs, 'definition')
train['Misconception'] = train['Misconception'].replace(procedure_miscs, 'procedure')

In [7]:
# Rename Misconception to MisconceptionType for clarity
train = train.rename(columns={'Misconception':'MisconceptionType'})

In [8]:
# Use answer_key to make is_correct
answer_key = pd.read_csv(f"{directory}/answer_key.csv")

# Merge answer key into train 
train = train.merge(answer_key.rename(columns={"MC_Answer": "Correct_Answer"}),
                    on="QuestionId", how="left")

# Create is_correct column
train["is_correct"] = (train["MC_Answer"] == train["Correct_Answer"]).astype(int)

# Remember, an error makes it necessary to manually set QuestionId 31778 to correct if the student answered 9. 
train.loc[(train['QuestionId'] == 31778) & (train['MC_Answer']=='\( 9 \)'), 'is_correct'] = 1 

In [9]:
# Collapse Category into NewCategory
train['NewCategory'] = train["Category"].str.split("_").str[1]
train.head()

Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,MisconceptionType,PreprocessedExplanation,Correct_Answer,is_correct,NewCategory
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,none,ne third equal tree nineth,\( \frac{1}{3} \),1,Correct
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,none,thirds third shaded,\( \frac{1}{3} \),1,Correct
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,none,rd half th simplee understand,\( \frac{1}{3} \),1,Neither
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,none,goes everything goes nine,\( \frac{1}{3} \),1,Neither
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,none,every coloured,\( \frac{1}{3} \),1,Correct


### Strategy
Because I'm trying to predict 2 target variables (`Category` and `Misconception`), I'm going to take a hierarchical approach. First I'll attempt to predict `Category`, then if there is a misconception, I'll go on to predict `Misconception`. For the baseline, I'm going to start with a Multinomial Naive Bayes model & TfidfVectorizer. 

In [10]:
# Try it with the original Category first
X_train, X_test, y_train, y_test = train_test_split(
    train[["QuestionId", "PreprocessedExplanation",'is_correct']], train["Category"], test_size=0.2, random_state=42
)

# Preprocess explanation, fit only on training data
def tok(text):
    tt = ToktokTokenizer()
    return tt.tokenize(text)

tfidf = TfidfVectorizer(ngram_range=(1,1), max_features=10000,tokenizer=tok)
X_train_text = tfidf.fit_transform(X_train["PreprocessedExplanation"])
X_test_text  = tfidf.transform(X_test["PreprocessedExplanation"])

# Combine features
X_train_final = hstack([X_train_text, X_train[['QuestionId','is_correct']]])
X_test_final = hstack([X_test_text, X_test[['QuestionId','is_correct']]])

# Train MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_final, y_train)

# predict
y_pred = nb.predict(X_test_final)
print(classification_report(y_test, y_pred))




                     precision    recall  f1-score   support

      False_Correct       0.00      0.00      0.00        39
False_Misconception       0.70      0.71      0.71      1984
      False_Neither       0.76      0.33      0.46      1238
       True_Correct       0.64      0.99      0.78      2969
 True_Misconception       0.00      0.00      0.00        79
       True_Neither       0.81      0.15      0.25      1031

           accuracy                           0.67      7340
          macro avg       0.49      0.36      0.37      7340
       weighted avg       0.69      0.67      0.62      7340



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Then try with NewCategory
X_train, X_test, y_train, y_test = train_test_split(
    train[["QuestionId", "PreprocessedExplanation",'is_correct']], train["NewCategory"], test_size=0.2, random_state=42
)

# Preprocess explanation, fit only on training data
def tok(text):
    tt = ToktokTokenizer()
    return tt.tokenize(text)

tfidf = TfidfVectorizer(ngram_range=(1,1), max_features=10000,tokenizer=tok)
X_train_text = tfidf.fit_transform(X_train["PreprocessedExplanation"])
X_test_text  = tfidf.transform(X_test["PreprocessedExplanation"])

# Combine features
X_train_final = hstack([X_train_text, X_train[['QuestionId','is_correct']]])
X_test_final = hstack([X_test_text, X_test[['QuestionId','is_correct']]])

# Train MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_final, y_train)

# predict
y_pred = nb.predict(X_test_final)
print(classification_report(y_test, y_pred))




               precision    recall  f1-score   support

      Correct       0.68      0.90      0.78      3008
Misconception       0.72      0.64      0.68      2063
      Neither       0.61      0.40      0.49      2269

     accuracy                           0.68      7340
    macro avg       0.67      0.65      0.65      7340
 weighted avg       0.67      0.68      0.66      7340



Tune alpha

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {"alpha": [0.01, 0.1, 0.5, 1.0, 2.0, 5.0]}
grid = GridSearchCV(MultinomialNB(), param_grid, scoring="f1_macro", cv=5, n_jobs=-1)
grid.fit(X_train_final, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

y_pred = grid.best_estimator_.predict(X_test_final)
print(classification_report(y_test, y_pred))


Best params: {'alpha': 0.1}
Best CV score: 0.6405889498290076
               precision    recall  f1-score   support

      Correct       0.69      0.89      0.78      3008
Misconception       0.71      0.66      0.68      2063
      Neither       0.59      0.41      0.48      2269

     accuracy                           0.67      7340
    macro avg       0.66      0.65      0.65      7340
 weighted avg       0.67      0.67      0.66      7340



Tune TFIDF vectorizer

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    train[["QuestionId", "PreprocessedExplanation",'is_correct']], train["NewCategory"], test_size=0.25, random_state=42
)

# Preprocess explanation, fit only on training data
def tok(text):
    tt = ToktokTokenizer()
    return tt.tokenize(text)

# Tune ngram_range (1,1) (1,2) or (1,3), max_features 10000 20000 or 50000, and set sublinear_tf = True
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=50000,sublinear_tf=True,tokenizer=tok)
X_train_text = tfidf.fit_transform(X_train["PreprocessedExplanation"])
X_test_text  = tfidf.transform(X_test["PreprocessedExplanation"])

# Combine features
X_train_final = hstack([X_train_text, X_train[['QuestionId','is_correct']]])
X_test_final = hstack([X_test_text, X_test[['QuestionId','is_correct']]])

# Handle class imbalance
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train_final, y_train)

# Train MultinomialNB
nb_stage1 = MultinomialNB(alpha=0.5)
nb_stage1.fit(X_res, y_res)

# predict
y_pred = nb_stage1.predict(X_test_final)
print('Stage 1 Report:\n',classification_report(y_test, y_pred))




Stage 1 Report:
                precision    recall  f1-score   support

      Correct       0.76      0.82      0.79      3765
Misconception       0.73      0.78      0.76      2549
      Neither       0.60      0.50      0.55      2860

     accuracy                           0.71      9174
    macro avg       0.70      0.70      0.70      9174
 weighted avg       0.70      0.71      0.70      9174





In [14]:
# Assume you have a column in your train DataFrame: 'MisconceptionType'
# This is the fine-grained label

# Align features after resampling with the MisconceptionType column
# First, get the mask for rows that are Misconception in the resampled data
mask_stage2_train = (y_res == "Misconception")

# Features for Stage 2
X_train_stage2 = X_res[mask_stage2_train]

# Target for Stage 2 (aligned with X_train_stage2)
# To align, we need to get the corresponding rows from the original train DataFrame
# Use the indices returned by RandomOverSampler
miscon_indices = ros.sample_indices_[mask_stage2_train]  # indices of Misconception rows in the original X_train_final
y_train_stage2 = train.iloc[miscon_indices]["MisconceptionType"]


In [15]:
nb_stage2 = MultinomialNB(alpha=0.1)
nb_stage2.fit(X_train_stage2, y_train_stage2)

In [16]:
# Stage 1 predictions
y_pred_stage1 = nb_stage1.predict(X_test_final)

# Mask rows predicted as Misconception
mask_stage2_test = (y_pred_stage1 == "Misconception")

# Convert to CSR for slicing
X_test_final = X_test_final.tocsr()

# Stage 2 features
X_test_stage2 = X_test_final[mask_stage2_test]

# Stage 2 predictions
y_pred_stage2 = nb_stage2.predict(X_test_stage2)

# Combine results
results = pd.DataFrame({"category_pred": y_pred_stage1}, index=X_test.index)
results.loc[mask_stage2_test, "misconception_type_pred"] = y_pred_stage2
results["misconception_type_pred"].fillna("N/A", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  results["misconception_type_pred"].fillna("N/A", inplace=True)


In [17]:
# Stage 1 evaluation
print("Stage 1 report:\n", classification_report(y_test, results["category_pred"]))

# Stage 2 evaluation (only on true Misconception rows in test set)
mask_true_mis = (y_test == "Misconception")
true_types = train.loc[X_test.index[mask_true_mis], "MisconceptionType"]
pred_types = results.loc[mask_true_mis, "misconception_type_pred"]
print("Stage 2 report:\n", classification_report(true_types, pred_types))


Stage 1 report:
                precision    recall  f1-score   support

      Correct       0.76      0.82      0.79      3765
Misconception       0.73      0.78      0.76      2549
      Neither       0.60      0.50      0.55      2860

     accuracy                           0.71      9174
    macro avg       0.70      0.70      0.70      9174
 weighted avg       0.70      0.71      0.70      9174

Stage 2 report:
                 precision    recall  f1-score   support

           N/A       0.00      0.00      0.00         0
      addition       0.00      0.00      0.00       337
    definition       0.00      0.00      0.00       161
      fraction       0.00      0.00      0.00       322
    incomplete       0.25      0.00      0.01       384
multiplication       0.00      0.00      0.00       121
          none       0.00      0.00      0.00         0
     operation       0.00      0.00      0.00       114
      positive       0.00      0.00      0.00       149
     procedure   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 2 step classification notes
When we only take the predicted Misconception category from the test set, it's too small to give a nice prediction of MisconceptionType. Compare to below where I just predict MisconceptionType from the full data set. Maybe I should just combine NewCategory and MisconceptionType into one target variable, then predict that with Naive Bayes or a tree method.

### Multinomial Naive Bayes Summary (Baseline Model)
- Overall, not a bad baseline model 
- Collapsed NewCategory works better than Category because it's so hard to categorize the rare cases (e.g., False_Correct & True_Misconception)
- Perhaps unsurprisingly, the '_Neither' categories are the most difficult to categorize, though this improves slightly with tuning

Now let's train the Misconception types

In [18]:
# Try with y = Misconception only for values with a misconception
train_subset = train[train['NewCategory']=='Misconception']
   
X_train, X_test, y_train, y_test = train_test_split(
    train_subset[["QuestionId", "PreprocessedExplanation",'is_correct']], train_subset["MisconceptionType"], test_size=0.2, random_state=42
)

# Preprocess explanation, fit only on training data
def tok(text):
    tt = ToktokTokenizer()
    return tt.tokenize(text)

tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=50000,tokenizer=tok,sublinear_tf=True)
X_train_text = tfidf.fit_transform(X_train["PreprocessedExplanation"])
X_test_text  = tfidf.transform(X_test["PreprocessedExplanation"])

# Combine features
X_train_final = hstack([X_train_text, X_train[['QuestionId','is_correct']]])
X_test_final = hstack([X_test_text, X_test[['QuestionId','is_correct']]])

# Train MultinomialNB
nb = MultinomialNB(alpha=0.1)
nb.fit(X_train_final, y_train)

# predict
y_pred = nb.predict(X_test_final)
print(classification_report(y_test, y_pred))




                precision    recall  f1-score   support

      addition       0.85      0.81      0.83       269
    definition       0.92      0.96      0.94       114
      fraction       0.65      0.77      0.71       237
    incomplete       0.85      0.95      0.89       301
multiplication       0.91      0.51      0.65        94
     operation       0.84      0.34      0.49       108
      positive       0.95      0.92      0.93       119
     procedure       0.71      0.82      0.76       268
   subtraction       0.94      0.84      0.89       122
      variable       0.74      0.94      0.83       217
   wholenumber       0.96      0.57      0.71       123

      accuracy                           0.80      1972
     macro avg       0.85      0.77      0.78      1972
  weighted avg       0.82      0.80      0.80      1972



### Ask Aditya
How would I set this up to 1st predict if there is a misconception, then predict what kind only for those the model predicted to be a misconception?

What model would you try after this? A neural network (CNN?), a transformer (BERT,GPT)? 

Data set size: ~37K rows

But <10K records with a misconception


Remake X with y_pred then put thru next model 
My accuracy score is pretty good, but BERT could advance it. Might not need all the features, just the PreprocessedExplanation. Can try it, but keep it simple. If you see a score around 68 or 70, it's promising

For many categories, may need to try RandomForest or XGBoost or GradientBoost or CatBoost 


### Combining NewCategory and MisconceptionType

In [19]:
# Make a copy to avoid modifying the original
train_combined = train.copy()

# Replace 'Misconception' in NewCategory with the MisconceptionType
train_combined["CombinedTarget"] = train_combined.apply(
    lambda row: row["MisconceptionType"] if row["NewCategory"] == "Misconception" 
                else row["NewCategory"],
    axis=1
)

# Check results
train_combined[["NewCategory", "MisconceptionType", "CombinedTarget"]].tail(10)


Unnamed: 0,NewCategory,MisconceptionType,CombinedTarget
36686,Neither,none,Neither
36687,Neither,none,Neither
36688,Misconception,definition,definition
36689,Misconception,definition,definition
36690,Neither,none,Neither
36691,Neither,none,Neither
36692,Neither,none,Neither
36693,Neither,none,Neither
36694,Neither,none,Neither
36695,Neither,none,Neither


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    train_combined[["QuestionId", "PreprocessedExplanation",'is_correct']], train_combined["CombinedTarget"], test_size=0.25, random_state=42
)

# Preprocess explanation, fit only on training data
def tok(text):
    tt = ToktokTokenizer()
    return tt.tokenize(text)

# Tune ngram_range (1,1) (1,2) or (1,3), max_features 10000 20000 or 50000, and set sublinear_tf = True
tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=50000,sublinear_tf=True,tokenizer=tok)
X_train_text = tfidf.fit_transform(X_train["PreprocessedExplanation"])
X_test_text  = tfidf.transform(X_test["PreprocessedExplanation"])

# Combine features
X_train_final = hstack([X_train_text, X_train[['QuestionId','is_correct']]])
X_test_final = hstack([X_test_text, X_test[['QuestionId','is_correct']]])

# Handle class imbalance
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train_final, y_train)

# Train MultinomialNB
nb = MultinomialNB(alpha=0.1)
nb.fit(X_res, y_res)

# predict
y_pred = nb.predict(X_test_final)
print('Combined Target Report:\n',classification_report(y_test, y_pred))




Combined Target Report:
                 precision    recall  f1-score   support

       Correct       0.79      0.70      0.74      3765
       Neither       0.65      0.37      0.48      2860
      addition       0.65      0.69      0.66       337
    definition       0.43      0.90      0.58       161
      fraction       0.46      0.54      0.50       322
    incomplete       0.61      0.86      0.71       384
multiplication       0.32      0.75      0.45       121
     operation       0.38      0.54      0.44       114
      positive       0.43      0.93      0.59       149
     procedure       0.49      0.66      0.56       357
   subtraction       0.55      0.86      0.68       153
      variable       0.30      0.80      0.43       290
   wholenumber       0.39      0.74      0.51       161

      accuracy                           0.61      9174
     macro avg       0.50      0.72      0.56      9174
  weighted avg       0.66      0.61      0.61      9174



### Combined Target with Naive Bayes
It looks like the combined target is worth pursuing. I'm getting better results than I did with the transformer models and neural networks, which all ended up with an F1 score around 0.41. I may be able to get even better results with a a tree model that's better suited to many classes.

### Random Forest Model on the Combined Target

In [21]:
# -----------------------
# Train/test split
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    train_combined[["QuestionId", "PreprocessedExplanation", "is_correct"]],
    train_combined["CombinedTarget"],
    test_size=0.25,
    random_state=42,
    stratify=train_combined["CombinedTarget"]
)

# -----------------------
# Text preprocessing (TF-IDF)
# -----------------------
def tok(text):
    tt = ToktokTokenizer()
    return tt.tokenize(text)

tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=50000,
    sublinear_tf=True,
    tokenizer=tok
)

X_train_text = tfidf.fit_transform(X_train["PreprocessedExplanation"])
X_test_text  = tfidf.transform(X_test["PreprocessedExplanation"])

# -----------------------
# Encode categorical QuestionId
# -----------------------
le_qid = LabelEncoder()
X_train_qid = le_qid.fit_transform(X_train["QuestionId"])
X_test_qid  = le_qid.transform(X_test["QuestionId"])

# Convert to sparse column
X_train_qid = csr_matrix(X_train_qid.reshape(-1, 1))
X_test_qid  = csr_matrix(X_test_qid.reshape(-1, 1))

# Binary feature is_correct
X_train_ic = csr_matrix(X_train[["is_correct"]].values)
X_test_ic  = csr_matrix(X_test[["is_correct"]].values)

# -----------------------
# Combine features
# -----------------------
X_train_final = hstack([X_train_text, X_train_qid, X_train_ic])
X_test_final  = hstack([X_test_text, X_test_qid, X_test_ic])

# -----------------------
# Handle class imbalance
# -----------------------
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train_final, y_train)

# -----------------------
# Train LightGBM classifier
# -----------------------
lgbm = lgb.LGBMClassifier(
    boosting_type="gbdt",
    objective="multiclass",
    num_class=len(np.unique(y_res)),
    class_weight="balanced",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42,
    n_jobs=-1
)

lgbm.fit(X_res, y_res)

# -----------------------
# Evaluate
# -----------------------
y_pred = lgbm.predict(X_test_final)
print("Combined Target Report:\n", classification_report(y_test, y_pred))




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.085070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 85339
[LightGBM] [Info] Number of data points in the train set: 146536, number of used features: 11161
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.5



Combined Target Report:
                 precision    recall  f1-score   support

       Correct       0.83      0.87      0.85      3757
       Neither       0.68      0.57      0.62      2952
      addition       0.77      0.90      0.83       333
    definition       0.77      0.75      0.76       152
      fraction       0.62      0.79      0.70       310
    incomplete       0.74      0.78      0.76       364
multiplication       0.68      0.76      0.72       112
     operation       0.62      0.66      0.64       126
      positive       0.75      0.77      0.76       142
     procedure       0.72      0.80      0.75       325
   subtraction       0.71      0.83      0.76       155
      variable       0.69      0.80      0.74       277
   wholenumber       0.66      0.76      0.71       169

      accuracy                           0.75      9174
     macro avg       0.71      0.77      0.74      9174
  weighted avg       0.75      0.75      0.75      9174



I ran an Optuna (code block is commented below because it took 5 hours to run) and got these as best hyperparameters for the LightGBM. However, even with these paramenters, the F1 score remained 0.75. Precision went down, recall went up, and the model takes longer to run using these, so I'm going to stick with the LightGBM model above. 

Best Params: {'learning_rate': 0.17023808662117607, 'num_leaves': 132, 'max_depth': 18, 'min_child_samples': 15, 'subsample': 0.6953521929320011, 'colsample_bytree': 0.6003865687794745, 'reg_alpha': 0.11758490315889479, 'reg_lambda': 1.01328368773827}
Best CV F1: 0.9505423352733698

In [None]:
# # DON'T RUN UNLESS YOU HAVE 5 HOURS 

# # -----------------------
# # Train/test split
# # -----------------------
# X_train, X_test, y_train, y_test = train_test_split(
#     train_combined[["QuestionId", "PreprocessedExplanation", "is_correct"]],
#     train_combined["CombinedTarget"],
#     test_size=0.25,
#     random_state=42,
#     stratify=train_combined["CombinedTarget"]
# )

# # -----------------------
# # Text preprocessing (TF-IDF)
# # -----------------------
# def tok(text):
#     tt = ToktokTokenizer()
#     return tt.tokenize(text)

# tfidf = TfidfVectorizer(
#     ngram_range=(1, 3),
#     max_features=50000,
#     sublinear_tf=True,
#     tokenizer=tok
# )

# X_train_text = tfidf.fit_transform(X_train["PreprocessedExplanation"])
# X_test_text  = tfidf.transform(X_test["PreprocessedExplanation"])

# # -----------------------
# # Encode categorical QuestionId
# # -----------------------
# le_qid = LabelEncoder()
# X_train_qid = le_qid.fit_transform(X_train["QuestionId"])
# X_test_qid  = le_qid.transform(X_test["QuestionId"])

# X_train_qid = csr_matrix(X_train_qid.reshape(-1, 1))
# X_test_qid  = csr_matrix(X_test_qid.reshape(-1, 1))

# # -----------------------
# # Binary feature is_correct
# # -----------------------
# X_train_ic = csr_matrix(X_train[["is_correct"]].values)
# X_test_ic  = csr_matrix(X_test[["is_correct"]].values)

# # -----------------------
# # Combine features
# # -----------------------
# X_train_final = hstack([X_train_text, X_train_qid, X_train_ic])
# X_test_final  = hstack([X_test_text, X_test_qid, X_test_ic])

# # -----------------------
# # Handle class imbalance
# # -----------------------
# ros = RandomOverSampler(random_state=42)
# X_res, y_res = ros.fit_resample(X_train_final, y_train)

# # Encode y for LightGBM
# le_y = LabelEncoder()
# y_res_enc = le_y.fit_transform(y_res)
# y_test_enc = le_y.transform(y_test)

# # -----------------------
# # Optuna Objective
# # -----------------------
# def objective(trial):
#     params = {
#         "n_estimators": 500,
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 150),
#         "max_depth": trial.suggest_int("max_depth", -1, 20),
#         "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
#         "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
#         "objective": "multiclass",
#         "num_class": len(np.unique(y_res_enc)),
#         "random_state": 42,
#         "n_jobs": -1
#     }

#     cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     f1_scores = []

#     for train_idx, valid_idx in cv.split(X_res, y_res_enc):
#         X_tr, X_val = X_res[train_idx], X_res[valid_idx]
#         y_tr, y_val = y_res_enc[train_idx], y_res_enc[valid_idx]

#         model = LGBMClassifier(**params)
#         model.fit(X_tr, y_tr)

#         preds = model.predict(X_val)
#         f1_scores.append(f1_score(y_val, preds, average="macro"))

#     return np.mean(f1_scores)

# # -----------------------
# # Run Optuna
# # -----------------------
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=30)

# print("Best Params:", study.best_params)
# print("Best CV F1:", study.best_value)

# # -----------------------
# # Train final model with best params
# # -----------------------
# best_model = LGBMClassifier(**study.best_params)
# best_model.fit(X_res, y_res_enc)

# # -----------------------
# # Evaluate on test set
# # -----------------------
# y_pred = best_model.predict(X_test_final)
# print("Combined Target Report:\n", classification_report(y_test_enc, y_pred, target_names=le_y.classes_))
