In [1]:
%load_ext cudf.pandas

import numpy as np
import cudf
import cuml
import pandas as pd
import sklearn
from cuml.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold

# Record the installed cuML version next to the results for reproducibility.
print('RAPIDS',cuml.__version__)

RAPIDS 25.02.01


In [2]:
# Load the competition train/test tables.
train = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/train.csv")
test = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/test.csv")

# Rows without a misconception get the explicit string label 'NA' so that the
# column can be used as a classification target.
train['Misconception'] = train['Misconception'].fillna('NA').astype(str)

# Combined "Category:Misconception" label, built with vectorized string
# concatenation instead of a Python-level row-wise apply.
train['target_cat'] = train['Category'] + ":" + train['Misconception']

print(train.shape, test.shape)
train.head()

(36696, 8) (3, 5)


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,target_cat
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,True_Correct:NA
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,True_Correct:NA
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,,True_Neither:NA
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,,True_Neither:NA
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,,True_Correct:NA


In [3]:
# Frequency of every combined "Category:Misconception" label.
train['target_cat'].value_counts()

target_cat
True_Correct:NA                            14802
False_Neither:NA                            6542
True_Neither:NA                             5265
False_Misconception:Incomplete              1446
False_Misconception:Additive                 891
                                           ...  
True_Misconception:Adding_across               1
True_Misconception:Base_rate                   1
True_Misconception:Longer_is_bigger            1
True_Misconception:Not_variable                1
True_Misconception:Whole_numbers_larger        1
Name: count, Length: 65, dtype: int64

In [4]:
# Integer-encode both label columns by popularity: value_counts() lists labels
# from most to least frequent, so the most common label gets id 0, the next
# one id 1, and so on.
category_counts = train['Category'].value_counts()
map_target1 = dict(zip(category_counts.index, range(len(category_counts))))

misconception_counts = train['Misconception'].value_counts()
map_target2 = dict(zip(misconception_counts.index, range(len(misconception_counts))))

map_target1

{'True_Correct': 0,
 'False_Misconception': 1,
 'False_Neither': 2,
 'True_Neither': 3,
 'True_Misconception': 4,
 'False_Correct': 5}

In [5]:
# Attach the integer class ids used as training targets:
#   target1 <- Category, target2 <- Misconception.
label_encoders = {
    'target1': ('Category', map_target1),
    'target2': ('Misconception', map_target2),
}
for target_col, (label_col, label_to_id) in label_encoders.items():
    train[target_col] = train[label_col].map(label_to_id)

train['Category'].value_counts()

Category
True_Correct           14802
False_Misconception     9457
False_Neither           6542
True_Neither            5265
True_Misconception       403
False_Correct            227
Name: count, dtype: int64

In [6]:
# Frequency of each misconception label; missing values were filled with 'NA' on load.
train['Misconception'].value_counts()

Misconception
NA                                        26836
Incomplete                                 1454
Additive                                    929
Duplication                                 704
Subtraction                                 620
Positive                                    566
Wrong_term                                  558
Irrelevant                                  497
Wrong_fraction                              418
Inversion                                   414
Mult                                        353
Denominator-only_change                     336
Whole_numbers_larger                        329
Adding_across                               307
WNB                                         299
Tacking                                     290
Unknowable                                  282
Wrong_Fraction                              273
SwapDividend                                206
Scale                                       179
Not_variable              

# Train Target Category

In [7]:
def _build_sentence(row):
    """Merge question text, chosen answer and student explanation into one string."""
    return f"Question: {row['QuestionText']}\nAnswer: {row['MC_Answer']}\nExplanation: {row['StudentExplanation']}"


# Both frames get the same 'sentence' text column.
for frame in (train, test):
    frame['sentence'] = frame.apply(_build_sentence, axis=1)

# TF-IDF over word 1- to 3-grams; the vocabulary is fitted on the train and
# test sentences together.
model = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    analyzer='word',
    max_df=0.95,
    min_df=2,
)
all_sentences = pd.concat([train, test]).sentence
model.fit(all_sentences)

train_embeddings = model.transform(train.sentence)
print('Train sparse shape is',train_embeddings.shape)

test_embeddings = model.transform(test.sentence)
print('Test sparse shape is',test_embeddings.shape)

Train sparse shape is (36696, 37434)
Test sparse shape is (3, 37434)


In [8]:
# Cross-validated logistic regression for the Category target.
#
# n_folds drives both the splitter and the test-time averaging, so the two can
# no longer drift apart. cv_seed pins the shuffled fold assignment so that the
# reported metrics are reproducible across runs.
n_folds = 10
cv_seed = 42

# Out-of-fold class probabilities for train, fold-averaged probabilities for test.
ytrain1 = np.zeros((len(train), len(map_target1)))
ytest1 = np.zeros((len(test), len(map_target1)))

skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)
for i, (train_index, valid_index) in enumerate(skf.split(train_embeddings, train['target1'])):
    print(f"Fold {i}, {len(train_index)}, {len(valid_index)}:")
    model = cuml.LogisticRegression()
    model.fit(train_embeddings[train_index], train['target1'].iloc[train_index])
    # NOTE(review): .get() presumably copies the device (CuPy) result to a host
    # NumPy array - confirm against the cuML output type in use.
    ytrain1[valid_index] = model.predict_proba(train_embeddings[valid_index]).get()
    # Each fold contributes an equal share to the averaged test probabilities.
    ytest1 += model.predict_proba(test_embeddings).get() / n_folds

# Out-of-fold metrics using the arg-max class per row.
oof_labels1 = np.argmax(ytrain1, 1)
print("ACC:", np.mean( train['target1'] == oof_labels1 ) )
print("F1:", sklearn.metrics.f1_score(train['target1'] , oof_labels1, average='weighted') )

Fold 0, 33026, 3670:
Fold 1, 33026, 3670:
Fold 2, 33026, 3670:
Fold 3, 33026, 3670:
Fold 4, 33026, 3670:
Fold 5, 33026, 3670:
Fold 6, 33027, 3669:
Fold 7, 33027, 3669:
Fold 8, 33027, 3669:
Fold 9, 33027, 3669:
ACC: 0.7458578591672117
F1: 0.7282006660973415


# Train Target Misconception

In [9]:
# TF-IDF over word uni- and bi-grams; the vocabulary is fitted on the train and
# test sentences together.
model = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    analyzer='word',
    max_df=0.95,
    min_df=2,
)
all_sentences = pd.concat([train, test]).sentence
model.fit(all_sentences)

train_embeddings = model.transform(train.sentence)
print('Train sparse shape is',train_embeddings.shape)

test_embeddings = model.transform(test.sentence)
print('Test sparse shape is',test_embeddings.shape)

Train sparse shape is (36696, 17652)
Test sparse shape is (3, 17652)


In [10]:
# Cross-validated, class-balanced logistic regression for the Misconception target.
#
# n_folds drives both the splitter and the test-time averaging, so the two can
# no longer drift apart. cv_seed pins the shuffled fold assignment so that the
# reported metrics are reproducible across runs.
n_folds = 10
cv_seed = 42

# Out-of-fold class probabilities for train, fold-averaged probabilities for test.
ytrain2 = np.zeros((len(train), len(map_target2)))
ytest2 = np.zeros((len(test), len(map_target2)))

skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)
for i, (train_index, valid_index) in enumerate(skf.split(train_embeddings, train['target2'])):
    print(f"Fold {i}, {len(train_index)}, {len(valid_index)}:")
    # class_weight='balanced' is used here because the label distribution is
    # dominated by the 'NA' class.
    model = cuml.LogisticRegression(class_weight='balanced')
    model.fit(train_embeddings[train_index], train['target2'].iloc[train_index])
    # NOTE(review): .get() presumably copies the device (CuPy) result to a host
    # NumPy array - confirm against the cuML output type in use.
    ytrain2[valid_index] = model.predict_proba(train_embeddings[valid_index]).get()
    # Each fold contributes an equal share to the averaged test probabilities.
    ytest2 += model.predict_proba(test_embeddings).get() / n_folds

# Out-of-fold metrics using the arg-max class per row.
oof_labels2 = np.argmax(ytrain2, 1)
print("ACC:", np.mean( train['target2'] == oof_labels2 ) )
print("F1:", sklearn.metrics.f1_score(train['target2'] , oof_labels2, average='weighted') )

Fold 0, 33026, 3670:




Fold 1, 33026, 3670:
Fold 2, 33026, 3670:
Fold 3, 33026, 3670:
Fold 4, 33026, 3670:
Fold 5, 33026, 3670:
Fold 6, 33027, 3669:
Fold 7, 33027, 3669:
Fold 8, 33027, 3669:
Fold 9, 33027, 3669:
ACC: 0.8471495530848049
F1: 0.8294698356955198


In [11]:
# Reverse lookups: integer class id -> original label string.
map_inverse1 = {class_id: label for label, class_id in map_target1.items()}
map_inverse2 = {class_id: label for label, class_id in map_target2.items()}

In [12]:
def _top3(probabilities):
    """Return, per row, the column indices of the three largest values (best first)."""
    return np.argsort(-probabilities, axis=1)[:, :3]


# Column 0 is the id of the most frequent misconception label ('NA' in the
# counts above); zeroing it keeps it from being ranked ahead of real
# misconceptions.
ytrain2[:, 0] = 0
predicted1 = _top3(ytrain1)
predicted2 = _top3(ytrain2)

In [13]:
def _combine_labels(category, misconception):
    """Build a 'Category:Misconception' label; non-misconception categories get ':NA'."""
    if 'Misconception' in category:
        return category + ":" + misconception
    return category + ":NA"


# For each row, pair the j-th ranked category with the j-th ranked misconception.
predict = [
    [
        _combine_labels(map_inverse1[cat_id], map_inverse2[misc_id])
        for cat_id, misc_id in zip(cat_ids, misc_ids)
    ]
    for cat_ids, misc_ids in zip(predicted1, predicted2)
]

# Share of rows whose true label sits at rank 1, rank 2 and rank 3 respectively.
for rank in range(3):
    print( np.mean(train['target_cat'] == [p[rank] for p in predict]) )

0.7328864181382166
0.15429474602136473
0.03090255068672335


In [14]:
def map3(target_list, pred_list):
    """Mean Average Precision at 3 for single-label targets.

    Each row scores 1, 1/2 or 1/3 when the true label is found at rank 1, 2
    or 3 of its prediction list, and 0 otherwise. Only the first match counts.

    Args:
        target_list: sequence of true labels.
        pred_list: sequence of ranked predictions (best first), one per
            target. Entries beyond the third are ignored; lists with fewer
            than three entries are accepted.

    Returns:
        The mean score over ``len(target_list)`` rows, or 0.0 when
        ``target_list`` is empty.
    """
    # Guard the final division: an empty evaluation set scores 0 instead of raising.
    if len(target_list) == 0:
        return 0.0

    score = 0.
    for target, ranked in zip(target_list, pred_list):
        # Slicing tolerates prediction lists shorter than three items.
        for rank, guess in enumerate(ranked[:3], start=1):
            if guess == target:
                score += 1.0 / rank
                break
    return score / len(target_list)
        
# Score the out-of-fold top-3 predictions against the true combined labels.
print(f"MAP@3: {map3(train['target_cat'].tolist(), predict)}")

MAP@3: 0.8203346413778065


# Generate Test Predictions:

In [15]:
# Column 0 is the id of the most frequent misconception label ('NA' in the
# counts above); zeroing it keeps it from being ranked ahead of real
# misconceptions.
ytest2[:, 0] = 0
predicted1 = np.argsort(-ytest1, axis=1)[:, :3]
predicted2 = np.argsort(-ytest2, axis=1)[:, :3]

# For each test row, pair the j-th ranked category with the j-th ranked
# misconception and join the three labels with spaces.
predict = []
for cat_ids, misc_ids in zip(predicted1, predicted2):
    labels = []
    for cat_id, misc_id in zip(cat_ids, misc_ids):
        category = map_inverse1[cat_id]
        misconception = map_inverse2[misc_id]
        # Only misconception categories carry a real misconception name.
        suffix = misconception if 'Misconception' in category else "NA"
        labels.append(category + ":" + suffix)
    predict.append(" ".join(labels))

# Fill the sample submission in row order and write it out.
sub = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv")
sub['Category:Misconception'] = predict
sub.to_csv("submission.csv", index=False)
sub

Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA False_Neither:NA
1,36697,False_Misconception:Incomplete True_Correct:NA...
2,36698,True_Correct:NA True_Neither:NA False_Misconce...
