In [1]:
%%time

%load_ext cudf.pandas

import numpy as np
import cudf
import cuml
import pandas as pd
import sklearn
from cuml.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import sklearn.metrics
from scipy import sparse
from sklearn.model_selection import StratifiedKFold

import re
from nltk.stem import WordNetLemmatizer
import nltk

import warnings
warnings.filterwarnings('ignore')

print('RAPIDS',cuml.__version__)

RAPIDS 25.02.01
CPU times: user 6 s, sys: 1.38 s, total: 7.38 s
Wall time: 20.2 s


In [2]:
%%time

# Load the competition CSVs (Kaggle input directory).
data_dir = "/kaggle/input/map-charting-student-math-misunderstandings"
train = pd.read_csv(f"{data_dir}/train.csv")
test = pd.read_csv(f"{data_dir}/test.csv")

# Rows without a misconception get the explicit placeholder 'NA', so the combined
# label below is always a well-formed "Category:Misconception" string.
train['Misconception'] = train['Misconception'].fillna('NA').astype(str)

# Vectorised column concatenation instead of a Python-level row-wise apply.
train['target_cat'] = train['Category'] + ":" + train['Misconception']

print(train.shape, test.shape)
train.head()

(36696, 8) (3, 5)
CPU times: user 2.72 s, sys: 225 ms, total: 2.95 s
Wall time: 3.95 s


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,target_cat
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,True_Correct:NA
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,True_Correct:NA
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,,True_Neither:NA
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,,True_Neither:NA
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,,True_Correct:NA


In [3]:
%%time

# Distribution of the combined "Category:Misconception" target labels.
train['target_cat'].value_counts()

CPU times: user 46.2 ms, sys: 38.2 ms, total: 84.4 ms
Wall time: 289 ms


target_cat
True_Correct:NA                            14802
False_Neither:NA                            6542
True_Neither:NA                             5265
False_Misconception:Incomplete              1446
False_Misconception:Additive                 891
                                           ...  
True_Misconception:Adding_across               1
True_Misconception:Base_rate                   1
True_Misconception:Longer_is_bigger            1
True_Misconception:Not_variable                1
True_Misconception:Whole_numbers_larger        1
Name: count, Length: 65, dtype: int64

In [4]:
%%time

# Encode every label as an integer id ranked by frequency (0 = most frequent label).
category_order = train['Category'].value_counts().index
map_target1 = {label: rank for rank, label in enumerate(category_order)}

misconception_order = train['Misconception'].value_counts().index
map_target2 = {label: rank for rank, label in enumerate(misconception_order)}

map_target1

CPU times: user 21.7 ms, sys: 2.94 ms, total: 24.6 ms
Wall time: 27.4 ms


{'True_Correct': 0,
 'False_Misconception': 1,
 'False_Neither': 2,
 'True_Neither': 3,
 'True_Misconception': 4,
 'False_Correct': 5}

In [5]:
%%time

# Attach the integer-encoded targets used by the two classifiers:
# target1 encodes Category, target2 encodes Misconception.
for target_col, source_col, label_map in (
    ('target1', 'Category', map_target1),
    ('target2', 'Misconception', map_target2),
):
    train[target_col] = train[source_col].map(label_map)

train['Category'].value_counts()

CPU times: user 46.7 ms, sys: 32.6 ms, total: 79.3 ms
Wall time: 160 ms


Category
True_Correct           14802
False_Misconception     9457
False_Neither           6542
True_Neither            5265
True_Misconception       403
False_Correct            227
Name: count, dtype: int64

In [6]:
%%time

# Frequency of each misconception label; 'NA' is the placeholder filled in for rows
# that had no misconception value.
train['Misconception'].value_counts()

CPU times: user 7.81 ms, sys: 964 µs, total: 8.78 ms
Wall time: 7.72 ms


Misconception
NA                                        26836
Incomplete                                 1454
Additive                                    929
Duplication                                 704
Subtraction                                 620
Positive                                    566
Wrong_term                                  558
Irrelevant                                  497
Wrong_fraction                              418
Inversion                                   414
Mult                                        353
Denominator-only_change                     336
Whole_numbers_larger                        329
Adding_across                               307
WNB                                         299
Tacking                                     290
Unknowable                                  282
Wrong_Fraction                              273
SwapDividend                                206
Scale                                       179
Not_variable              

# Train Target Category

In [7]:
%%time

def _build_sentence(frame):
    """Join question text, chosen answer and student explanation into one labelled string per row."""
    question = "Question: " + frame['QuestionText'].astype(str)
    answer = " Answer: " + frame['MC_Answer'].astype(str)
    explanation = " Explanation: " + frame['StudentExplanation'].astype(str)
    return question + answer + explanation

# Same text layout for both splits so they share one TF-IDF vocabulary later on.
train['sentence'] = _build_sentence(train)
test['sentence'] = _build_sentence(test)

# Pre-compiled patterns used by fast_clean.
# clean_newlines is kept for backward compatibility only: newlines are already matched
# by the \s+ in clean_spaces, so fast_clean no longer needs a separate pass for them.
clean_newlines = re.compile(r'\n+')
clean_spaces = re.compile(r'\s+')
clean_punct = re.compile(r'[^a-zA-Z0-9\s]')

def fast_clean(text):
    """Normalise a raw sentence for TF-IDF.

    Steps: drop every character that is not an ASCII letter, digit or whitespace,
    collapse each whitespace run (including newlines) to one space, strip the ends
    and lower-case.

    Punctuation is removed *before* whitespace is collapsed. Doing it the other way
    round leaves double spaces wherever a punctuation mark stood between two spaces
    (e.g. "a - b" -> "a  b").

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Lower-cased text containing only [a-z0-9] tokens separated by single spaces.
    """
    text = clean_punct.sub('', text)
    text = clean_spaces.sub(' ', text)
    return text.strip().lower()

# Clean the sentence column of both splits with the same normaliser.
for frame in (train, test):
    frame['sentence'] = frame['sentence'].apply(fast_clean)

# One shared lemmatizer instance, reused for every row.
lemmatizer = WordNetLemmatizer()

def fast_lemmatize(text):
    """Lemmatize each whitespace-separated token of `text` and re-join the tokens with single spaces."""
    tokens = text.split()
    return " ".join(map(lemmatizer.lemmatize, tokens))

# Lemmatize the cleaned sentences of both splits.
for frame in (train, test):
    frame['sentence'] = frame['sentence'].apply(fast_lemmatize)

# Word-level TF-IDF over 1- to 4-grams. The vocabulary is fitted on the train and
# test sentences together, then each split is transformed separately.
model = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 4),
    min_df=2,
    max_df=0.95,
)
corpus = pd.concat([train['sentence'], test['sentence']])
model.fit(corpus)

train_embeddings = model.transform(train['sentence'])
test_embeddings = model.transform(test['sentence'])

print('Train sparse shape is', train_embeddings.shape)
print('Test sparse shape is', test_embeddings.shape)

Train sparse shape is (36696, 62255)
Test sparse shape is (3, 62255)
CPU times: user 13.4 s, sys: 698 ms, total: 14.1 s
Wall time: 15.8 s


In [8]:
%%time

# Out-of-fold (OOF) class probabilities for every training row, and fold-averaged
# class probabilities for the test rows. Columns follow the ids in map_target1.
ytrain1 = np.zeros((len(train), len(map_target1)))
ytest1 = np.zeros((len(test), len(map_target1)))

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# Take the averaging divisor from the splitter itself, so changing n_splits cannot
# silently mis-scale the averaged test probabilities.
n_folds = skf.get_n_splits()
for i, (train_index, valid_index) in enumerate(skf.split(train_embeddings, train['target1'])):
    print(f"Fold {i}, {len(train_index)}, {len(valid_index)}:")
    model = cuml.LogisticRegression()
    model.fit(train_embeddings[train_index], train['target1'].iloc[train_index])
    # Each training row is scored only by the fold model that did not train on it.
    # NOTE(review): .get() presumably copies the device array to host NumPy — confirm.
    ytrain1[valid_index] = model.predict_proba(train_embeddings[valid_index]).get()
    ytest1 += model.predict_proba(test_embeddings).get() / n_folds

print("ACC:", np.mean( train['target1'] == np.argmax(ytrain1, 1) ) )
print("F1:", sklearn.metrics.f1_score(train['target1'] , np.argmax(ytrain1, 1), average='weighted') )

Fold 0, 33026, 3670:
Fold 1, 33026, 3670:
Fold 2, 33026, 3670:
Fold 3, 33026, 3670:
Fold 4, 33026, 3670:
Fold 5, 33026, 3670:
Fold 6, 33027, 3669:
Fold 7, 33027, 3669:
Fold 8, 33027, 3669:
Fold 9, 33027, 3669:
ACC: 0.7796762589928058
F1: 0.7638567799062028
CPU times: user 5.42 s, sys: 707 ms, total: 6.13 s
Wall time: 6.4 s


# Train Target Misconception

In [9]:
%%time

# Second TF-IDF representation for the misconception model: same settings as before
# except that n-grams stop at length 3. Fitted on train + test sentences together.
model = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
)
all_sentences = pd.concat([train['sentence'], test['sentence']])
model.fit(all_sentences)

train_embeddings = model.transform(train['sentence'])
print('Train sparse shape is',train_embeddings.shape)

test_embeddings = model.transform(test['sentence'])
print('Test sparse shape is',test_embeddings.shape)

Train sparse shape is (36696, 41190)
Test sparse shape is (3, 41190)
CPU times: user 806 ms, sys: 121 ms, total: 927 ms
Wall time: 916 ms


In [10]:
%%time

# Out-of-fold (OOF) misconception probabilities for every training row, and
# fold-averaged probabilities for the test rows. Columns follow the ids in map_target2.
ytrain2 = np.zeros((len(train), len(map_target2)))
ytest2 = np.zeros((len(test), len(map_target2)))

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
# Take the averaging divisor from the splitter itself, so changing n_splits cannot
# silently mis-scale the averaged test probabilities.
n_folds = skf.get_n_splits()
for i, (train_index, valid_index) in enumerate(skf.split(train_embeddings, train['target2'])):
    print(f"Fold {i}, {len(train_index)}, {len(valid_index)}:")
    # class_weight='balanced' is requested because the misconception labels are
    # dominated by the 'NA' placeholder.
    model = cuml.LogisticRegression(class_weight='balanced')
    model.fit(train_embeddings[train_index], train['target2'].iloc[train_index])
    # Each training row is scored only by the fold model that did not train on it.
    # NOTE(review): .get() presumably copies the device array to host NumPy — confirm.
    ytrain2[valid_index] = model.predict_proba(train_embeddings[valid_index]).get()
    ytest2 += model.predict_proba(test_embeddings).get() / n_folds

print("ACC:", np.mean( train['target2'] == np.argmax(ytrain2, 1) ) )
print("F1:", sklearn.metrics.f1_score(train['target2'] , np.argmax(ytrain2, 1), average='weighted') )

Fold 0, 33026, 3670:
Fold 1, 33026, 3670:
Fold 2, 33026, 3670:
Fold 3, 33026, 3670:
Fold 4, 33026, 3670:
Fold 5, 33026, 3670:
Fold 6, 33027, 3669:
Fold 7, 33027, 3669:
Fold 8, 33027, 3669:
Fold 9, 33027, 3669:
ACC: 0.8718661434488773
F1: 0.8599159013343904
CPU times: user 5.98 s, sys: 471 ms, total: 6.45 s
Wall time: 6.43 s


In [11]:
%%time

# Reverse lookups: integer id -> original label string.
map_inverse1 = {idx: label for label, idx in map_target1.items()}
map_inverse2 = {idx: label for label, idx in map_target2.items()}

CPU times: user 15 µs, sys: 1 µs, total: 16 µs
Wall time: 18.6 µs


In [12]:
%%time

# Zero out the 'NA' misconception column before ranking, so a *_Misconception category
# is never paired with 'NA'. The column index is looked up by label rather than assumed
# to be 0, which only holds while 'NA' is the most frequent misconception value.
# This overwrites ytrain2 in place.
ytrain2[:, map_target2['NA']] = 0

# Ids of the three highest-probability classes per row, best first.
predicted1 = np.argsort(-ytrain1, 1)[:, :3]
predicted2 = np.argsort(-ytrain2, 1)[:, :3]

CPU times: user 24 ms, sys: 5 ms, total: 29 ms
Wall time: 28.3 ms


In [13]:
%%time

# Turn the ranked ids into "Category:Misconception" strings, three per training row.
# NOTE(review): the rank-j category is always paired with the rank-j misconception, so
# when a higher-ranked category is not a misconception category the better-ranked
# misconception goes unused — consider whether that pairing is intended.
predict = []
for cat_ranks, misc_ranks in zip(predicted1, predicted2):
    pred = []
    for cat_id, misc_id in zip(cat_ranks, misc_ranks):
        category = map_inverse1[cat_id]
        misconception = map_inverse2[misc_id]
        # Only *_Misconception categories carry a misconception name; all others use NA.
        suffix = misconception if 'Misconception' in category else "NA"
        pred.append(category + ":" + suffix)
    predict.append(pred)

# Exact-match rate of the 1st, 2nd and 3rd ranked prediction, printed in that order.
for rank in range(3):
    print( np.mean(train['target_cat'] == [p[rank] for p in predict]) )

0.77068345323741
0.1551122738173098
0.012671680837148463
CPU times: user 356 ms, sys: 2.1 ms, total: 358 ms
Wall time: 356 ms


In [14]:
%%time

def map3(target_list, pred_list):
    """Mean Average Precision at 3 for single-label targets.

    Each target scores 1, 1/2 or 1/3 when it equals the 1st, 2nd or 3rd prediction
    of its row (first match only), and 0 otherwise. The result is the mean score
    over all targets.

    Parameters
    ----------
    target_list : sequence
        True labels, one per row.
    pred_list : sequence of sequences
        Ranked predictions per row, best first. Only the first three entries of
        each row are considered; rows with fewer than three entries are accepted.

    Returns
    -------
    float
        MAP@3 in [0, 1]; 0.0 when `target_list` is empty.
    """
    # Guard against division by zero on empty input.
    if len(target_list) == 0:
        return 0.0
    score = 0.
    for t, p in zip(target_list, pred_list):
        # Slicing tolerates short prediction lists instead of raising IndexError.
        for rank, candidate in enumerate(p[:3], start=1):
            if t == candidate:
                score += 1 / rank
                break  # only the first hit counts
    return score / len(target_list)
        
# Score the top-3 predictions assembled above against the combined training labels.
print(f"MAP@3: {map3(train['target_cat'].tolist(), predict)}") 

MAP@3: 0.852463483758447
CPU times: user 16.8 ms, sys: 0 ns, total: 16.8 ms
Wall time: 16.5 ms


# Generate Test Predictions:

In [15]:
%%time

# Zero out the 'NA' misconception column before ranking, so a *_Misconception category
# is never paired with 'NA'. The column index is looked up by label rather than assumed
# to be 0, which only holds while 'NA' is the most frequent misconception value.
# This overwrites ytest2 in place.
ytest2[:, map_target2['NA']] = 0

# Ids of the three highest-probability classes per test row, best first.
predicted1 = np.argsort(-ytest1, 1)[:,:3]
predicted2 = np.argsort(-ytest2, 1)[:,:3]

# One space-separated string of three "Category:Misconception" labels per test row.
predict = []
for i in range(len(predicted1)):
    pred = []
    for j in range(3):
        p1 = map_inverse1[predicted1[i, j]]
        p2 = map_inverse2[predicted2[i, j]]
        # Only *_Misconception categories carry a misconception name; all others use NA.
        if 'Misconception' in p1:
            pred.append(p1 + ":" + p2 )
        else:
            pred.append(p1 + ":NA")
    predict.append(" ".join(pred))

# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — confirm.
sub = pd.read_csv("/kaggle/input/map-charting-student-math-misunderstandings/sample_submission.csv")
sub['Category:Misconception'] = predict
sub.to_csv("submission.csv", index=False)
sub

CPU times: user 12.9 ms, sys: 2.21 ms, total: 15.1 ms
Wall time: 38.7 ms


Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA False_Neither:NA
1,36697,False_Misconception:WNB False_Neither:NA True_...
2,36698,True_Neither:NA True_Correct:NA False_Misconce...
