In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/AIChallenges/Zalo22/WikiQA/data/e2eqa-train+public_test-v1

Archive:  /content/drive/MyDrive/AIChallenges/Zalo22/WikiQA/data/e2eqa-train+public_test-v1.zip
   creating: e2eqa-train+public_test-v1/
  inflating: e2eqa-train+public_test-v1/zac2022_testa_sample_submission.json  
  inflating: e2eqa-train+public_test-v1/zac2022_testa_only_question.json  
  inflating: e2eqa-train+public_test-v1/zac2022_train_merged_final.json  


In [33]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
with open('e2eqa-train+public_test-v1/zac2022_train_merged_final.json', 'r') as f:
    raw_data = json.load(f)['data']

In [51]:
def is_date(answer_lower):
    checks = ['năm', 'tháng', 'ngày', 'thế kỷ', 'thời', 'thiên niên kỷ']
    for c in checks:
        if c in answer_lower:
            return True
    return False

In [52]:
def get_category(answer):
    answer_lower = answer.lower()
    if answer_lower.startswith('wiki'):
        return 'entity'
    if is_date(answer_lower):
        return 'date'
    if answer_lower.isdigit():
        return 'number'
    return None

In [53]:
filtered = []
for doc in raw_data:
    if doc['category'] != 'FULL_ANNOTATION':
        continue
    doc['question_type'] = get_category(doc['answer'])
    filtered.append(doc)

In [54]:
df = pd.DataFrame(filtered)

In [55]:
df['question_type'].value_counts()

entity    4608
date       279
number      93
Name: question_type, dtype: int64

In [73]:
df = df[~df.question_type.isnull()] # filter quest that are unable to categorized
df['question_type_int'] = df['question_type'].map({'entity':0, 'date':1, 'number':2})

In [64]:
vec = TfidfVectorizer()
vec.fit(df['question'])

TfidfVectorizer()

In [74]:
X = vec.transform(df['question'])
y = df['question_type_int']

In [75]:
model = LogisticRegression()

In [76]:
cross_val_score(model, X, y, cv=5, scoring='accuracy')

array([0.96586345, 0.96987952, 0.97088353, 0.96987952, 0.95883534])

In [77]:
model.fit(X, y)
model.score(X, y)

0.9813253012048193

In [78]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('vectorizer', TfidfVectorizer()), ('model', LogisticRegression())])

In [79]:
pipe.fit(df['question'], df['question_type_int'])

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model', LogisticRegression())])

In [80]:
pipe.score(df['question'], df['question_type_int'])

0.9813253012048193

In [87]:
import joblib

save_path = '/content/drive/MyDrive/AIChallenges/Zalo22/WikiQA/models/question_classifier.pkl'

joblib.dump(pipe, save_path)

['/content/drive/MyDrive/AIChallenges/Zalo22/WikiQA/models/question_classifier.pkl']

In [88]:
joblib.load(save_path)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model', LogisticRegression())])