### Import nescessary library

In [2]:
!pip install pyspellchecker
!pip install catboost

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [3]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

import pandas as pd
import polars as pl
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

nltk.download("omw-1.4") # Open Multilingual WordNet
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("wordnet2022")
nltk.download("punkt")

Mounted at /gdrive


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package wordnet2022 to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet2022.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Read Data

In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
train = pd.read_csv('/content/drive/MyDrive/PTDLTM - Project /Notebook/learning-agency-lab-automated-essay-scoring-2/train.csv')
train

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


### Clean text data

In [6]:
def clean_text(text):
    # Chuyển chữ viết hoa thành chữ thường
    text = text.lower()

    # Xóa các thẻ HTML
    text = re.compile(r'<.*?>').sub(r'', text)

    # Xóa các tag tên (mention)
    text = re.sub(r'@\w+\s*', '', text)

    # Xóa hashtag (dấu #)
    text = re.sub(r'#\w+', '', text)

    # Xóa các liên kết URL
    text = re.sub(r'http\S+|www\S+', '', text)

    # Xóa các ký tự không mong muốn như \xa0
    text = text.replace(u'\xa0', ' ')

    # Xóa chữ số
    text = re.sub(r'\d+', '', text)

    # Thay thế các khoảng trắng liên tiếp bằng một khoảng trắng duy nhất
    text = re.sub(r'\s+', ' ', text)

    # Thay thế các dấu chấm và dấu phẩy liên tiếp bằng một dấu duy nhất
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r'\,+', ',', text)

    # Xóa các khoảng trắng ở đầu và cuối chuỗi
    text = text.strip()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    text = ' '.join([word for word in words if word not in stop_words])
    return text

### Extract feature of text

In [7]:
def extract_features(text):
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    avg_sens_length = sum(len(sentence) for sentence in sentences) / num_sentences if num_sentences > 0 else 0

    words = word_tokenize(text)
    num_words = len(words)
    avg_word_length = sum(len(word) for word in words) / num_words if num_words > 0 else 0
    return num_sentences,avg_sens_length, num_words, avg_word_length

In [8]:
def score_normalise(n):
    temp = (n*6)/10
    temp = round(temp)
    if temp < 1:
        temp = 1
    elif temp > 6:
        temp = 6
    else:
        temp = temp
    return temp

def pred_processor(pred):
    predic = []
    for i in range(len(pred)):
        predic.append(pred[i][0])

    predic = list(map(round, predic))
    final_pred = [i if i >= 1 else 1 for i in predic]
    final_pred = [i if i <= 6 else 6 for i in final_pred]
    return final_pred

In [9]:
train['cleaned_essay_text'] = train['full_text'].apply(clean_text)
train[['full_text', 'cleaned_essay_text']].head()

Unnamed: 0,full_text,cleaned_essay_text
0,Many people have car where they live. The thin...,many people car live . thing n't know use car ...
1,I am a scientist at NASA that is discussing th...,scientist nasa discussing `` face '' mars . ex...
2,People always wish they had the same technolog...,"people always wish technology seen movies , be..."
3,"We all heard about Venus, the planet without a...","heard venus , planet without almost oxygen ear..."
4,"Dear, State Senator\n\nThis is a letter to arg...","dear , state senator letter argue favor keepin..."


In [10]:
# Apply feature extraction
train['sentences_count'], train['avg_sentence_length'], train['word_count'], train['avg_word_length'] = zip(*train['cleaned_essay_text'].apply(extract_features))

# Display the features
train[['cleaned_essay_text','sentences_count','avg_sentence_length', 'word_count', 'avg_word_length']].head()

Unnamed: 0,cleaned_essay_text,sentences_count,avg_sentence_length,word_count,avg_word_length
0,many people car live . thing n't know use car ...,13,127.538462,280,4.967857
1,scientist nasa discussing `` face '' mars . ex...,21,45.190476,174,4.574713
2,"people always wish technology seen movies , be...",24,82.833333,328,5.134146
3,"heard venus , planet without almost oxygen ear...",20,97.65,302,5.533113
4,"dear , state senator letter argue favor keepin...",15,100.8,225,5.786667


In [11]:
train = train.reset_index(drop=True)
y = train['score']
X = train.drop(columns=["full_text","essay_id","score"])
temp = train.drop(columns=["full_text","essay_id"])
train.head()

Unnamed: 0,essay_id,full_text,score,cleaned_essay_text,sentences_count,avg_sentence_length,word_count,avg_word_length
0,000d118,Many people have car where they live. The thin...,3,many people car live . thing n't know use car ...,13,127.538462,280,4.967857
1,000fe60,I am a scientist at NASA that is discussing th...,3,scientist nasa discussing `` face '' mars . ex...,21,45.190476,174,4.574713
2,001ab80,People always wish they had the same technolog...,4,"people always wish technology seen movies , be...",24,82.833333,328,5.134146
3,001bdc0,"We all heard about Venus, the planet without a...",4,"heard venus , planet without almost oxygen ear...",20,97.65,302,5.533113
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,"dear , state senator letter argue favor keepin...",15,100.8,225,5.786667


In [12]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True,
    max_features=5000
)

# Fit and transform the text data
X_tfidf = tfidf_vectorizer.fit_transform(train['cleaned_essay_text'])

# Convert to DataFrame
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine TF-IDF features with other features
train = pd.concat([X_tfidf_df, temp], axis=1)
train = train.drop(columns="cleaned_essay_text")

In [13]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

# Define the function to calculate QWK
def calculate_quadratic_weighted_kappa(y_true, y_pred, a=0.5):
    y_true_adjusted = (y_true + a).round()
    y_pred_adjusted = (y_pred + a).clip(1, 6).round()
    qwk_score = cohen_kappa_score(y_true_adjusted, y_pred_adjusted, weights="quadratic")
    return qwk_score

# Prepare StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train['fold'] = -1

for i, (_, val_index) in enumerate(skf.split(train, train['score'])):
    train.loc[val_index, 'fold'] = i

# Training loop
results = []

for fold in range(5):
    # Split the data into training and validation sets
    train_data = train[train['fold'] != fold]
    val_data = train[train['fold'] == fold]

    X_train_tf = train_data.drop(columns=['score', 'fold']).values
    y_train_tf = train_data['score'].values
    X_val_tf = val_data.drop(columns=['score', 'fold']).values
    y_val_tf = val_data['score'].values

    # Build the model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer='adam',
                  loss='mean_squared_error')

    # Custom callback to calculate QWK at the end of each epoch
    class QWKCallback(tf.keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            y_val_pred = self.model.predict(X_val_tf).flatten()
            qwk_score = calculate_quadratic_weighted_kappa(y_val_tf, y_val_pred)
            print(f"Epoch {epoch + 1} QWK: {qwk_score}")

    # Train the model with QWKCallback
    model.fit(X_train_tf, y_train_tf, epochs=20, batch_size=80, verbose=0, callbacks=[QWKCallback()])

    # Evaluate the model on the validation set
    y_val_pred = model.predict(X_val_tf).flatten()
    val_loss = model.evaluate(X_val_tf, y_val_tf, verbose=0)
    qwk_score = calculate_quadratic_weighted_kappa(y_val_tf, y_val_pred)
    results.append((val_loss, qwk_score))

    print(f"Fold {fold}: Validation Loss = {val_loss}, QWK = {qwk_score}")

# Final results
for fold, (val_loss, qwk_score) in enumerate(results):
    print(f"Fold {fold}: Validation Loss = {val_loss}, QWK = {qwk_score}")

Epoch 1 QWK: 0.6345615312483768
Epoch 2 QWK: 0.6259359704379207
Epoch 3 QWK: 0.6483003922932391
Epoch 4 QWK: 0.6600667372444375
Epoch 5 QWK: 0.6573501889029943
Epoch 6 QWK: 0.686858450983013
Epoch 7 QWK: 0.5627617231706663
Epoch 8 QWK: 0.6766070265813586
Epoch 9 QWK: 0.6786831461392666
Epoch 10 QWK: 0.6745628877608573
Epoch 11 QWK: 0.6592926328867674
Epoch 12 QWK: 0.6746113187128927
Epoch 13 QWK: 0.6976582193568677
Epoch 14 QWK: 0.705552838935235
Epoch 15 QWK: 0.6370639648049229
Epoch 16 QWK: 0.6642831797041544
Epoch 17 QWK: 0.6794503337307811
Epoch 18 QWK: 0.6758605826679095
Epoch 19 QWK: 0.6654633571237433
Epoch 20 QWK: 0.668463320525918
Fold 0: Validation Loss = 0.5575118064880371, QWK = 0.668463320525918
Epoch 1 QWK: 0.5603739447648548
Epoch 2 QWK: 0.6514104859174088
Epoch 3 QWK: 0.6550700540956567
Epoch 4 QWK: 0.6794660518003203
Epoch 5 QWK: 0.6786375390715544
Epoch 6 QWK: 0.6611132779450574
Epoch 7 QWK: 0.5673913972272984
Epoch 8 QWK: 0.6850971011045335
Epoch 9 QWK: 0.65731009934

In [14]:
y_val_pred = model.predict(X_val_tf).flatten()
val_loss = model.evaluate(X_val_tf, y_val_tf, verbose=0)
qwk_score = calculate_quadratic_weighted_kappa(y_val_tf, y_val_pred)
print(f"After trained, Validation Loss = {val_loss}, QWK = {qwk_score}")

After trained, Validation Loss = 0.39022210240364075, QWK = 0.6872008864800343


### Áp dụng cho tập test

In [15]:
test = pd.read_csv('/content/drive/MyDrive/PTDLTM - Project /Notebook/learning-agency-lab-automated-essay-scoring-2/test.csv')
test_essay_id = test['essay_id']
test

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [16]:
test['cleaned_essay_text'] = test['full_text'].apply(clean_text)
test['sentences_count'], test['avg_sentence_length'], test['word_count'], test['avg_word_length'] = zip(*test['cleaned_essay_text'].apply(extract_features))
test


Unnamed: 0,essay_id,full_text,cleaned_essay_text,sentences_count,avg_sentence_length,word_count,avg_word_length
0,000d118,Many people have car where they live. The thin...,many people car live . thing n't know use car ...,13,127.538462,280,4.967857
1,000fe60,I am a scientist at NASA that is discussing th...,scientist nasa discussing `` face '' mars . ex...,21,45.190476,174,4.574713
2,001ab80,People always wish they had the same technolog...,"people always wish technology seen movies , be...",24,82.833333,328,5.134146


In [17]:
test = test.reset_index(drop=True)
test= test.drop(columns=["full_text", "essay_id"])

In [18]:
X_test_tfidf = tfidf_vectorizer.transform(test['cleaned_essay_text'])
X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

test = pd.concat([X_test_tfidf_df, test], axis=1)
test = test.drop(columns="cleaned_essay_text")

In [19]:
nn_pred = model.predict(test).flatten()



In [20]:
nn_pred = np.round(nn_pred, 0).astype(int)
nn_pred

array([2, 3, 5])

#### Sử dụng model Cat Boost

In [26]:
from sklearn.linear_model import LogisticRegression
import catboost
from catboost import CatBoostClassifier, Pool

In [34]:
X = train.drop(columns= ['score','fold'])
y = train['score']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [35]:
CAT = CatBoostClassifier()
CAT.fit(X,y)

Learning rate set to 0.091511
0:	learn: 1.6743384	total: 7.88s	remaining: 2h 11m 14s
1:	learn: 1.5786241	total: 14.1s	remaining: 1h 57m 17s
2:	learn: 1.5086154	total: 18.1s	remaining: 1h 40m 2s
3:	learn: 1.4461990	total: 22s	remaining: 1h 31m 19s
4:	learn: 1.3965453	total: 27.9s	remaining: 1h 32m 27s
5:	learn: 1.3546922	total: 31.9s	remaining: 1h 27m 58s
6:	learn: 1.3136987	total: 35.7s	remaining: 1h 24m 27s
7:	learn: 1.2822583	total: 40.4s	remaining: 1h 23m 25s
8:	learn: 1.2552980	total: 45.5s	remaining: 1h 23m 27s
9:	learn: 1.2285151	total: 49.4s	remaining: 1h 21m 26s
10:	learn: 1.2100955	total: 53.5s	remaining: 1h 20m 7s
11:	learn: 1.1912197	total: 59.2s	remaining: 1h 21m 15s
12:	learn: 1.1709875	total: 1m 3s	remaining: 1h 20m 20s
13:	learn: 1.1566059	total: 1m 7s	remaining: 1h 18m 56s
14:	learn: 1.1398709	total: 1m 12s	remaining: 1h 18m 50s
15:	learn: 1.1259343	total: 1m 17s	remaining: 1h 19m 14s
16:	learn: 1.1135491	total: 1m 21s	remaining: 1h 18m 7s
17:	learn: 1.1050350	total: 1m

<catboost.core.CatBoostClassifier at 0x79bb392ab550>

In [36]:
cat_test = CAT.predict(test)
cat_pred = cat_test.ravel()
cat_pred

array([3, 3, 4])