### Import nescessary library

In [2]:
#!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [5]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

import pandas as pd
import polars as pl
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

nltk.download("omw-1.4") # Open Multilingual WordNet
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("wordnet2022")
nltk.download("punkt")

Mounted at /gdrive


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet2022 to /root/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Read Data

In [13]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [16]:
train = pd.read_csv('/content/drive/MyDrive/PTDLTM - Project /Notebook/learning-agency-lab-automated-essay-scoring-2/train.csv')
train

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3
...,...,...,...
17302,ffd378d,"the story "" The Challenge of Exploing Venus "" ...",2
17303,ffddf1f,Technology has changed a lot of ways that we l...,4
17304,fff016d,If you don't like sitting around all day than ...,2
17305,fffb49b,"In ""The Challenge of Exporing Venus,"" the auth...",1


In [17]:
test = pd.read_csv('/content/drive/MyDrive/PTDLTM - Project /Notebook/learning-agency-lab-automated-essay-scoring-2/test.csv')
test_essay_id = test['essay_id']
test

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


### Clean text data

In [18]:
def clean_text(text):
    # Chuyển chữ viết hoa thành chữ thường
    text = text.lower()

    # Xóa các thẻ HTML
    text = re.compile(r'<.*?>').sub(r'', text)

    # Xóa các tag tên (mention)
    text = re.sub(r'@\w+\s*', '', text)

    # Xóa hashtag (dấu #)
    text = re.sub(r'#\w+', '', text)

    # Xóa các liên kết URL
    text = re.sub(r'http\S+|www\S+', '', text)

    # Xóa các ký tự không mong muốn như \xa0
    text = text.replace(u'\xa0', ' ')

    # Xóa chữ số
    text = re.sub(r'\d+', '', text)

    # Thay thế các khoảng trắng liên tiếp bằng một khoảng trắng duy nhất
    text = re.sub(r'\s+', ' ', text)

    # Mở rộng các từ viết tắt
    # text = expandContractions(text)

    # Thay thế các dấu chấm và dấu phẩy liên tiếp bằng một dấu duy nhất
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r'\,+', ',', text)

    # Xóa các khoảng trắng ở đầu và cuối chuỗi
    text = text.strip()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    text = ' '.join([word for word in words if word not in stop_words])
    return text

### Extract feature of text

In [19]:
def extract_features(text):
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    avg_sens_length = sum(len(sentence) for sentence in sentences) / num_sentences if num_sentences > 0 else 0

    words = word_tokenize(text)
    num_words = len(words)
    avg_word_length = sum(len(word) for word in words) / num_words if num_words > 0 else 0
    return num_sentences,avg_sens_length, num_words, avg_word_length

In [20]:
def score_normalise(n):
    temp = (n*6)/10
    temp = round(temp)
    if temp < 1:
        temp = 1
    elif temp > 6:
        temp = 6
    else:
        temp = temp
    return temp

def pred_processor(pred):
    predic = []
    for i in range(len(pred)):
        predic.append(pred[i][0])

    predic = list(map(round, predic))
    final_pred = [i if i >= 1 else 1 for i in predic]
    final_pred = [i if i <= 6 else 6 for i in final_pred]
    return final_pred

In [21]:
train['cleaned_essay_text'] = train['full_text'].apply(clean_text)
train[['full_text', 'cleaned_essay_text']].head()

Unnamed: 0,full_text,cleaned_essay_text
0,Many people have car where they live. The thin...,many people car live . thing n't know use car ...
1,I am a scientist at NASA that is discussing th...,scientist nasa discussing `` face '' mars . ex...
2,People always wish they had the same technolog...,"people always wish technology seen movies , be..."
3,"We all heard about Venus, the planet without a...","heard venus , planet without almost oxygen ear..."
4,"Dear, State Senator\n\nThis is a letter to arg...","dear , state senator letter argue favor keepin..."


In [22]:
# Apply feature extraction
train['sentences_count'], train['avg_sentence_length'], train['word_count'], train['avg_word_length'] = zip(*train['cleaned_essay_text'].apply(extract_features))

# Display the features
train[['cleaned_essay_text','sentences_count','avg_sentence_length', 'word_count', 'avg_word_length']].head()

Unnamed: 0,cleaned_essay_text,sentences_count,avg_sentence_length,word_count,avg_word_length
0,many people car live . thing n't know use car ...,13,127.538462,280,4.967857
1,scientist nasa discussing `` face '' mars . ex...,21,45.190476,174,4.574713
2,"people always wish technology seen movies , be...",24,82.833333,328,5.134146
3,"heard venus , planet without almost oxygen ear...",20,97.65,302,5.533113
4,"dear , state senator letter argue favor keepin...",15,100.8,225,5.786667


In [24]:
train = train.reset_index(drop=True)
y = train['score']
X = train.drop(columns=["full_text","essay_id","score"])
train.head()

Unnamed: 0,essay_id,full_text,score,cleaned_essay_text,sentences_count,avg_sentence_length,word_count,avg_word_length
0,000d118,Many people have car where they live. The thin...,3,many people car live . thing n't know use car ...,13,127.538462,280,4.967857
1,000fe60,I am a scientist at NASA that is discussing th...,3,scientist nasa discussing `` face '' mars . ex...,21,45.190476,174,4.574713
2,001ab80,People always wish they had the same technolog...,4,"people always wish technology seen movies , be...",24,82.833333,328,5.134146
3,001bdc0,"We all heard about Venus, the planet without a...",4,"heard venus , planet without almost oxygen ear...",20,97.65,302,5.533113
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,"dear , state senator letter argue favor keepin...",15,100.8,225,5.786667


In [25]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True,
    max_features=5000
)

# Fit and transform the text data
X_tfidf = tfidf_vectorizer.fit_transform(train['cleaned_essay_text'])

# Convert to DataFrame
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine TF-IDF features with other features
X = pd.concat([X_tfidf_df, X], axis=1)
X = X.drop(columns="cleaned_essay_text")

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_tf = tf.constant(X_train.values, dtype=tf.float32)
X_test_tf = tf.constant(X_test.values, dtype=tf.float32)
y_train_tf = tf.constant(y_train.values, dtype=tf.float32)
y_test_tf = tf.constant(y_test.values, dtype=tf.float32)

In [28]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam',
              loss='mean_squared_error')
model.fit(X_train_tf, y_train_tf, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7cc6fc9c10c0>

In [29]:
pred = model.predict(X_test)
final_pred = pred_processor(pred)



In [30]:
from sklearn.metrics import cohen_kappa_score
qwk = cohen_kappa_score(y_test, final_pred, weights='quadratic')
qwk

0.7847213552368777