In [1]:
!pip install textstat --no-index --find-links=file:///kaggle/input/packages/textstat/ 
!pip install vaderSentiment --no-index --find-links=file:///kaggle/input/packages/vaderSentiment/ 
!pip install pyspellchecker --no-index --find-links=file:///kaggle/input/packages/pyspellchecker/     

Looking in links: file:///kaggle/input/packages/textstat/
Processing /kaggle/input/packages/textstat/textstat-0.7.3-py3-none-any.whl
Processing /kaggle/input/packages/textstat/pyphen-0.15.0-py3-none-any.whl (from textstat)
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3
Looking in links: file:///kaggle/input/packages/vaderSentiment/
Processing /kaggle/input/packages/vaderSentiment/vaderSentiment-3.3.2-py2.py3-none-any.whl
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Looking in links: file:///kaggle/input/packages/pyspellchecker/
Processing /kaggle/input/packages/pyspellchecker/pyspellchecker-0.8.1-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from textstat import textstat
import spacy
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
import os
import pickle
from sklearn.preprocessing import StandardScaler
from spellchecker import SpellChecker
from collections import Counter
from sklearn.preprocessing import minmax_scale
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
# Shared spaCy English pipeline; the feature extractors below call nlp(text)
# to get tokens, morphology, sentence boundaries and dependency labels.
nlp = spacy.load('en_core_web_sm')
# Module-level VADER sentiment analyzer instance.
analyzer = SentimentIntensityAnalyzer()
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
def textstat_features(text):
    """Collect textstat metrics for a single text.

    Each metric listed below is looked up on the ``textstat`` object under the
    same name and called with ``text``; ``text_standard`` is additionally
    asked for a numeric result via ``float_output=True``.

    Args:
        text: The essay text to measure.

    Returns:
        dict mapping metric name to the value textstat returned, with keys in
        the order listed in ``metric_names``.
    """
    metric_names = (
        'flesch_reading_ease',
        'flesch_kincaid_grade',
        'smog_index',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
        'text_standard',
        'spache_readability',
        'mcalpine_eflaw',
        'reading_time',
        'syllable_count',
        'lexicon_count',
        'monosyllabcount',
    )

    features = {}
    for name in metric_names:
        metric = getattr(textstat, name)
        if name == 'text_standard':
            # The only metric that needs an extra argument.
            features[name] = metric(text, float_output=True)
        else:
            features[name] = metric(text)

    return features



# Shared SpellChecker instance, created lazily by _get_spell_checker().
_spell_checker = None


def _get_spell_checker():
    """Return the shared SpellChecker, building it on first use."""
    global _spell_checker
    if _spell_checker is None:
        _spell_checker = SpellChecker()
    return _spell_checker


def spelling_features(text):
    """Count the tokens of ``text`` that the spell checker does not know.

    Args:
        text: The essay text.

    Returns:
        dict with two keys:
            'misspelled_count': size of the collection returned by
                ``SpellChecker.unknown`` for the NLTK tokens of ``text``.
            'misspelled_ratio': that count divided by the number of tokens,
                or 0.0 when the text has no tokens.

    NOTE(review): the previous version wrote the ratio under
    'misspelled_count' and never produced 'misspelled_ratio'. A model trained
    on features from that version saw the ratio in the 'misspelled_count'
    column -- confirm against the training code before reusing old weights.
    """
    words = nltk.word_tokenize(text)

    # Reuse one checker so its dictionary is not rebuilt for every essay.
    misspelled_count = len(_get_spell_checker().unknown(words))

    # An empty or whitespace-only text has no tokens; avoid dividing by zero.
    misspelled_ratio = misspelled_count / len(words) if words else 0.0

    return {
        'misspelled_count': misspelled_count,
        'misspelled_ratio': misspelled_ratio,
    }


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _mean_or_zero(values):
    """Mean of ``values``; 0.0 for an empty list (np.mean would return NaN)."""
    return np.mean(values) if len(values) else 0.0


def _std_or_zero(values):
    """Standard deviation of ``values``; 0.0 for an empty list."""
    return np.std(values) if len(values) else 0.0


def extract_linguistic_features(text):
    """Compute tense, length, paragraph and sentiment features for one text.

    Args:
        text: The essay text. Paragraphs are taken to be separated by a blank
            line ('\\n\\n').

    Returns:
        dict of features, in this key order:
            past_tense_ratio, present_tense_ratio: share of spaCy tokens whose
                first "Tense" morph value is "Past" / "Pres" (denominator is
                past + present + 1e-5, so it is never zero).
            word_count, sentence_count, words_per_sentence,
            std_words_per_sentence: from the spaCy doc and its sentences.
            unique_words, lexical_diversity: distinct token texts and their
                share of word_count.
            paragraph_count, avg_chars_by_paragraph, avg_words_by_paragraph,
            avg_sentences_by_paragraph: per-paragraph averages using NLTK
                tokenizers.
            mean_* / std_* for compound, negative, positive, neutral: VADER
                scores aggregated over NLTK sentences.

        Ratios and aggregates over empty collections are reported as 0.0
        instead of raising ZeroDivisionError or yielding NaN.
    """
    doc = nlp(text)
    features = {}

    # tense features
    tense_counts = Counter(
        tense[0]
        for tense in (token.morph.get("Tense") for token in doc)
        if tense
    )
    past = tense_counts.get("Past", 0)
    present = tense_counts.get("Pres", 0)
    tense_total = past + present + 1e-5
    features['past_tense_ratio'] = past / tense_total
    features['present_tense_ratio'] = present / tense_total

    # length features (walk the sentences once and reuse the lengths)
    sentence_lengths = [len(sentence) for sentence in doc.sents]
    features['word_count'] = len(doc)
    features['sentence_count'] = len(sentence_lengths)
    features['words_per_sentence'] = _safe_ratio(
        features['word_count'], features['sentence_count']
    )
    features['std_words_per_sentence'] = _std_or_zero(sentence_lengths)

    features['unique_words'] = len({token.text for token in doc})
    features['lexical_diversity'] = _safe_ratio(
        features['unique_words'], features['word_count']
    )

    # paragraph features; str.split always returns at least one element,
    # so the means below are never taken over an empty list
    paragraphs = text.split('\n\n')
    features['paragraph_count'] = len(paragraphs)
    features['avg_chars_by_paragraph'] = np.mean([len(p) for p in paragraphs])
    features['avg_words_by_paragraph'] = np.mean(
        [len(nltk.word_tokenize(p)) for p in paragraphs]
    )
    features['avg_sentences_by_paragraph'] = np.mean(
        [len(nltk.sent_tokenize(p)) for p in paragraphs]
    )

    # sentiment features
    analyzer = SentimentIntensityAnalyzer()
    sentence_scores = [
        analyzer.polarity_scores(sentence)
        for sentence in nltk.sent_tokenize(text)
    ]
    # (VADER key, feature-name suffix); all means are written before all stds
    # so the key order of the returned dict stays stable.
    score_keys = (
        ('compound', 'compound'),
        ('neg', 'negative'),
        ('pos', 'positive'),
        ('neu', 'neutral'),
    )
    per_key = {
        suffix: [scores[key] for scores in sentence_scores]
        for key, suffix in score_keys
    }
    for _, suffix in score_keys:
        features["mean_" + suffix] = _mean_or_zero(per_key[suffix])
    for _, suffix in score_keys:
        features["std_" + suffix] = _std_or_zero(per_key[suffix])

    return features



def extract_features(text):
    """Build the core length, readability and part-of-speech features.

    Args:
        text: The essay text.

    Returns:
        dict with keys 'num_words', 'num_sentences', 'avg_sentence_length',
        'readability', 'num_clauses', 'avg_clause_length', 'unique_words',
        'pos_tags' (the list returned by ``pos_tag``) and 'pos_counts'
        (tag -> number of occurrences). The two averages are 0 when their
        divisor is 0.
    """
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    token_total = len(tokens)

    # Sentence statistics come from textstat, not from the token list.
    sentence_total = textstat.sentence_count(text)
    if sentence_total != 0:
        mean_sentence_len = token_total / sentence_total
    else:
        mean_sentence_len = 0

    reading_ease = textstat.flesch_reading_ease(text)

    # "Clauses" are counted as tokens carrying the ROOT dependency label.
    parsed = nlp(text)
    root_total = len([token for token in parsed if token.dep_ == 'ROOT'])
    if root_total != 0:
        mean_clause_len = token_total / root_total
    else:
        mean_clause_len = 0

    distinct_tokens = set(tokens)
    tag_frequencies = nltk.FreqDist(tag for _word, tag in tagged_tokens)

    features = {}
    features['num_words'] = token_total
    features['num_sentences'] = sentence_total
    features['avg_sentence_length'] = mean_sentence_len
    features['readability'] = reading_ease
    features['num_clauses'] = root_total
    features['avg_clause_length'] = mean_clause_len
    features['unique_words'] = len(distinct_tokens)
    features['pos_tags'] = tagged_tokens
    features['pos_counts'] = dict(tag_frequencies)
    return features





In [5]:
# Columns that are never part of the numeric feature vector.
not_feature_list = ['essay_id', 'full_text', 'score', 'pos_counts','pos_tags']


def flatten_features(train_data):
    """Attach a per-row float32 tensor of all feature columns.

    Every column whose name is not in ``not_feature_list`` is treated as a
    feature. The selected names are printed, then each row's values are
    packed into a ``torch.float32`` tensor stored in a new
    'flattened_features' column.

    Args:
        train_data: DataFrame holding the feature columns.

    Returns:
        The same DataFrame object, modified in place with the extra column.
    """
    feature_columns = []
    for column in train_data.columns:
        if column not in not_feature_list:
            feature_columns.append(column)

    print(feature_columns)

    # One Python list of values per row, then one tensor per row.
    row_values = train_data[feature_columns].values.tolist()
    train_data['flattened_features'] = [
        torch.tensor(row, dtype=torch.float32) for row in row_values
    ]

    return train_data


def extract_all_feature(train_data):
    """Run every feature extractor over 'full_text' and append the results.

    The four extractors (textstat_features, spelling_features,
    extract_linguistic_features, extract_features) are applied to each text
    and their outputs are appended as columns, in that order, to the right of
    the existing columns.

    Unlike the earlier version, the input frame is left untouched and the new
    columns are built on the input's own index, so rows stay aligned even
    when that index is not 0..n-1 (for example after filtering or a
    train/test split).

    Args:
        train_data: DataFrame with a 'full_text' column.

    Returns:
        A new DataFrame with the original columns followed by the feature
        columns. Both extract_linguistic_features and extract_features emit a
        'unique_words' key, so that column name appears twice; this is kept
        as-is because downstream column selection depends on it.
    """
    print('extracting all features')

    texts = train_data['full_text']
    index = train_data.index

    textstat_df = pd.DataFrame(texts.apply(textstat_features).tolist(), index=index)

    # Explicit columns keep both names present even if an extractor omits one.
    spell_check_df = pd.DataFrame(
        texts.apply(spelling_features).tolist(),
        columns=['misspelled_count', 'misspelled_ratio'],
        index=index,
    )

    linguistic_df = pd.json_normalize(texts.apply(extract_linguistic_features).tolist())
    linguistic_df.index = index

    core_df = pd.DataFrame(texts.apply(extract_features).tolist(), index=index)

    return pd.concat(
        [train_data, textstat_df, spell_check_df, linguistic_df, core_df],
        axis=1,
    )



def select_features(train_data,features=[]):
    """Return ``train_data`` restricted to the columns named in ``features``.

    Columns whose name is not in ``features`` are dropped; names in
    ``features`` that the frame does not have are ignored. The surviving
    columns keep their original order.

    Args:
        train_data: Source DataFrame (not modified).
        features: Iterable of column names to keep.

    Returns:
        A new DataFrame without the unwanted columns.
    """
    unwanted = set(train_data.columns).difference(features)
    return train_data.drop(columns=list(unwanted))
    

def normalize_features(train_data,features_to_normalize=[]):
    """Min-max scale the given columns of ``train_data`` in place.

    Args:
        train_data: DataFrame to scale; it is modified in place.
        features_to_normalize: Column names to scale. With no names (the
            default) or a frame with no rows, the frame is returned unchanged.

    Returns:
        The same DataFrame object.

    NOTE(review): the scaling range is computed from the frame passed in, so
    values are only comparable with another dataset's if both were scaled
    the same way -- confirm this matches how the model's inputs were scaled
    during training.
    """
    # minmax_scale rejects input with zero columns or zero rows, so there is
    # nothing to do in those cases (this includes the default argument).
    if len(features_to_normalize) == 0 or len(train_data) == 0:
        return train_data

    train_data[features_to_normalize] = minmax_scale(train_data[features_to_normalize])
    return train_data
    

    

In [6]:
# Names of the feature columns that the test-data cell keeps (via
# select_features) and scales (via normalize_features).
# NOTE(review): 'misspelled_count' and 'unique_words' each appear twice in
# this list. The second 'misspelled_count' looks like it was meant to be
# 'misspelled_ratio' -- TODO confirm. Do not de-duplicate or rename without
# checking the saved model: presumably its expected feature width was fixed
# with the columns this list currently selects.
features_to_use = [
'flesch_reading_ease',
'flesch_kincaid_grade',
'smog_index',
'coleman_liau_index',
'automated_readability_index',
'dale_chall_readability_score',
'difficult_words',
'linsear_write_formula',
'gunning_fog',
'text_standard',
'spache_readability',
'mcalpine_eflaw',
'reading_time',
'syllable_count',
'lexicon_count',
'monosyllabcount',
'misspelled_count',
'misspelled_count',
'past_tense_ratio',
'present_tense_ratio',
'word_count',
'sentence_count',
'words_per_sentence',
'std_words_per_sentence',
'unique_words',
'lexical_diversity',
'paragraph_count',
'avg_chars_by_paragraph',
'avg_words_by_paragraph',
'avg_sentences_by_paragraph',
'num_words',
'num_sentences',
'avg_sentence_length',
'readability',
'num_clauses',
'avg_clause_length',
'unique_words',
"mean_compound",
"mean_negative",
"mean_positive",
"mean_neutral",
"std_compound",
"std_negative",
"std_positive",
"std_neutral",
]




In [7]:


# POS-tag -> integer id lookup used by EssayDataset.
# pickle can run arbitrary code while loading, so only read files from a
# trusted source. The context manager makes sure the file handle is closed.
with open('/kaggle/input/aes-lstm/pos2idx', 'rb') as pos2idx_file:
    pos2idx = pickle.load(pos2idx_file)



class EssayDataset(Dataset):
    """Dataset view over a feature DataFrame, one essay per item.

    Each item is a pair ``(pos_tag_ids, flattened_features)``:
    ``pos_tag_ids`` is a ``torch.long`` tensor built by mapping the tag of
    every ``(token, tag)`` pair in the row's 'pos_tags' cell through
    ``pos2idx`` (tags missing from the mapping become 0), and
    ``flattened_features`` is the row's 'flattened_features' cell returned
    unchanged.
    """

    def __init__(self, data, pos2idx):
        self.pos2idx = pos2idx
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tag_ids = [self.pos2idx.get(tag, 0) for _token, tag in row['pos_tags']]
        return torch.tensor(tag_ids, dtype=torch.long), row['flattened_features']
    
def collate_fn(batch):
    """Combine ``(pos_tags, features)`` items into batched tensors.

    Args:
        batch: Sequence of ``(pos_tags, features)`` pairs, where ``pos_tags``
            is a 1-D tensor whose length may vary per item and ``features``
            are tensors of equal shape.

    Returns:
        ``(pos_tags_padded, features_stacked)``, both moved to ``device``.
        Shorter tag sequences are right-padded with 0 and the batch is the
        first dimension.
    """
    tag_sequences, feature_vectors = zip(*batch)

    padded_tags = nn.utils.rnn.pad_sequence(
        tag_sequences, batch_first=True, padding_value=0
    )
    stacked_features = torch.stack(feature_vectors)

    return padded_tags.to(device), stacked_features.to(device)





In [8]:
# Load the competition test essays.
test_dir = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv' 
test_data = pd.read_csv(test_dir)

# Feature pipeline: extract every feature, keep the model's feature columns
# plus the bookkeeping columns, scale, then pack each row into one tensor.
test_data = extract_all_feature(test_data)
test_data = select_features(test_data,features_to_use+not_feature_list)
# NOTE(review): normalize_features scales using the rows it is given, i.e.
# the min/max of the test essays themselves.
test_data = normalize_features(test_data,features_to_use)
test_data = flatten_features(test_data)



# One essay per batch, in file order (shuffle=False), so predictions can be
# assigned back to test_data row by row.
test_dataset = EssayDataset(test_data, pos2idx)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)


extracting all features
['flesch_reading_ease', 'flesch_kincaid_grade', 'smog_index', 'coleman_liau_index', 'automated_readability_index', 'dale_chall_readability_score', 'difficult_words', 'linsear_write_formula', 'gunning_fog', 'text_standard', 'spache_readability', 'mcalpine_eflaw', 'reading_time', 'syllable_count', 'lexicon_count', 'monosyllabcount', 'misspelled_count', 'past_tense_ratio', 'present_tense_ratio', 'word_count', 'sentence_count', 'words_per_sentence', 'std_words_per_sentence', 'unique_words', 'lexical_diversity', 'paragraph_count', 'avg_chars_by_paragraph', 'avg_words_by_paragraph', 'avg_sentences_by_paragraph', 'mean_compound', 'mean_negative', 'mean_positive', 'mean_neutral', 'std_compound', 'std_negative', 'std_positive', 'std_neutral', 'num_words', 'num_sentences', 'avg_sentence_length', 'readability', 'num_clauses', 'avg_clause_length', 'unique_words']


In [9]:

class Attention(nn.Module):
    """Attention pooling over the sequence dimension.

    A single bias-free linear layer scores every time step; the scores are
    soft-maxed along dim 1 and used as weights for a weighted sum of the
    inputs.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, lstm_output):
        """Pool ``lstm_output`` of shape (batch, seq_len, hidden_dim).

        Returns:
            ``(context, attn_weights)`` with shapes (batch, hidden_dim) and
            (batch, seq_len, 1).
        """
        scores = self.attention(lstm_output)          # (batch, seq_len, 1)
        attn_weights = torch.softmax(scores, dim=1)   # normalise over time steps
        # (batch, 1, seq_len) x (batch, seq_len, hidden_dim) -> (batch, 1, hidden_dim)
        context = attn_weights.transpose(1, 2).bmm(lstm_output)
        return context.squeeze(1), attn_weights

class EssayScoringModel(nn.Module):
    """Score an essay from its POS-tag sequence plus hand-crafted features.

    Pipeline: one-hot POS ids -> Conv1d -> attention pooling -> bidirectional
    LSTM (over the single pooled step) -> attention pooling -> concatenation
    with the feature vector -> three fully connected layers -> sigmoid scaled
    to the range (1, 6).

    Args:
        embedding_dim: Width of the one-hot POS encoding, i.e. the number of
            POS classes; every id in ``pos_tags`` must be < embedding_dim.
        hidden_dim: Number of convolution filters and LSTM hidden units.
        feature_dim: Length of the hand-crafted feature vector.
        dropout_prob: Dropout probability used after the conv, LSTM and fc1.
    """

    def __init__(self, embedding_dim, hidden_dim, feature_dim, dropout_prob):
        super(EssayScoringModel, self).__init__()
        # Kept so forward() one-hot encodes to the width conv1 expects instead
        # of a hard-coded 50. A plain attribute: not part of the state_dict.
        self.embedding_dim = embedding_dim

        self.conv1 = nn.Conv1d(embedding_dim, hidden_dim, kernel_size=5, padding='same')
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.lstm_attention = Attention(hidden_dim*2)
        self.conv1_attention = Attention(hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)

        self.fc1 = nn.Linear(hidden_dim * 2 + feature_dim, 256)  # Increased capacity
        self.fc2 = nn.Linear(256, 128)  # Added another fully connected layer
        self.fc3 = nn.Linear(128, 1)

    def forward(self, pos_tags, features):
        """Return scores of shape (batch_size, 1).

        Args:
            pos_tags: Long tensor of POS ids, shape (batch_size, seq_length).
            features: Float tensor, shape (batch_size, feature_dim).
        """
        # One-hot "embedding" of the POS ids
        embeds = F.one_hot(pos_tags, num_classes=self.embedding_dim).float()  # (batch_size, seq_length, embedding_dim)

        # Convolutional layer (Conv1d wants channels before length)
        conv_out = F.relu(self.conv1(embeds.transpose(1, 2)))  # (batch_size, hidden_dim, seq_length)
        conv_out = self.dropout(conv_out)

        # Attention pooling after convolution
        attn_conv_out = self.conv1_attention(conv_out.transpose(1, 2))[0]  # (batch_size, hidden_dim)

        # LSTM layer, fed the pooled vector as a length-1 sequence
        lstm_out, _ = self.lstm(attn_conv_out.unsqueeze(1))  # (batch_size, 1, hidden_dim * 2)
        lstm_out = self.dropout(lstm_out)

        # Attention pooling after LSTM
        attn_lstm_out = self.lstm_attention(lstm_out)[0]  # (batch_size, hidden_dim * 2)

        # Concatenate LSTM output with additional features
        combined = torch.cat((attn_lstm_out, features), dim=1)  # (batch_size, hidden_dim * 2 + feature_dim)

        x = F.relu(self.fc1(combined))  # (batch_size, 256)
        x = self.dropout(x)
        x = F.relu(self.fc2(x))  # (batch_size, 128)
        x = self.fc3(x)  # (batch_size, 1)
        x = torch.sigmoid(x) * 5 + 1  # scores are between 1 and 6

        return x


In [10]:

'''
POS Embeddings Output Vector Dimensions 50
ConvolutionalLayer
Number of Filters 100
Filter Length 5
LSTM Layer Output Dimensions 100
Dropout Probability 0.5
Non Prompt-specific Features Vector Dimensions 86

'''

class ModelParameters:
    """Empty attribute container for the saved model hyper-parameters.

    The object unpickled from the model_parameters file is read for its
    ``embedding_dim``, ``hidden_dim`` and ``feature_dim`` attributes;
    presumably that object was pickled as an instance of a class with this
    name, which therefore has to be defined before unpickling -- TODO confirm
    against the training notebook.
    """

    def __init__(self):
        pass

# Hyper-parameters saved with the checkpoint. pickle can run arbitrary code
# while loading, so only read files from a trusted source.
with open('/kaggle/input/aes-lstm/model_parameters', 'rb') as params_file:
    model_parameters = pickle.load(params_file)

embedding_dim = model_parameters.embedding_dim
hidden_dim = model_parameters.hidden_dim
feature_dim = model_parameters.feature_dim

# dropout_prob=0: no dropout at inference time.
model = EssayScoringModel(embedding_dim, hidden_dim, feature_dim,dropout_prob=0).to(device)
# map_location lets a checkpoint saved from GPU tensors load when `device`
# is the CPU.
model.load_state_dict(
    torch.load('/kaggle/input/aes-lstm/essay_scoring_best_model.pth', map_location=device)
)
model.eval()


EssayScoringModel(
  (conv1): Conv1d(50, 100, kernel_size=(5,), stride=(1,), padding=same)
  (lstm): LSTM(100, 100, batch_first=True, bidirectional=True)
  (lstm_attention): Attention(
    (attention): Linear(in_features=200, out_features=1, bias=False)
  )
  (conv1_attention): Attention(
    (attention): Linear(in_features=100, out_features=1, bias=False)
  )
  (dropout): Dropout(p=0, inplace=False)
  (fc1): Linear(in_features=246, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
)

In [11]:

# Raw (unrounded) model outputs, one float per essay, in test_loader order.
test_preds = []

with torch.no_grad():
    for pos_tags, features in test_loader:
        batch_scores = model(pos_tags, features)  # (batch_size, 1)
        # Flatten so this works for any batch size, not only batch_size=1.
        test_preds.extend(batch_scores.reshape(-1).tolist())

# Scores are submitted as integers; round() maps each prediction to the
# nearest whole score.
test_data['score'] = [round(pred) for pred in test_preds]
submission = test_data[['essay_id', 'score']]
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!


In [12]:
# Preview the first rows of the submission frame as a sanity check.
submission.head()

Unnamed: 0,essay_id,score
0,000d118,1
1,000fe60,2
2,001ab80,5
