# Loading Data

In [13]:
import numpy as np    # Math operations and arrays
import pandas as pd   # To work with tables

In [14]:
# Loading Training Data
train_data = pd.read_csv("../data/raw/training_set_rel3.tsv", delimiter="\t", encoding='ISO-8859-1')
train_data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [15]:
# Selecting Relevant Columns
required_columns = ['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'domain1_score']
train_data = train_data[required_columns]

# Features Extraction

## 1) Vocabulary Features
### a) Length Based Features

In [16]:
import nltk
nltk.data.path.append(r"C:\Users\ehsan\OneDrive\Desktop\Automated-Grading-of-Exam-Papers-Using-Fusion-based-LSTM-Architecture\Data\nltk_data")

In [17]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Stopwords in English
stopword_list = set(stopwords.words('english'))

def extract_text_features(text):
    """
    Computes various length-based features for a given essay text.
    """
    # Splitting the text into words and sentences
    sentence_list = text.split('.')
    word_list = text.split()
    
    # Counting words and sentences
    total_words = len(word_list)
    total_sentences = len(sentence_list)
    
    # Computing average lengths
    avg_word_size = sum(len(word) for word in word_list) / total_words if total_words else 0
    avg_sentence_size = total_words / total_sentences if total_sentences else 0
    
    # Categorizing words by length
    min_word_size = 4  # Words shorter than this are considered short
    max_word_size = 6  # Words longer than this are considered long
    long_words = sum(1 for word in word_list if len(word) > max_word_size)
    short_words = sum(1 for word in word_list if len(word) < min_word_size)
    
    # Identifying unique words and non-stopwords
    distinct_words = set(word_list)
    filtered_words = [word for word in word_list if word.lower() not in stopword_list]
    
    # Summarizing extracted features
    feature_dict = {
        'total_words': total_words,
        'distinct_word_count': len(distinct_words),
        'filtered_word_count': len(filtered_words),
        'avg_sentence_size': avg_sentence_size,
        'avg_word_size': avg_word_size,
        'total_sentences': total_sentences,
        'long_words': long_words,
        'short_words': short_words
    }
    
    return feature_dict

# Applying Feature Extraction to Essays
train_data['text_features'] = train_data['essay'].apply(extract_text_features)

# Expanding Feature Dictionary into Separate Columns
feature_columns = ['total_words', 'distinct_word_count', 'filtered_word_count', 
                   'avg_sentence_size', 'avg_word_size', 'total_sentences', 
                   'long_words', 'short_words']

train_data[feature_columns] = train_data['text_features'].apply(pd.Series)
train_data[feature_columns].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,total_words,distinct_word_count,filtered_word_count,avg_sentence_size,avg_word_size,total_sentences,long_words,short_words
0,338.0,184.0,170.0,30.727273,4.550296,11.0,67.0,138.0
1,419.0,216.0,230.0,22.052632,4.463007,19.0,86.0,169.0
2,279.0,167.0,139.0,18.6,4.526882,15.0,56.0,119.0
3,524.0,275.0,302.0,20.96,5.041985,25.0,140.0,182.0
4,465.0,226.0,229.0,15.0,4.526882,31.0,95.0,192.0


### b) Part of Speech Features

In [18]:
from collections import Counter
from nltk import word_tokenize, pos_tag
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def extract_pos_features(text):
    """
    Analyzes the Part-of-Speech (POS) distribution in a given text.
    """
    # Tokenizing and tagging words with their respective POS labels
    word_list = word_tokenize(text)
    tagged_words = pos_tag(word_list)
    
    # Counting occurrences of each POS category
    pos_frequencies = Counter(tag for _, tag in tagged_words)
    
    # Mapping specific POS categories to broader labels
    pos_summary = {
        'noun_count': pos_frequencies.get('NN', 0) + pos_frequencies.get('NNS', 0),
        'adjective_count': pos_frequencies.get('JJ', 0),
        'pronoun_count': pos_frequencies.get('PRP', 0) + pos_frequencies.get('PRP$', 0),
        'verb_count': sum(pos_frequencies.get(tag, 0) for tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']),
        'conjunction_count': pos_frequencies.get('CC', 0),
        'adverb_count': pos_frequencies.get('RB', 0),
        'determiner_count': pos_frequencies.get('DT', 0),
        'proper_noun_count': pos_frequencies.get('NNP', 0) + pos_frequencies.get('NNPS', 0),
        'numeral_count': pos_frequencies.get('CD', 0),
        'interjection_count': pos_frequencies.get('UH', 0)
    }
    
    return pos_summary

# Applying POS feature extraction to the dataset
train_data['pos_analysis'] = train_data['essay'].apply(extract_pos_features)

# Expanding the extracted POS data into separate columns
pos_columns = ['noun_count', 'adjective_count', 'pronoun_count', 'verb_count', 
               'conjunction_count', 'adverb_count', 'determiner_count', 
               'proper_noun_count', 'numeral_count', 'interjection_count']

train_data[pos_columns] = train_data['pos_analysis'].apply(pd.Series)
train_data[pos_columns].head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,noun_count,adjective_count,pronoun_count,verb_count,conjunction_count,adverb_count,determiner_count,proper_noun_count,numeral_count,interjection_count
0,72,20,41,68,14,21,20,12,0,0
1,96,19,42,85,18,17,35,18,4,0
2,72,15,16,53,17,11,27,14,2,0
3,126,42,23,100,18,26,43,71,0,0
4,107,23,28,87,15,34,54,9,5,0


### c) Punctuation Features

In [19]:
# Function to derive punctuation-based characteristics
def analyze_punctuation(text):

    # Counting different punctuation marks
    dot_count = text.count('.')
    comma_occurrences = text.count(',')
    question_marks = text.count('?')
    exclamations = text.count('!')
    quote_marks = text.count('"') + text.count("'")  # Including both single and double quotation marks
    colon_occurrences = text.count(':')
    semicolon_occurrences = text.count(';')
    bracket_pairs = text.count('(') + text.count(')')  # Counting both opening and closing brackets
    dash_count = text.count('-')
    triple_dots = text.count('...')  # Counting occurrences of ellipsis

    # Compiling extracted features into a structured dictionary
    punctuation_data = {
        'dot_count': dot_count,
        'comma_occurrences': comma_occurrences,
        'question_marks': question_marks,
        'exclamations': exclamations,
        'colon_occurrences': colon_occurrences,
        'semicolon_occurrences': semicolon_occurrences,
        'bracket_pairs': bracket_pairs // 2,  # Adjusting for paired brackets
        'dash_count': dash_count,
        'triple_dots': triple_dots
    }

    return punctuation_data

# Applying the punctuation feature extraction
train_data['punctuation_analysis'] = train_data['essay'].apply(analyze_punctuation)
punctuation_columns = ['dot_count', 'comma_occurrences', 'question_marks', 
                       'exclamations', 'colon_occurrences', 'semicolon_occurrences', 
                       'bracket_pairs', 'dash_count', 'triple_dots']
train_data[punctuation_columns] = train_data['punctuation_analysis'].apply(pd.Series)
train_data[punctuation_columns].head()

Unnamed: 0,dot_count,comma_occurrences,question_marks,exclamations,colon_occurrences,semicolon_occurrences,bracket_pairs,dash_count,triple_dots
0,10,18,2,4,1,0,1,2,0
1,18,12,1,1,0,0,0,1,0
2,14,9,0,0,0,0,0,0,0
3,24,13,1,2,0,0,0,3,0
4,30,13,0,0,0,0,0,2,0


In [20]:
vocab_features = ['essay_id', 'total_words', 'distinct_word_count', 'filtered_word_count', 
                    'avg_sentence_size', 'avg_word_size', 'total_sentences', 
                    'long_words', 'short_words', 'noun_count', 'adjective_count', 
                    'pronoun_count', 'verb_count', 'conjunction_count', 
                    'adverb_count', 'determiner_count', 'proper_noun_count', 
                    'numeral_count', 'interjection_count', 'dot_count', 
                    'comma_occurrences', 'question_marks', 'exclamations', 
                    'colon_occurrences', 'semicolon_occurrences', 'bracket_pairs', 
                    'dash_count', 'triple_dots']
vocab_features_df = train_data[vocab_features]
vocab_features_df.to_csv("../data/features/vocabulary_features.csv", index=False)
vocab_features_df.head()

Unnamed: 0,essay_id,total_words,distinct_word_count,filtered_word_count,avg_sentence_size,avg_word_size,total_sentences,long_words,short_words,noun_count,...,interjection_count,dot_count,comma_occurrences,question_marks,exclamations,colon_occurrences,semicolon_occurrences,bracket_pairs,dash_count,triple_dots
0,1,338.0,184.0,170.0,30.727273,4.550296,11.0,67.0,138.0,72,...,0,10,18,2,4,1,0,1,2,0
1,2,419.0,216.0,230.0,22.052632,4.463007,19.0,86.0,169.0,96,...,0,18,12,1,1,0,0,0,1,0
2,3,279.0,167.0,139.0,18.6,4.526882,15.0,56.0,119.0,72,...,0,14,9,0,0,0,0,0,0,0
3,4,524.0,275.0,302.0,20.96,5.041985,25.0,140.0,182.0,126,...,0,24,13,1,2,0,0,0,3,0
4,5,465.0,226.0,229.0,15.0,4.526882,31.0,95.0,192.0,107,...,0,30,13,0,0,0,0,0,2,0


## 2) Sentence Level Features
### a) Readability Features

In [21]:
!pip install textstat -q

In [22]:
from textstat import flesch_kincaid_grade, automated_readability_index, coleman_liau_index
from textstat import lix, flesch_reading_ease, gunning_fog, smog_index, rix, dale_chall_readability_score

def compute_readability_metrics(text):
    
    readability_scores = {
        'flesch_kincaid': flesch_kincaid_grade(text),
        'automated_readability': automated_readability_index(text),
        'coleman_liau': coleman_liau_index(text),
        'lix_score': lix(text),
        'flesch_reading_ease': flesch_reading_ease(text),
        'gunning_fog_index': gunning_fog(text),
        'smog_index': smog_index(text),
        'rix_score': rix(text),
        'dale_chall_score': dale_chall_readability_score(text)
    }
    return readability_scores

# Extracting and storing readability features
train_data['readability_metrics'] = train_data['essay'].apply(compute_readability_metrics)
readability_columns = ['flesch_kincaid', 'automated_readability', 'coleman_liau', 
                       'lix_score', 'flesch_reading_ease', 'gunning_fog_index', 
                       'smog_index', 'rix_score', 'dale_chall_score']

train_data[readability_columns] = train_data['readability_metrics'].apply(pd.Series)
train_data[readability_columns].head()

Unnamed: 0,flesch_kincaid,automated_readability,coleman_liau,lix_score,flesch_reading_ease,gunning_fog_index,smog_index,rix_score,dale_chall_score
0,9.7,11.3,8.54,42.32,65.56,10.42,10.7,3.93,7.0
1,9.1,10.1,7.95,41.53,67.08,10.31,11.8,4.1,7.28
2,9.9,9.9,8.3,39.97,59.74,10.54,12.3,3.79,7.62
3,10.9,12.0,10.97,46.12,51.78,10.97,12.4,4.89,8.34
4,8.2,7.7,8.0,35.93,64.2,8.01,11.1,2.93,6.51


In [23]:
from nltk.tokenize import word_tokenize, sent_tokenize
from textstat import syllable_count, difficult_words, lexicon_count

def extract_sentence_features(text):

    # Tokenize words and sentences
    tokens = word_tokenize(text)
    total_syllables = syllable_count(text)
    long_word_count = sum(1 for token in tokens if len(token) > 6)
    
    sentence_list = sent_tokenize(text)
    paragraph_list = text.split('\n')

    # Extracting various metrics related to sentence structure and complexity
    sentence_metrics = {
        'avg_char_per_word': sum(len(token) for token in tokens) / len(tokens) if tokens else 0,
        'avg_syll_per_word': total_syllables / len(tokens) if tokens else 0,
        'unique_word_count': len(set(tokens)),
        'avg_words_per_sentence': len(tokens) / len(sentence_list) if sentence_list else 0,
        'total_words': len(tokens),
        'total_sentences': len(sentence_list),
        'avg_sentences_per_paragraph': len(sentence_list) / len(paragraph_list) if paragraph_list else 0,
        'difficult_word_count': difficult_words(text),
        'type_token_ratio': lexicon_count(text, removepunct=True) / len(tokens) if tokens else 0,
        'total_characters': sum(len(token) for token in tokens),
        'total_syllables': total_syllables,
        'total_paragraphs': len(paragraph_list),
        'long_word_count': long_word_count
    }
    
    return sentence_metrics

# Applying the feature extraction to the dataset
train_data['sentence_metrics'] = train_data['essay'].apply(extract_sentence_features)
sentence_columns = ['avg_char_per_word', 'avg_syll_per_word', 'unique_word_count', 
                    'avg_words_per_sentence', 'total_words', 'total_sentences', 
                    'avg_sentences_per_paragraph', 'difficult_word_count', 
                    'type_token_ratio', 'total_characters', 'total_syllables', 
                    'total_paragraphs', 'long_word_count']

train_data[sentence_columns] = train_data['sentence_metrics'].apply(pd.Series)
train_data[sentence_columns].head()

Unnamed: 0,avg_char_per_word,avg_syll_per_word,unique_word_count,avg_words_per_sentence,total_words,total_sentences,avg_sentences_per_paragraph,difficult_word_count,type_token_ratio,total_characters,total_syllables,total_paragraphs,long_word_count
0,3.984456,1.19171,181.0,24.125,386.0,16.0,16.0,28.0,0.873057,1538.0,460.0,1.0,59.0
1,4.030172,1.297414,209.0,23.2,464.0,20.0,20.0,52.0,0.903017,1870.0,602.0,1.0,81.0
2,4.035144,1.306709,161.0,22.357143,313.0,14.0,14.0,39.0,0.891374,1263.0,409.0,1.0,52.0
3,4.328969,1.353519,267.0,22.62963,611.0,27.0,27.0,101.0,0.85761,2645.0,827.0,1.0,131.0
4,4.071567,1.317215,211.0,17.233333,517.0,30.0,30.0,50.0,0.89942,2105.0,681.0,1.0,87.0


### b) Sentence Vector Representation

In [24]:
!pip install gensim -q

In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
import numpy as np

def clean_and_tokenize(text):
    
    stop_words_set = set(stopwords.words('english'))
    sentence_list = sent_tokenize(text)
    cleaned_sentences = []
    
    for sentence in sentence_list:
        tokens = word_tokenize(sentence)
        filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words_set] # Stopword removal, Puntuantions removal, lower
        if filtered_tokens:
            cleaned_sentences.append(filtered_tokens)
    
    return cleaned_sentences

# Collecting all sentences from the essays
all_cleaned_sentences = []
for essay in train_data['essay']:
    all_cleaned_sentences.extend(clean_and_tokenize(essay))

# Training a Word2Vec model on the processed sentences
word2vec_model = Word2Vec(sentences=all_cleaned_sentences, 
                          vector_size=100, 
                          window=5, 
                          min_count=1, 
                          workers=4)

def generate_sentence_vectors(essay, model):
    """
    Converts sentences in an essay into vectors using the trained Word2Vec model.
    """
    processed_sentences = clean_and_tokenize(essay)
    sentence_vectors = []
    
    for sentence in processed_sentences:
        vector = np.mean([model.wv[word] for word in sentence if word in model.wv], axis=0)
        if not np.isnan(vector).any():
            sentence_vectors.append(vector)
    
    return sentence_vectors

def generate_essay_vector(essay, model):
    """
    Converts an entire essay into a single vector representation.
    """
    sentence_vectors = generate_sentence_vectors(essay, model)
    if sentence_vectors:
        essay_vector = np.mean(sentence_vectors, axis=0)
    else:
        essay_vector = np.zeros(model.vector_size)  # Use a zero vector as fallback
    
    return essay_vector

# Creating vector representations for each essay
essay_vectors = train_data['essay'].apply(lambda essay: generate_essay_vector(essay, word2vec_model)).apply(pd.Series)

# Adding dataframe
w2v_columns = [f"w2v_{i}" for i in range(1,101)]
train_data[w2v_columns] = essay_vectors.apply(pd.Series)
train_data[w2v_columns].head()

ImportError: cannot import name 'triu' from 'scipy.linalg' (c:\Users\ehsan\anaconda3\Lib\site-packages\scipy\linalg\__init__.py)

In [None]:
all_sentence_features = ['essay_id'] + readability_columns + sentence_columns + w2v_columns
df_sentence_features = train_data[all_sentence_features]
df_sentence_features.to_csv("../data/features/sentence_features.csv", index=False)
df_sentence_features.head()

Unnamed: 0,essay_id,flesch_kincaid,automated_readability,coleman_liau,lix_score,flesch_reading_ease,gunning_fog_index,smog_index,rix_score,dale_chall_score,...,w2v_91,w2v_92,w2v_93,w2v_94,w2v_95,w2v_96,w2v_97,w2v_98,w2v_99,w2v_100
0,1,9.7,11.3,8.54,42.32,65.56,10.42,10.7,3.93,7.0,...,0.337763,0.165934,0.23112,0.336043,0.838567,0.6802,0.139399,-0.266914,0.615034,-0.079027
1,2,9.1,10.1,7.95,41.53,67.08,10.31,11.8,4.1,7.28,...,0.161736,0.041253,0.21527,0.475975,0.807934,0.297432,0.196454,-0.095835,0.728809,-0.060283
2,3,9.9,9.9,8.3,39.97,59.74,10.54,12.3,3.79,7.62,...,0.358576,0.096212,0.112069,0.498785,0.96116,0.425794,0.35872,-0.23413,0.810724,-0.149724
3,4,10.9,12.0,10.97,46.12,51.78,10.97,12.4,4.89,8.34,...,0.184646,0.047956,0.045367,0.389533,0.653036,0.301427,0.300964,-0.167743,0.669368,-0.069912
4,5,8.2,7.7,8.0,35.93,64.2,8.01,11.1,2.93,6.51,...,0.221012,0.123638,0.139652,0.578484,0.850886,0.43058,0.120094,-0.25521,0.765925,0.002656
