# 0) Setting Environment

In [None]:
# # Installing Required Modules
# !pip install nltk
# !pip install textstat
# !pip install gensim

In [None]:
import nltk
import string
import gensim
import textstat
import numpy as np
import pandas as pd
from nltk import pos_tag
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from sklearn.decomposition import TruncatedSVD
from gensim.models import LdaModel, LsiModel, Doc2Vec
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\autom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\autom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\autom\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# 1) Loading Data

In [None]:
# Reading Files
df_train = pd.read_csv("data/essay_data/training_set_rel3.tsv", sep="\t", encoding='ISO-8859-1')

In [None]:
df_train.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [None]:
# Dropping Irrelavant Columns
columns_to_keep = ['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
                   'domain1_score']

df_train = df_train[columns_to_keep]

# 2) Feature Engineering

## 2.1) Vocabulary-level features

### 2.1.1) Length Based Features

In [None]:
# Set of English stopwords
stop_words = set(stopwords.words('english'))

def length_based_features(text):
    # Tokenize the text into sentences and words using string operations
    sentences = text.split('.')
    words = text.split()

    # Calculate word and sentence counts
    word_count = len(words)
    sentence_count = len(sentences)

    # Calculate average word and sentence lengths
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0
    avg_sentence_length = word_count / sentence_count if sentence_count else 0

    # Calculate counts of long and short words
    long_word_threshold = 6  # Example threshold for long words
    short_word_threshold = 4  # Example threshold for short words
    long_word_count = sum(1 for word in words if len(word) > long_word_threshold)
    short_word_count = sum(1 for word in words if len(word) < short_word_threshold)

    # Calculate unique token count and nostop count
    unique_tokens = set(words)
    nostop_words = [word for word in words if word.lower() not in set(stopwords.words('english'))]
    nostop_count = len(nostop_words)
    unique_token_count = len(unique_tokens)

    # Compile features into a dictionary
    features = {
        'word_count': word_count,
        'unique_token_count': unique_token_count,
        'nostop_count': nostop_count,
        'avg_sentence_length': avg_sentence_length,
        'avg_word_length': avg_word_length,
        'sentence_count': sentence_count,
        'long_word_count': long_word_count,
        'short_word_count': short_word_count
    }

    return features


# Applying Function and Extracting Features
df_train['length_features'] = df_train['essay'].apply(length_based_features)
df_train[['word_count', 'unique_token_count', 'nostop_count', 'avg_sentence_length', 'avg_word_length', 'sentence_count', 'long_word_count', 'short_word_count']] = df_train['length_features'].apply(pd.Series)
df_train[['word_count', 'unique_token_count', 'nostop_count', 'avg_sentence_length', 'avg_word_length', 'sentence_count', 'long_word_count', 'short_word_count']].head()

Unnamed: 0,word_count,unique_token_count,nostop_count,avg_sentence_length,avg_word_length,sentence_count,long_word_count,short_word_count
0,338.0,184.0,170.0,30.727273,4.550296,11.0,67.0,138.0
1,419.0,216.0,230.0,22.052632,4.463007,19.0,86.0,169.0
2,279.0,167.0,139.0,18.6,4.526882,15.0,56.0,119.0
3,524.0,275.0,302.0,20.96,5.041985,25.0,140.0,182.0
4,465.0,226.0,229.0,15.0,4.526882,31.0,95.0,192.0


### 2.1.2) Part of Speech Features

In [None]:
def calculate_pos_features(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    pos_counts = Counter(tag for word, tag in pos_tags)

    # Simplify POS tags to match the examples given
    pos_features = {
        'noun': pos_counts['NN'] + pos_counts['NNS'],
        'adj': pos_counts['JJ'],
        'pron': pos_counts['PRP'] + pos_counts['PRP$'],
        'verb': pos_counts['VB'] + pos_counts['VBD'] + pos_counts['VBG'] + pos_counts['VBN'] + pos_counts['VBP'] + pos_counts['VBZ'],
        'cconj': pos_counts['CC'],
        'adv': pos_counts['RB'],
        'det': pos_counts['DT'],
        'propn': pos_counts['NNP'] + pos_counts['NNPS'],
        'num': pos_counts['CD'],
        'intj': pos_counts['UH'],
        # 'ner count' is not possible to calculate with just POS tags. It requires Named Entity Recognition (NER)
    }

    return pos_features

# Applying Function and Extracting Features
df_train['pos_features'] = df_train['essay'].apply(calculate_pos_features)
df_train[['noun', 'adj', 'pron', 'verb', 'cconj', 'adv', 'det', 'propn', 'num', 'intj']] = df_train['pos_features'].apply(pd.Series)
df_train[['noun', 'adj', 'pron', 'verb', 'cconj', 'adv', 'det', 'propn', 'num', 'intj']].head()

Unnamed: 0,noun,adj,pron,verb,cconj,adv,det,propn,num,intj
0,72,20,41,68,14,21,20,12,0,0
1,96,19,42,85,18,17,35,18,4,0
2,72,15,16,53,17,11,27,14,2,0
3,126,42,23,100,18,26,43,71,0,0
4,107,23,28,87,15,34,54,9,5,0


### 2.1.3) Punctuation Features

1.   List item
2.   List item



In [None]:
# Function to calculate punctuation-based features
def punctuation_features(text):
    # Count occurrences of each punctuation mark
    period_count = text.count('.')
    comma_count = text.count(',')
    question_mark_count = text.count('?')
    exclamation_mark_count = text.count('!')
    quotation_mark_count = text.count('"') + text.count("'")  # Counting both double and single quotes
    colon_count = text.count(':')
    semicolon_count = text.count(';')
    parentheses_count = text.count('(') + text.count(')')  # Each '(' has a matching ')'
    hyphen_count = text.count('-')
    ellipsis_count = text.count('...')  # Treating ellipsis as a sequence of three periods

    # Compiling features into a dictionary
    features = {
        'period_count': period_count,
        'comma_count': comma_count,
        'question_mark_count': question_mark_count,
        'exclamation_mark_count': exclamation_mark_count,
        'colon_count': colon_count,
        'semicolon_count': semicolon_count,
        'parentheses_count': parentheses_count // 2,  # Dividing by 2 to count pairs
        'hyphen_count': hyphen_count,
        'ellipsis_count': ellipsis_count
    }

    return features

# Applying Function and Extracting Features
df_train['punctuation_features'] = df_train['essay'].apply(punctuation_features)
df_train[['period_count', 'comma_count', 'question_mark_count', 'exclamation_mark_count', 'colon_count', 'semicolon_count', 'parentheses_count', 'hyphen_count', 'ellipsis_count']] = df_train['punctuation_features'].apply(pd.Series)
df_train[['period_count', 'comma_count', 'question_mark_count', 'exclamation_mark_count', 'colon_count', 'semicolon_count', 'parentheses_count', 'hyphen_count', 'ellipsis_count']].head()

Unnamed: 0,period_count,comma_count,question_mark_count,exclamation_mark_count,colon_count,semicolon_count,parentheses_count,hyphen_count,ellipsis_count
0,10,18,2,4,1,0,1,2,0
1,18,12,1,1,0,0,0,1,0
2,14,9,0,0,0,0,0,0,0
3,24,13,1,2,0,0,0,3,0
4,30,13,0,0,0,0,0,2,0


In [None]:
vocab_level_features = ['word_count', 'unique_token_count', 'nostop_count', 'avg_sentence_length', 'avg_word_length', 'sentence_count', 'long_word_count', 'short_word_count', 'noun', 'adj', 'pron', 'verb', 'cconj', 'adv', 'det', 'propn', 'num', 'intj', 'period_count', 'comma_count', 'question_mark_count', 'exclamation_mark_count', 'colon_count', 'semicolon_count', 'parentheses_count', 'hyphen_count', 'ellipsis_count']
vocab_level_features_df = df_train[vocab_level_features]
vocab_level_features_df.to_csv("data/vocab_level_features_df.csv", index=False)
vocab_level_features_df.head()

Unnamed: 0,word_count,unique_token_count,nostop_count,avg_sentence_length,avg_word_length,sentence_count,long_word_count,short_word_count,noun,adj,...,intj,period_count,comma_count,question_mark_count,exclamation_mark_count,colon_count,semicolon_count,parentheses_count,hyphen_count,ellipsis_count
0,338.0,184.0,170.0,30.727273,4.550296,11.0,67.0,138.0,72,20,...,0,10,18,2,4,1,0,1,2,0
1,419.0,216.0,230.0,22.052632,4.463007,19.0,86.0,169.0,96,19,...,0,18,12,1,1,0,0,0,1,0
2,279.0,167.0,139.0,18.6,4.526882,15.0,56.0,119.0,72,15,...,0,14,9,0,0,0,0,0,0,0
3,524.0,275.0,302.0,20.96,5.041985,25.0,140.0,182.0,126,42,...,0,24,13,1,2,0,0,0,3,0
4,465.0,226.0,229.0,15.0,4.526882,31.0,95.0,192.0,107,23,...,0,30,13,0,0,0,0,0,2,0


## 2.2) Sentence Level Features

### 2.2.1) Readability Features

#### 2.2.1.1) Readability Grades

In [None]:
def readability_features(text):
    features = {
        'Kincaid': textstat.flesch_kincaid_grade(text),
        'ARI': textstat.automated_readability_index(text),
        'Coleman_Liau': textstat.coleman_liau_index(text),
        'LIX': textstat.lix(text),
        'Flesch_Reading_Ease': textstat.flesch_reading_ease(text),
        'Gunning_Fog': textstat.gunning_fog(text),
        'SMOG': textstat.smog_index(text),
        'RIX': textstat.rix(text),
        'Dale_Chall': textstat.dale_chall_readability_score(text)
    }
    return features

# Applying Function and Extracting Features
df_train['readability_features'] = df_train['essay'].apply(readability_features)
df_train[['Kincaid', 'ARI', 'Coleman_Liau', 'LIX', 'Flesch_Reading_Ease', 'Gunning_Fog', 'SMOG', 'RIX', 'Dale_Chall']] = df_train['readability_features'].apply(pd.Series)
df_train[['Kincaid', 'ARI', 'Coleman_Liau', 'LIX', 'Flesch_Reading_Ease', 'Gunning_Fog', 'SMOG', 'RIX', 'Dale_Chall']].head()

Unnamed: 0,Kincaid,ARI,Coleman_Liau,LIX,Flesch_Reading_Ease,Gunning_Fog,SMOG,RIX,Dale_Chall
0,8.5,11.3,8.54,42.32,74.02,10.31,10.2,3.93,7.0
1,9.1,10.1,7.95,41.53,67.08,10.21,11.6,4.1,7.28
2,8.7,9.9,8.3,39.97,68.2,10.25,12.0,3.79,7.62
3,9.7,12.0,10.97,46.12,60.24,10.81,12.3,4.89,8.34
4,7.0,7.7,8.0,35.93,72.66,8.01,11.1,2.93,6.51


#### 2.2.1.2) Sentence Information Features

In [None]:
def sentence_info_features(text):
    words = word_tokenize(text)
    syllables = textstat.syllable_count(text)
    long_words = sum(1 for word in words if len(word) > 6)
    sentences = sent_tokenize(text)
    paragraphs = text.split('\n')

    features = {
        'characters_word': sum(len(word) for word in words) / len(words) if words else 0,
        'syll_word': syllables / len(words) if words else 0,
        'wordtypes': len(set(words)),
        'words_sentence': len(words) / len(sentences) if sentences else 0,
        'words': len(words),
        'sentences': len(sentences),
        'sentences_paragraph': len(sentences) / len(paragraphs) if paragraphs else 0,
        'complex_words': textstat.difficult_words(text),
        'type_token_ratio': textstat.lexicon_count(text, removepunct=True) / len(words) if words else 0,
        'characters': sum(len(word) for word in words),
        'syllables': syllables,
        'paragraphs': len(paragraphs),
        'long_words': long_words,
        'complex_dc': textstat.dale_chall_readability_score(text)
    }
    return features

# Applying Function and Extracting Features
df_train['sentence_info_features'] = df_train['essay'].apply(sentence_info_features)
df_train[['characters_word', 'syll_word', 'wordtypes', 'words_sentence', 'words', 'sentences', 'sentences_paragraph', 'complex_words', 'type_token_ratio', 'characters', 'syllables', 'paragraphs', 'long_words', 'complex_dc']] = df_train['sentence_info_features'].apply(pd.Series)
df_train[['characters_word', 'syll_word', 'wordtypes', 'words_sentence', 'words', 'sentences', 'sentences_paragraph', 'complex_words', 'type_token_ratio', 'characters', 'syllables', 'paragraphs', 'long_words', 'complex_dc']].head()

Unnamed: 0,characters_word,syll_word,wordtypes,words_sentence,words,sentences,sentences_paragraph,complex_words,type_token_ratio,characters,syllables,paragraphs,long_words,complex_dc
0,3.984456,1.15285,181.0,24.125,386.0,16.0,16.0,28.0,0.873057,1538.0,445.0,1.0,59.0,7.0
1,4.030172,1.260776,209.0,23.2,464.0,20.0,20.0,51.0,0.903017,1870.0,585.0,1.0,81.0,7.28
2,4.035144,1.27476,161.0,22.357143,313.0,14.0,14.0,37.0,0.891374,1263.0,399.0,1.0,52.0,7.62
3,4.328969,1.317512,267.0,22.62963,611.0,27.0,27.0,98.0,0.85761,2645.0,805.0,1.0,131.0,8.34
4,4.071567,1.276596,211.0,17.233333,517.0,30.0,30.0,47.0,0.89942,2105.0,660.0,1.0,87.0,6.51


#### 2.2.1.3) Word Usage Features

In [None]:
def word_usage_features(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)

    pos_counts = Counter(tag for word, tag in tagged_words)

    features = {
        'tobeverb': pos_counts['VB'],
        'auxverb': pos_counts['MD'],
        'conjunction': pos_counts['CC'],
        'pronoun': pos_counts['PRP'] + pos_counts['PRP$'],
        'preposition': pos_counts['IN'],
        'nominalization': sum(1 for word, tag in tagged_words if tag.startswith('NN') and len(word) > 6)
    }
    return features

# Applying Function and Extracting Features
df_train['word_usage_features'] = df_train['essay'].apply(word_usage_features)
df_train[['tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition', 'nominalization']] = df_train['word_usage_features'].apply(pd.Series)
df_train[['tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition', 'nominalization']].head()

Unnamed: 0,tobeverb,auxverb,conjunction,pronoun,preposition,nominalization
0,21,5,14,41,53,36
1,29,16,18,42,58,52
2,13,4,17,16,32,33
3,31,14,18,23,64,90
4,42,26,15,28,43,58


In [None]:
# Getting all Sent-level features
readability_features = ['Kincaid', 'ARI', 'Coleman_Liau', 'LIX', 'Flesch_Reading_Ease', 'Gunning_Fog', 'SMOG', 'RIX', 'Dale_Chall',
                               'characters_word', 'syll_word', 'wordtypes', 'words_sentence', 'words', 'sentences', 'sentences_paragraph',
                               'complex_words', 'type_token_ratio', 'characters', 'syllables', 'paragraphs', 'long_words', 'complex_dc',
                               'tobeverb', 'auxverb', 'conjunction', 'pronoun', 'preposition', 'nominalization']

readability_features_df = df_train[readability_features]
readability_features_df.to_csv("data/readability_features_df.csv", index=False)
readability_features_df.head()

Unnamed: 0,Kincaid,ARI,Coleman_Liau,LIX,Flesch_Reading_Ease,Gunning_Fog,SMOG,RIX,Dale_Chall,characters_word,...,syllables,paragraphs,long_words,complex_dc,tobeverb,auxverb,conjunction,pronoun,preposition,nominalization
0,8.5,11.3,8.54,42.32,74.02,10.31,10.2,3.93,7.0,3.984456,...,445.0,1.0,59.0,7.0,21,5,14,41,53,36
1,9.1,10.1,7.95,41.53,67.08,10.21,11.6,4.1,7.28,4.030172,...,585.0,1.0,81.0,7.28,29,16,18,42,58,52
2,8.7,9.9,8.3,39.97,68.2,10.25,12.0,3.79,7.62,4.035144,...,399.0,1.0,52.0,7.62,13,4,17,16,32,33
3,9.7,12.0,10.97,46.12,60.24,10.81,12.3,4.89,8.34,4.328969,...,805.0,1.0,131.0,8.34,31,14,18,23,64,90
4,7.0,7.7,8.0,35.93,72.66,8.01,11.1,2.93,6.51,4.071567,...,660.0,1.0,87.0,6.51,42,26,15,28,43,58


### 2.2.2) Sentence Vector Representation

In [None]:
# Preprocessing text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    sentences = sent_tokenize(text)
    processed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [w.lower() for w in words if w.isalpha() and w.lower() not in stop_words]
        if words:
            processed_sentences.append(words)
    return processed_sentences

all_sentences = []
for essay in df_train['essay']:
    all_sentences.extend(preprocess_text(essay))

In [None]:
# Training Word2Vec model
word2vec_model = Word2Vec(sentences=all_sentences,
                          vector_size=100,
                          window=5,
                          min_count=1,
                          workers=4)

In [None]:
# Generating Vectors
def essay_to_sentence_vectors(essay, model):
    processed_sentences = preprocess_text(essay)
    essay_sentence_vectors = []
    for sentence in processed_sentences:
        sentence_vector = np.mean([model.wv[word] for word in sentence if word in model.wv], axis=0)
        if not np.isnan(sentence_vector).any():
            essay_sentence_vectors.append(sentence_vector)
    return essay_sentence_vectors

def essay_to_vector(essay, model):
    sentence_vectors = essay_to_sentence_vectors(essay, model)
    if sentence_vectors:  # Ensure there's at least one sentence vector
        essay_vector = np.mean(sentence_vectors, axis=0)
    else:
        essay_vector = np.zeros(model.vector_size)  # Fallback to a zero vector
    return essay_vector

# Converting essay to an overall vector
sentence_vectors = df_train['essay'].apply(lambda essay: essay_to_vector(essay, word2vec_model))
np.save('data/sentence_vectors_representation.npy', sentence_vectors.to_numpy())