In [57]:
## Folder Structure:
# simple_feature_extraction.ipynb (this script)

# asap-aes (folder)
## training_set_rel3.tsv

# supplementary_data (folder)
## Kuperman-BRM-data-2012.csv

In [1]:
import os
import math
import time
import random
import scipy
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import warnings
import textstat
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer


import pyphen
import syllapy
from spellchecker import SpellChecker

import spacy
#python3 -m spacy download en_core_web_md
from itertools import combinations


# import gingerit
# import languagetool_python

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

[nltk_data] Downloading package punkt to /Users/yli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/yli/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yli/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# ASAP-AES essay table: tab-separated and read with latin1 encoding.
raw_training_set = pd.read_csv(
    'asap-aes/training_set_rel3.tsv',
    sep='\t',
    encoding='latin1',
)
# Age-of-acquisition lookup table (Kuperman data file); get_avg_AoA reads its
# 'Word' and 'Rating.Mean' columns.
aoa_df = pd.read_csv('supplementary_data/Kuperman-BRM-data-2012.csv')

In [3]:
# Quick sanity check of the loaded essay table: (rows, columns) first, then the column names.
print(raw_training_set.shape)
print(raw_training_set.columns)
# Uncomment to inspect the per-column dtypes as well.
#raw_training_set.dtypes

(12976, 28)
Index(['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'domain1_score', 'rater1_domain2', 'rater2_domain2',
       'domain2_score', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6'],
      dtype='object')


In [4]:
# Per-column flag: True where the column contains at least one missing value.
raw_training_set.isna().any(axis = 0)
# Alternative check: True only where every value in the column is missing.
#raw_training_set.isna().all(axis = 0)

essay_id          False
essay_set         False
essay             False
rater1_domain1    False
rater2_domain1    False
rater3_domain1     True
domain1_score     False
rater1_domain2     True
rater2_domain2     True
domain2_score      True
rater1_trait1      True
rater1_trait2      True
rater1_trait3      True
rater1_trait4      True
rater1_trait5      True
rater1_trait6      True
rater2_trait1      True
rater2_trait2      True
rater2_trait3      True
rater2_trait4      True
rater2_trait5      True
rater2_trait6      True
rater3_trait1      True
rater3_trait2      True
rater3_trait3      True
rater3_trait4      True
rater3_trait5      True
rater3_trait6      True
dtype: bool

In [5]:
# Pull one essay (row label 1) so the raw text can be eyeballed.
essay_str = raw_training_set.loc[1, 'essay']

#essay_str.split(' ')
essay_str

"Dear @CAPS1 @CAPS2, I believe that using computers will benefit us in many ways like talking and becoming friends will others through websites like facebook and mysace. Using computers can help us find coordibates, locations, and able ourselfs to millions of information. Also computers will benefit us by helping with jobs as in planning a house plan and typing a @NUM1 page report for one of our jobs in less than writing it. Now lets go into the wonder world of technology. Using a computer will help us in life by talking or making friends on line. Many people have myspace, facebooks, aim, these all benefit us by having conversations with one another. Many people believe computers are bad but how can you make friends if you can never talk to them? I am very fortunate for having a computer that can help with not only school work but my social life and how I make friends. Computers help us with finding our locations, coordibates and millions of information online. If we didn't go on the i

In [52]:
def get_sets(essay_str):
    """Split an essay into three consecutive chunks of words.

    The essay is split on single space characters only (other whitespace
    stays attached to its word). Empty pieces and pieces starting with '@'
    (e.g. the '@CAPS1' placeholders in the essays) are dropped, and the
    remaining words are lower-cased.

    The first two chunks each hold floor(n / 3) words; the third chunk takes
    whatever is left, so it absorbs the remainder.

    Returns:
        A 3-tuple of strings (text_set1, text_set2, text_set3), each chunk's
        words joined back together with single spaces.
    """
    kept = []
    for piece in essay_str.split(' '):
        # Skip the empty strings produced by repeated spaces, and @-placeholders.
        if len(piece) == 0 or piece[0] == '@':
            continue
        kept.append(piece.lower())

    third = math.floor(len(kept) / 3)
    two_thirds = third + third

    chunks = (kept[:third], kept[third:two_thirds], kept[two_thirds:])
    return tuple(' '.join(chunk) for chunk in chunks)


def remove_punctuations(x):
    """Strip punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character (\\w, which includes
    digits and underscore) nor whitespace is removed from each token.

    Args:
        x: iterable of string tokens.

    Returns:
        A list of the cleaned, non-empty tokens in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in x)
    return [cleaned for cleaned in stripped if cleaned]

def get_avg_AoA(tokens):
    """Average age-of-acquisition rating of the tokens found in `aoa_df`.

    Each token is lower-cased and looked up in the module-level `aoa_df`
    table ('Word' -> 'Rating.Mean') with a left merge, so repeated tokens
    are weighted by how often they occur. Tokens without a rating are
    ignored by the mean.

    Lower-casing matters for the un-split 'text_original' rows, whose tokens
    keep their original capitalisation and would otherwise never match.
    # NOTE(review): assumes the 'Word' column of aoa_df is lower-case — confirm
    # against the Kuperman file.

    Args:
        tokens: sequence of string tokens (punctuation already removed).

    Returns:
        The mean 'Rating.Mean' over the matched tokens, or 0 when `tokens`
        is empty or none of them has a rating.
    """
    # Nothing to look up; also avoids merging an empty, untyped column.
    if len(tokens) == 0:
        return 0

    tokens_df = pd.DataFrame({'token': [token.lower() for token in tokens]})
    tokens_df = pd.merge(tokens_df, aoa_df[['Word', 'Rating.Mean']],
                         left_on='token', right_on='Word', how='left')

    ratings = tokens_df['Rating.Mean']
    if ratings.isna().all():
        return 0
    return ratings.mean()

def get_cohesion_score(text):
    """Mean pairwise cosine similarity between the word vectors of `text`.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    that have no vector, are stop words, or are punctuation are skipped.
    Cosine similarity is then computed for every unordered pair of the
    remaining token vectors and averaged.

    Args:
        text: the string to score.

    Returns:
        The mean pairwise cosine similarity, or 0 when fewer than two usable
        token vectors remain (there are no pairs to compare in that case).
    """
    doc = nlp(text)

    # One vector per content token (size (300,) according to the original note).
    vec_tokens = [token.vector for token in doc
                  if token.has_vector and not token.is_stop and not token.is_punct]

    # With fewer than two vectors there are no pairs; averaging an empty list
    # would yield NaN (plus a RuntimeWarning), so return the neutral 0 instead.
    if len(vec_tokens) < 2:
        return 0

    # Each norm is needed for many pairs, so compute it once per vector.
    norms = [np.linalg.norm(vec) for vec in vec_tokens]

    # Cosine similarity for every combination of two vectors.
    similarities = [
        np.dot(vec_tokens[i], vec_tokens[j]) / (norms[i] * norms[j])
        for i, j in combinations(range(len(vec_tokens)), 2)
    ]
    return np.mean(similarities)

In [54]:
# Prototype on the first 200 essays only; switch to the commented line to process every essay.
#text_training_df = raw_training_set[['essay_id', 'essay_set', 'essay']].copy()
text_training_df = raw_training_set.loc[:, ['essay_id', 'essay_set', 'essay']].head(200).copy()

# Cut each essay into three word chunks (get_sets), then reshape to long format so that
# every essay contributes four rows: text_set1, text_set2, text_set3 and the untouched
# essay, labelled "text_original".
# NOTE(review): the three chunks are lower-cased with @-placeholders removed by get_sets,
# while text_original is the raw essay, so features are not directly comparable across
# the two kinds of rows.
text_training_df[['text_set1', 'text_set2', 'text_set3']] = text_training_df['essay'].apply(
    lambda essay: pd.Series(get_sets(essay))
)
text_training_df = (
    text_training_df
    .rename(columns={'essay': 'text_original'})
    .melt(
        id_vars=['essay_id', 'essay_set'],
        value_vars=['text_set1', 'text_set2', 'text_set3', 'text_original'],
        var_name='text_set',
        value_name='text',
    )
)

# Word and sentence tokens, plus a punctuation-free copy of the word tokens.
text_training_df['word_tokens'] = text_training_df['text'].apply(word_tokenize)
text_training_df['sent_tokens'] = text_training_df['text'].apply(sent_tokenize)
text_training_df['word_tokens_clean'] = text_training_df['word_tokens'].apply(remove_punctuations)

In [55]:
# Basic statistical features from the proposal:
# (1) total number of words
# (2) total number of characters
# (3) average number of words per sentence
# (4) total number of sentences
# (5) total number of paragraphs ---> skipped: paragraph breaks don't seem to be in the raw data
# (6) total number of spelling mistakes
# (7) total number of grammar mistakes ---> skipped for now: several packages were tried without luck;
#     TextBlob and a transformer ran, but got the test cases wrong
# plus a Flesch readability score

start_time = time.time()

# Counts are taken from the punctuation-free word tokens, the sentence tokens and the raw text.
text_training_df['word_count'] = text_training_df['word_tokens_clean'].apply(len)
text_training_df['sent_count'] = text_training_df['sent_tokens'].apply(len)
text_training_df['char_count'] = text_training_df['text'].apply(len)
text_training_df['sent_length'] = text_training_df['word_count'].div(text_training_df['sent_count'])

spellcheck = SpellChecker()
# NOTE(review): unknown() appears to return the set of distinct unrecognised words, so this
# would count unique misspellings rather than every occurrence — confirm against pyspellchecker.
text_training_df['spell_err_count'] = text_training_df['word_tokens_clean'].apply(
    lambda tokens: len(spellcheck.unknown(tokens))
)
# Column name kept as 'syllabus_count' (it holds the syllable count).
text_training_df['syllabus_count'] = text_training_df['word_tokens_clean'].apply(
    lambda tokens: sum(map(syllapy.count, tokens))
)

# Hand-written alternatives kept for reference:
#text_training_df['FleKin_score'] = (0.39 * (text_training_df['word_count'] / text_training_df['sent_count'])) + (11.8 * (text_training_df['syllabus_count'] / text_training_df['word_count'])) - 15.59
# the scores generated from this formula are very different from using flesch_reading_ease
# the formula is also different from wikipedia. Where did you get this?

# text_training_df['FleKin_score'] = 206.835 - (1.015 * (text_training_df['word_count'] / text_training_df['sent_count'])) - (84.6 * (text_training_df['syllabus_count'] / text_training_df['word_count']))
# this formula is from Wikipedia and close to the result of using flesch_reading_ease

# The stored 'FleKin_score' is whatever textstat.flesch_reading_ease returns for the text.
text_training_df['FleKin_score'] = text_training_df['text'].apply(textstat.flesch_reading_ease)

# extras, not in the proposal
text_training_df['DalCha_score'] = text_training_df['text'].apply(textstat.dale_chall_readability_score)
text_training_df['unique_word_count'] = text_training_df['word_tokens_clean'].apply(
    lambda tokens: len({w.lower() for w in tokens})
)

end_time = time.time()
execution_time = end_time - start_time
print(f"Time taken for basic statistical feature extraction: {execution_time:.5f} seconds")

text_training_df.head()


Execution time: 0.53421 seconds


Unnamed: 0,essay_id,essay_set,text_set,text,word_tokens,sent_tokens,word_tokens_clean,word_count,sent_count,char_count,sent_length,spell_err_count,syllabus_count,FleKin_score,DalCha_score,unique_word_count
0,1,1,text_set1,"dear local newspaper, i think effects computer...","[dear, local, newspaper, ,, i, think, effects,...","[dear local newspaper, i think effects compute...","[dear, local, newspaper, i, think, effects, co...",112,7,624,16.0,8,152,78.28,8.13,75
1,2,1,text_set1,dear i believe that using computers will benef...,"[dear, i, believe, that, using, computers, wil...",[dear i believe that using computers will bene...,"[dear, i, believe, that, using, computers, wil...",136,8,767,17.0,6,204,62.68,7.5,90
2,3,1,text_set1,"dear, more and more people use computers, but ...","[dear, ,, more, and, more, people, use, comput...","[dear, more and more people use computers, but...","[dear, more, and, more, people, use, computers...",91,7,519,13.0,2,136,75.3,7.61,70
3,4,1,text_set1,"dear local newspaper, i have found that many e...","[dear, local, newspaper, ,, i, have, found, th...","[dear local newspaper, i have found that many ...","[dear, local, newspaper, i, have, found, that,...",161,8,922,20.125,7,246,59.53,8.65,107
4,5,1,text_set1,dear i know having computers has a positive ef...,"[dear, i, know, having, computers, has, a, pos...",[dear i know having computers has a positive e...,"[dear, i, know, having, computers, has, a, pos...",153,9,823,17.0,2,225,71.14,6.85,90


In [8]:
# https://github.com/mauryquijada/word-complexity-predictor/tree/master
# Supposed to be able to extract these features using machine learning techniques:
    # Lemma length
    # Average age-of-acquisition (at what age a word is most likely to enter someone's vocabulary)
    # Average concreteness (a score of 1 to 5, with 5 being very concrete)
    # Frequency in a certain corpus
    # Lemma frequency in a certain corpus

In [56]:
start_time = time.time() # takes about 30 seconds, mostly on the cohesion score calculation

# Content features:
## sentiment (4 metrics: neg, neu, pos, compound)
## cohesion score, calculated as cosine similarity
## Age of Acquisition score, mapped from the Kuperman dataset

analyzer = SentimentIntensityAnalyzer()
# get_cohesion_score reads this module-level pipeline, so it must exist before that call below.
nlp = spacy.load("en_core_web_md")

# The four columns are filled positionally from the values of the polarity_scores dict.
text_training_df[['neg', 'neu', 'pos', 'compound']] = (
    text_training_df['text']
    .apply(lambda text: list(analyzer.polarity_scores(text).values()))
    .apply(pd.Series)
)
text_training_df['cohesion'] = text_training_df['text'].apply(get_cohesion_score)
text_training_df['AoA_score'] = text_training_df['word_tokens_clean'].apply(get_avg_AoA)

end_time = time.time()
execution_time = end_time - start_time
print(f"Time taken for basic content feature extraction: {execution_time:.5f} seconds")

text_training_df.head()

Time taken for basic content feature extraction: 34.51738 seconds


Unnamed: 0,essay_id,essay_set,text_set,text,word_tokens,sent_tokens,word_tokens_clean,word_count,sent_count,char_count,...,syllabus_count,FleKin_score,DalCha_score,unique_word_count,neg,neu,pos,compound,cohesion,AoA_score
0,1,1,text_set1,"dear local newspaper, i think effects computer...","[dear, local, newspaper, ,, i, think, effects,...","[dear local newspaper, i think effects compute...","[dear, local, newspaper, i, think, effects, co...",112,7,624,...,152,78.28,8.13,75,0.0,0.847,0.153,0.955,0.248577,5.925
1,2,1,text_set1,dear i believe that using computers will benef...,"[dear, i, believe, that, using, computers, wil...",[dear i believe that using computers will bene...,"[dear, i, believe, that, using, computers, wil...",136,8,767,...,204,62.68,7.5,90,0.015,0.787,0.198,0.9678,0.312173,6.014722
2,3,1,text_set1,"dear, more and more people use computers, but ...","[dear, ,, more, and, more, people, use, comput...","[dear, more and more people use computers, but...","[dear, more, and, more, people, use, computers...",91,7,519,...,136,75.3,7.61,70,0.016,0.67,0.314,0.9896,0.313247,5.671034
3,4,1,text_set1,"dear local newspaper, i have found that many e...","[dear, local, newspaper, ,, i, have, found, th...","[dear local newspaper, i have found that many ...","[dear, local, newspaper, i, have, found, that,...",161,8,922,...,246,59.53,8.65,107,0.024,0.762,0.214,0.9891,0.237326,6.17549
4,5,1,text_set1,dear i know having computers has a positive ef...,"[dear, i, know, having, computers, has, a, pos...",[dear i know having computers has a positive e...,"[dear, i, know, having, computers, has, a, pos...",153,9,823,...,225,71.14,6.85,90,0.0,0.816,0.184,0.9828,0.244275,5.864906


In [3]:
## TextBlob-based correction: it runs, but the corrections are poor (e.g. "This" -> "His" in the output below), so it is left disabled
# from textblob import TextBlob

# text = "This sentense have grammar issue."
# blob = TextBlob(text)
# corrected_text = str(blob.correct())

# print("Corrected Text:", corrected_text)

# # Counting grammar mistakes by comparing changes
# original_words = text.split()
# corrected_words = str(corrected_text).split()
# num_mistakes = sum(1 for orig, corr in zip(original_words, corrected_words) if orig != corr)
# print("Number of Grammar Mistakes:", num_mistakes)

Corrected Text: His sentence have grammar issue.
Number of Grammar Mistakes: 2
