In [None]:
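## Folder structure expected by this notebook:
# simple_feature_extraction.ipynb (this script)

# asap-aes (folder) -- only referenced by a commented-out read below
## training_set_rel3.tsv

# processed_data (folder)
## train_full_processed.csv

# supplementary_data (folder)
## Kuperman-BRM-data-2012.csv

# Output, written next to this notebook:
## scaled_feature_train_data.csv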

In [None]:
import os
import math
import time
from tqdm import tqdm
import random
import scipy
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import warnings
import textstat
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer


import pyphen
import syllapy
from spellchecker import SpellChecker

import spacy
#python3 -m spacy download en_core_web_md
from itertools import combinations


# import gingerit
# import languagetool_python

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD

In [None]:
# Register DataFrame/Series.progress_apply before any feature cell uses it.
tqdm.pandas()

# Essays come from the processed "full" training file.
# Alternative inputs that are currently disabled:
#raw_training_set = pd.read_csv('asap-aes/training_set_rel3.tsv',sep='\t', encoding='latin1')
#raw_training_set_chunked = pd.read_csv('processed_data/train_chunked_processed.csv', encoding='latin1')
#raw_training_set_original = pd.read_csv('processed_data/train_full_processed.csv', encoding='latin1')
raw_training_set = pd.read_csv(
    'processed_data/train_full_processed.csv',
    encoding='latin1',
)

# Age-of-acquisition ratings used by get_avg_AoA.
aoa_df = pd.read_csv('supplementary_data/Kuperman-BRM-data-2012.csv')

In [None]:
# Quick look at the loaded frame: dimensions, column names, then the first three rows.
print(raw_training_set.shape, raw_training_set.columns, sep='\n')
raw_training_set.head(3)

In [None]:
# Per-prompt summary of the rescaled score.
# The score column is selected after grouping and the group key is moved out of the
# index exactly once, so the result has flat columns
# (essay_set, count, mean, max, min, std) and no extra `index` column.
(
    raw_training_set
    .groupby('essay_set')['rescaled_score']
    .agg(['count', 'mean', 'max', 'min', 'std'])
    .reset_index()
)


In [None]:
# One boolean per column: True when the column holds at least one missing value.
# (Swap .any for .all to flag columns that are entirely missing.)
raw_training_set.isnull().any(axis='index')

In [None]:
# Keep the essay stored under index label 10 as a sample text and display it.
essay_str = raw_training_set.loc[10, 'essay']
essay_str

In [None]:
def get_sets(essay_str):
    """Split an essay into three consecutive chunks of words.

    The essay is split on single spaces. Empty pieces and pieces starting
    with '@' are dropped, and the remaining words are lower-cased. The first
    two chunks each hold ``len(words) // 3`` words; the third chunk holds
    whatever is left, so it may be up to two words longer.

    Returns a tuple ``(text_set1, text_set2, text_set3)`` of space-joined strings.
    """
    kept = [
        piece.lower()
        for piece in essay_str.split(' ')
        if piece and not piece.startswith('@')
    ]

    third = len(kept) // 3
    chunks = (kept[:third], kept[third:2 * third], kept[2 * third:])

    return tuple(' '.join(chunk) for chunk in chunks)


def remove_punctuations(x):
    """Strip every character that is neither a word character nor whitespace
    from each token in ``x``, and drop tokens that end up empty."""
    stripped = (re.sub(r'[^\w\s]', '', token) for token in x)
    return [token for token in stripped if token]

def get_avg_AoA(tokens):
    """Average the 'Rating.Mean' values of the tokens found in ``aoa_df``.

    Each token is matched exactly against the 'Word' column of the
    module-level ``aoa_df``; tokens without a match are ignored by the mean.
    Returns 0 when none of the tokens has a rating.
    """
    ratings = (
        pd.DataFrame({'token': tokens})
        .merge(aoa_df[['Word', 'Rating.Mean']], how='left', left_on='token', right_on='Word')
        ['Rating.Mean']
    )

    return 0 if ratings.isna().all() else ratings.mean()

def get_cohesion_score(text):
    """Return the mean pairwise cosine similarity of the word vectors in ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``, which
    must be loaded before this function is called. Tokens that are stop
    words, punctuation, or have no vector are ignored.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float
        Mean cosine similarity over every unordered pair of the remaining
        token vectors, or ``np.nan`` when fewer than three vectors remain.
    """
    doc = nlp(text)

    # keep one vector per content token
    vec_tokens = [token.vector for token in doc
                  if token.has_vector and not token.is_stop and not token.is_punct]

    # too few tokens to score (same threshold as before: at least three vectors)
    if len(vec_tokens) <= 2:
        return np.nan

    # Three or more vectors always produce at least three pairs, so no
    # "empty similarities" fallback is needed (the old one called np.nan(),
    # which would have raised TypeError had it ever been reached).
    # Norms are computed once per vector rather than once per pair.
    norms = [np.linalg.norm(vec) for vec in vec_tokens]
    similarities = [
        np.dot(vec_tokens[i], vec_tokens[j]) / (norms[i] * norms[j])
        for i, j in combinations(range(len(vec_tokens)), 2)
    ]
    return np.mean(similarities)



In [None]:
# Keep id, prompt and essay text for prompts 1, 3, 4, 5 and 6.
# To prototype faster, use the first 200 rows instead:
#text_training_df = raw_training_set[['essay_id', 'essay_set', 'essay']].head(200).copy()
text_training_df = raw_training_set.loc[
    raw_training_set['essay_set'].isin([1, 3, 4, 5, 6]),
    ['essay_id', 'essay_set', 'essay'],
].copy()

# Earlier variant (disabled): lower-case and drop words starting with @ without splitting
# text_training_df['text'] = text_training_df['essay'].apply(lambda x: pd.Series(get_text(x)))

# Split each essay into 3 sets (get_sets lower-cases and drops words starting with @) ...
text_training_df[['text_set1', 'text_set2', 'text_set3']] = (
    text_training_df['essay'].progress_apply(lambda essay: pd.Series(get_sets(essay)))
)

# ... then go from wide to long: one row per (essay, text_set), with the untouched
# essay kept under the label "text_original".
text_training_df = (
    text_training_df
    .rename(columns={'essay': 'text_original'})
    .melt(
        id_vars=['essay_id', 'essay_set'],
        value_vars=['text_set1', 'text_set2', 'text_set3', 'text_original'],
        var_name='text_set',
        value_name='text',
    )
)

# Tokenize into words and sentences; the "clean" word tokens have punctuation removed.
text_training_df['word_tokens'] = text_training_df['text'].progress_apply(word_tokenize)
text_training_df['sent_tokens'] = text_training_df['text'].progress_apply(sent_tokenize)
text_training_df['word_tokens_clean'] = text_training_df['word_tokens'].progress_apply(remove_punctuations)

text_training_df.head()

In [None]:
# Non-null counts of every column per prompt, after the melt.
# To inspect a single essay (e.g. id 1171) before and after the reshaping:
# raw_training_set[raw_training_set['essay_id'] == 1171]
# text_training_df[text_training_df['essay_id'] == 1171]
(
    text_training_df
    .groupby('essay_set', as_index=False)
    .count()
)


In [None]:
# Extract basic statistical features, one value per row of text_training_df
# (every essay contributes four rows: its three thirds plus the original text).
# (1) total number of words                -> word_count
# (2) total number of characters           -> char_count
# (3) average number of words per sentence -> sent_length
# (4) total number of sentences            -> sent_count
# (5) total number of paragraphs ---> not computed; paragraph breaks do not seem to be in the raw data
# (6) total number of spelling mistakes    -> spell_err_count
# (7) total number of grammar mistakes ---> skipped for now; multiple packages were tried without luck
#     (TextBlob and a transformer ran, but their results on test cases were wrong)
# (8) readability score                    -> FleKin_score
# The columns are added in this order, which is also the column order of the exported CSV.

start_time = time.time()

# word counts use the punctuation-stripped tokens; char_count is the length of the raw text
text_training_df['word_count'] = text_training_df['word_tokens_clean'].progress_apply(lambda x: len(x))
text_training_df['sent_count'] = text_training_df['sent_tokens'].progress_apply(lambda x: len(x))
text_training_df['char_count'] = text_training_df['text'].progress_apply(lambda x: len(x))
# rows with sent_count == 0 (empty text) do not get a finite value here
text_training_df['sent_length'] = text_training_df['word_count'] / text_training_df['sent_count']

spellcheck = SpellChecker()
# NOTE(review): this is the size of whatever spellcheck.unknown() returns -- presumably the set of
# distinct unrecognised words, so repeated misspellings would count once; confirm against pyspellchecker.
text_training_df['spell_err_count'] = text_training_df['word_tokens_clean'].progress_apply(lambda x: len(spellcheck.unknown(x)))
# despite the column name, this is the sum of syllapy.count over the tokens, i.e. a syllable count
text_training_df['syllabus_count'] = text_training_df['word_tokens_clean'].progress_apply(lambda x: sum(syllapy.count(word) for word in x))

# Disabled alternative 1: this is the Flesch-Kincaid *grade level* formula, a different metric from
# Flesch reading ease -- which is why its scores are very different from textstat.flesch_reading_ease
# and why it does not match the reading-ease formula on Wikipedia.
#text_training_df['FleKin_score'] = (0.39 * (text_training_df['word_count'] / text_training_df['sent_count'])) + (11.8 * (text_training_df['syllabus_count'] / text_training_df['word_count'])) - 15.59

# Disabled alternative 2: the Flesch reading-ease formula from Wikipedia; its results are close to
# those of textstat.flesch_reading_ease.
# text_training_df['FleKin_score'] = 206.835 - (1.015 * (text_training_df['word_count'] / text_training_df['sent_count'])) - (84.6 * (text_training_df['syllabus_count'] / text_training_df['word_count']))

# Version in use: note that the FleKin_score column therefore holds the Flesch reading-ease score.
text_training_df['FleKin_score'] = text_training_df['text'].progress_apply(lambda x: textstat.flesch_reading_ease(x))

# extras, not in the proposal
text_training_df['DalCha_score'] = text_training_df['text'].progress_apply(lambda x: textstat.dale_chall_readability_score(x))
# distinct tokens, compared case-insensitively
text_training_df['unique_word_count'] = text_training_df['word_tokens_clean'].progress_apply(lambda x: len(set(w.lower() for w in x)))

end_time = time.time()
execution_time = end_time - start_time
print(f"Time taken for basic statistical feature extraction: {execution_time:.5f} seconds")

text_training_df.head()


In [None]:
# https://github.com/mauryquijada/word-complexity-predictor/tree/master
# Supposed to be able to extract these features using machine learning techniques:
    # Lemma length
    # Average age-of-acquisition (at what age a word is most likely to enter someone's vocabulary)
    # Average concreteness (a score of 1 to 5, with 5 being very concrete)
    # Frequency in a certain corpus
    # Lemma frequency in a certain corpus
        
# text_training_df['text'].apply(lambda x: get_cohesion_score(x)).isna().any()

In [None]:
start_time = time.time() # this cell takes about 30 seconds, mostly on the cohesion score calculation
# To prototype on a subset, uncomment:
# text_training_df = text_training_df.head(100).copy()
# Content feature extraction:
## sentiment (4 metrics: neg, neu, pos, compound)
## cohesion score, calculated as cosine similarity
## Age of Acquisition score, mapped from the Kuperman dataset

analyzer = SentimentIntensityAnalyzer()
# get_cohesion_score reads this module-level `nlp`, so it has to be loaded before the
# cohesion column below is computed
nlp = spacy.load("en_core_web_md")

# The score values are assigned to the four columns by position, so this relies on
# polarity_scores() yielding them in neg / neu / pos / compound order.
text_training_df[['neg', 'neu', 'pos', 'compound']] = text_training_df['text'].progress_apply(lambda x: list(analyzer.polarity_scores(x).values())).apply(pd.Series)
# NaN for texts with fewer than three usable word vectors
text_training_df['cohesion'] = text_training_df['text'].progress_apply(lambda x: get_cohesion_score(x))
# 0 for texts in which no token has an age-of-acquisition rating
text_training_df['AoA_score'] = text_training_df['word_tokens_clean'].progress_apply(lambda x: get_avg_AoA(x))

end_time = time.time()
execution_time = end_time - start_time
print(f"Time taken for basic content feature extraction: {execution_time:.5f} seconds")

text_training_df.head()

In [None]:
# Attach the score columns of each essay to its feature rows (left join on essay_id),
# then write the combined table to disk without the index.
basic_feature_train = pd.merge(
    text_training_df,
    raw_training_set[['essay_id', 'rescaled_score', 'low_med_hi', 'low_med_hi_numeric']],
    how='left',
    on='essay_id',
)

basic_feature_train.to_csv('scaled_feature_train_data.csv', index=False)