In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the tab-separated training essays into a DataFrame, decoding the
# file as ISO-8859-1.
train_data = pd.read_csv(
    "data/essay_data/training_set_rel3.tsv",
    sep="\t",
    encoding='ISO-8859-1',
)
# Preview the first rows (shown as the cell output when run in a notebook).
train_data.head()

In [None]:
# Restrict the frame to the columns listed below, in this order; every
# other column loaded from the file is dropped.
required_columns = [
    'essay_id',
    'essay_set',
    'essay',
    'rater1_domain1',
    'rater2_domain1',
    'domain1_score',
]
train_data = train_data.loc[:, required_columns]

In [None]:
from nltk.corpus import stopwords

# English stopwords from the NLTK corpus. Despite the name this is a set,
# so the per-word membership test in extract_text_features is a hash lookup.
stopword_list = {*stopwords.words('english')}

def extract_text_features(text, *, stop_words=None, min_word_size=4,
                          max_word_size=6):
    """
    Computes various length-based features for a given essay text.

    Words are the whitespace-separated tokens of ``text``; punctuation stays
    attached to its token, so it counts toward word length and uniqueness.
    Sentences are the non-blank fragments left after splitting on '.', '!'
    and '?', so a trailing terminator or an ellipsis does not create phantom
    empty sentences and an empty essay has zero sentences.

    Args:
        text: The essay text. Any non-string value (for example the None/NaN
            of a missing cell) is treated as an empty essay.
        stop_words: Collection of lowercase stopwords used for
            ``filtered_word_count``. Defaults to the module-level
            ``stopword_list``.
        min_word_size: Words strictly shorter than this are counted as short.
        max_word_size: Words strictly longer than this are counted as long.

    Returns:
        A dict with the keys ``total_words``, ``distinct_word_count``,
        ``filtered_word_count``, ``avg_sentence_size``, ``avg_word_size``,
        ``total_sentences``, ``long_words`` and ``short_words``, in that
        order. Both averages are 0 when their denominator is 0.
    """
    if not isinstance(text, str):
        text = ''
    if stop_words is None:
        # Resolved at call time so the default follows the module-level set.
        stop_words = stopword_list

    # Splitting the text into words and sentences
    word_list = text.split()
    # Map every terminator to '.' so a single split handles all three, then
    # drop blank fragments (trailing terminator, "...", "?!", empty text).
    unified = text.replace('!', '.').replace('?', '.')
    sentence_list = [part for part in unified.split('.') if part.strip()]

    # Counting words and sentences
    total_words = len(word_list)
    total_sentences = len(sentence_list)

    # Word lengths are needed for the average and both size buckets.
    word_lengths = [len(word) for word in word_list]

    # Computing average lengths (guarded against empty input)
    avg_word_size = sum(word_lengths) / total_words if total_words else 0
    avg_sentence_size = total_words / total_sentences if total_sentences else 0

    # Categorizing words by length
    long_words = sum(1 for size in word_lengths if size > max_word_size)
    short_words = sum(1 for size in word_lengths if size < min_word_size)

    # Identifying unique words (case-sensitive) and non-stopwords
    distinct_words = set(word_list)
    filtered_words = [word for word in word_list
                      if word.lower() not in stop_words]

    # Summarizing extracted features; key order is part of the contract
    # because callers expand this dict into columns.
    return {
        'total_words': total_words,
        'distinct_word_count': len(distinct_words),
        'filtered_word_count': len(filtered_words),
        'avg_sentence_size': avg_sentence_size,
        'avg_word_size': avg_word_size,
        'total_sentences': total_sentences,
        'long_words': long_words,
        'short_words': short_words,
    }

# Applying Feature Extraction to Essays
# Each essay is mapped to the feature dict returned by extract_text_features;
# the raw dicts are kept in the 'text_features' column.
train_data['text_features'] = train_data['essay'].apply(extract_text_features)

# Expanding Feature Dictionary into Separate Columns
feature_columns = ['total_words', 'distinct_word_count', 'filtered_word_count', 
                   'avg_sentence_size', 'avg_word_size', 'total_sentences', 
                   'long_words', 'short_words']

# Build the feature frame in a single constructor call instead of
# `.apply(pd.Series)`, which creates one Series per row (slow on large frames)
# and upcasts the integer counts to float. Passing `columns=` selects the dict
# keys by name, so the result no longer depends on the dict's key order, and
# `index=` keeps the rows aligned with train_data.
train_data[feature_columns] = pd.DataFrame(
    train_data['text_features'].tolist(),
    index=train_data.index,
    columns=feature_columns,
)

# Displaying Extracted Features
train_data[feature_columns].head()