In [3]:
import numpy as np
import pandas as pd

In [7]:
# Loading Training Data
train_data = pd.read_csv("../data/raw/training_set_rel3.tsv", delimiter="\t", encoding='ISO-8859-1')
train_data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [9]:
# Selecting Relevant Columns
required_columns = ['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'domain1_score']
train_data = train_data[required_columns]

In [19]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Stopwords in English
stopword_list = set(stopwords.words('english'))

def extract_text_features(text):
    """
    Computes various length-based features for a given essay text.
    """
    # Splitting the text into words and sentences
    sentence_list = text.split('.')
    word_list = text.split()
    
    # Counting words and sentences
    total_words = len(word_list)
    total_sentences = len(sentence_list)
    
    # Computing average lengths
    avg_word_size = sum(len(word) for word in word_list) / total_words if total_words else 0
    avg_sentence_size = total_words / total_sentences if total_sentences else 0
    
    # Categorizing words by length
    min_word_size = 4  # Words shorter than this are considered short
    max_word_size = 6  # Words longer than this are considered long
    long_words = sum(1 for word in word_list if len(word) > max_word_size)
    short_words = sum(1 for word in word_list if len(word) < min_word_size)
    
    # Identifying unique words and non-stopwords
    distinct_words = set(word_list)
    filtered_words = [word for word in word_list if word.lower() not in stopword_list]
    
    # Summarizing extracted features
    feature_dict = {
        'total_words': total_words,
        'distinct_word_count': len(distinct_words),
        'filtered_word_count': len(filtered_words),
        'avg_sentence_size': avg_sentence_size,
        'avg_word_size': avg_word_size,
        'total_sentences': total_sentences,
        'long_words': long_words,
        'short_words': short_words
    }
    
    return feature_dict

# Applying Feature Extraction to Essays
train_data['text_features'] = train_data['essay'].apply(extract_text_features)

# Expanding Feature Dictionary into Separate Columns
feature_columns = ['total_words', 'distinct_word_count', 'filtered_word_count', 
                   'avg_sentence_size', 'avg_word_size', 'total_sentences', 
                   'long_words', 'short_words']

train_data[feature_columns] = train_data['text_features'].apply(pd.Series)

# Displaying Extracted Features
train_data[feature_columns].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ATAUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Unnamed: 0,total_words,distinct_word_count,filtered_word_count,avg_sentence_size,avg_word_size,total_sentences,long_words,short_words
0,338.0,184.0,170.0,30.727273,4.550296,11.0,67.0,138.0
1,419.0,216.0,230.0,22.052632,4.463007,19.0,86.0,169.0
2,279.0,167.0,139.0,18.6,4.526882,15.0,56.0,119.0
3,524.0,275.0,302.0,20.96,5.041985,25.0,140.0,182.0
4,465.0,226.0,229.0,15.0,4.526882,31.0,95.0,192.0
