In [8]:
import numpy as np    # Math operations and arrays
import pandas as pd   # To work with tables

In [9]:
# Loading Training Data
train_data = pd.read_csv("../data/raw/training_set_rel3.tsv", delimiter="\t", encoding='ISO-8859-1')
train_data.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


In [10]:
# Selecting Relevant Columns
required_columns = ['essay_id', 'essay_set', 'essay', 'rater1_domain1', 'rater2_domain1', 'domain1_score']
train_data = train_data[required_columns]

# Vocabulary Features

In [11]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Stopwords in English
stopword_list = set(stopwords.words('english'))

def extract_text_features(text):
    """
    Computes various length-based features for a given essay text.
    """
    # Splitting the text into words and sentences
    sentence_list = text.split('.')
    word_list = text.split()
    
    # Counting words and sentences
    total_words = len(word_list)
    total_sentences = len(sentence_list)
    
    # Computing average lengths
    avg_word_size = sum(len(word) for word in word_list) / total_words if total_words else 0
    avg_sentence_size = total_words / total_sentences if total_sentences else 0
    
    # Categorizing words by length
    min_word_size = 4  # Words shorter than this are considered short
    max_word_size = 6  # Words longer than this are considered long
    long_words = sum(1 for word in word_list if len(word) > max_word_size)
    short_words = sum(1 for word in word_list if len(word) < min_word_size)
    
    # Identifying unique words and non-stopwords
    distinct_words = set(word_list)
    filtered_words = [word for word in word_list if word.lower() not in stopword_list]
    
    # Summarizing extracted features
    feature_dict = {
        'total_words': total_words,
        'distinct_word_count': len(distinct_words),
        'filtered_word_count': len(filtered_words),
        'avg_sentence_size': avg_sentence_size,
        'avg_word_size': avg_word_size,
        'total_sentences': total_sentences,
        'long_words': long_words,
        'short_words': short_words
    }
    
    return feature_dict

# Applying Feature Extraction to Essays
train_data['text_features'] = train_data['essay'].apply(extract_text_features)

# Expanding Feature Dictionary into Separate Columns
feature_columns = ['total_words', 'distinct_word_count', 'filtered_word_count', 
                   'avg_sentence_size', 'avg_word_size', 'total_sentences', 
                   'long_words', 'short_words']

train_data[feature_columns] = train_data['text_features'].apply(pd.Series)
train_data[feature_columns].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,total_words,distinct_word_count,filtered_word_count,avg_sentence_size,avg_word_size,total_sentences,long_words,short_words
0,338.0,184.0,170.0,30.727273,4.550296,11.0,67.0,138.0
1,419.0,216.0,230.0,22.052632,4.463007,19.0,86.0,169.0
2,279.0,167.0,139.0,18.6,4.526882,15.0,56.0,119.0
3,524.0,275.0,302.0,20.96,5.041985,25.0,140.0,182.0
4,465.0,226.0,229.0,15.0,4.526882,31.0,95.0,192.0


In [15]:
import nltk 
nltk.data.path.append('../Dataset/nltk_data')

In [16]:
from collections import Counter
from nltk import word_tokenize, pos_tag
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def extract_pos_features(text):
    """
    Analyzes the Part-of-Speech (POS) distribution in a given text.
    """
    # Tokenizing and tagging words with their respective POS labels
    word_list = word_tokenize(text)
    tagged_words = pos_tag(word_list)
    
    # Counting occurrences of each POS category
    pos_frequencies = Counter(tag for _, tag in tagged_words)
    
    # Mapping specific POS categories to broader labels
    pos_summary = {
        'noun_count': pos_frequencies.get('NN', 0) + pos_frequencies.get('NNS', 0),
        'adjective_count': pos_frequencies.get('JJ', 0),
        'pronoun_count': pos_frequencies.get('PRP', 0) + pos_frequencies.get('PRP$', 0),
        'verb_count': sum(pos_frequencies.get(tag, 0) for tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']),
        'conjunction_count': pos_frequencies.get('CC', 0),
        'adverb_count': pos_frequencies.get('RB', 0),
        'determiner_count': pos_frequencies.get('DT', 0),
        'proper_noun_count': pos_frequencies.get('NNP', 0) + pos_frequencies.get('NNPS', 0),
        'numeral_count': pos_frequencies.get('CD', 0),
        'interjection_count': pos_frequencies.get('UH', 0)
    }
    
    return pos_summary

# Applying POS feature extraction to the dataset
train_data['pos_analysis'] = train_data['essay'].apply(extract_pos_features)

# Expanding the extracted POS data into separate columns
pos_columns = ['noun_count', 'adjective_count', 'pronoun_count', 'verb_count', 
               'conjunction_count', 'adverb_count', 'determiner_count', 
               'proper_noun_count', 'numeral_count', 'interjection_count']

train_data[pos_columns] = train_data['pos_analysis'].apply(pd.Series)
train_data[pos_columns].head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Error with downloaded zip file
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ehsan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\ehsan/nltk_data'
    - 'C:\\Users\\ehsan\\anaconda3\\nltk_data'
    - 'C:\\Users\\ehsan\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\ehsan\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\ehsan\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - '../Dataset/nltk_data'
    - ''
**********************************************************************


In [None]:
# Function to derive punctuation-based characteristics
def analyze_punctuation(text):

    # Counting different punctuation marks
    dot_count = text.count('.')
    comma_occurrences = text.count(',')
    question_marks = text.count('?')
    exclamations = text.count('!')
    quote_marks = text.count('"') + text.count("'")  # Including both single and double quotation marks
    colon_occurrences = text.count(':')
    semicolon_occurrences = text.count(';')
    bracket_pairs = text.count('(') + text.count(')')  # Counting both opening and closing brackets
    dash_count = text.count('-')
    triple_dots = text.count('...')  # Counting occurrences of ellipsis

    # Compiling extracted features into a structured dictionary
    punctuation_data = {
        'dot_count': dot_count,
        'comma_occurrences': comma_occurrences,
        'question_marks': question_marks,
        'exclamations': exclamations,
        'colon_occurrences': colon_occurrences,
        'semicolon_occurrences': semicolon_occurrences,
        'bracket_pairs': bracket_pairs // 2,  # Adjusting for paired brackets
        'dash_count': dash_count,
        'triple_dots': triple_dots
    }

    return punctuation_data

# Applying the punctuation feature extraction to the dataset
train_data['punctuation_analysis'] = train_data['essay'].apply(analyze_punctuation)

# Expanding extracted punctuation details into separate columns
punctuation_columns = ['dot_count', 'comma_occurrences', 'question_marks', 
                       'exclamations', 'colon_occurrences', 'semicolon_occurrences', 
                       'bracket_pairs', 'dash_count', 'triple_dots']

train_data[punctuation_columns] = train_data['punctuation_analysis'].apply(pd.Series)
train_data[punctuation_columns].head()

In [None]:
textual_features = ['total_words', 'distinct_word_count', 'filtered_word_count', 
                    'avg_sentence_size', 'avg_word_size', 'total_sentences', 
                    'long_words', 'short_words', 'noun_count', 'adjective_count', 
                    'pronoun_count', 'verb_count', 'conjunction_count', 
                    'adverb_count', 'determiner_count', 'proper_noun_count', 
                    'numeral_count', 'interjection_count', 'dot_count', 
                    'comma_occurrences', 'question_marks', 'exclamations', 
                    'colon_occurrences', 'semicolon_occurrences', 'bracket_pairs', 
                    'dash_count', 'triple_dots']

# Creating a DataFrame containing the selected feature set
textual_features_df = train_data[textual_features]
textual_features_df.head()


## Sentence Level Features

In [None]:
!pip install textstat -q

In [None]:
from textstat import flesch_kincaid_grade, automated_readability_index, coleman_liau_index
from textstat import lix, flesch_reading_ease, gunning_fog, smog_index, rix, dale_chall_readability_score

def compute_readability_metrics(text):
    
    readability_scores = {
        'flesch_kincaid': flesch_kincaid_grade(text),
        'automated_readability': automated_readability_index(text),
        'coleman_liau': coleman_liau_index(text),
        'lix_score': lix(text),
        'flesch_reading_ease': flesch_reading_ease(text),
        'gunning_fog_index': gunning_fog(text),
        'smog_index': smog_index(text),
        'rix_score': rix(text),
        'dale_chall_score': dale_chall_readability_score(text)
    }
    return readability_scores

# Applying the readability analysis to essays
train_data['readability_metrics'] = train_data['essay'].apply(compute_readability_metrics)

# Expanding extracted readability scores into separate columns
readability_columns = ['flesch_kincaid', 'automated_readability', 'coleman_liau', 
                       'lix_score', 'flesch_reading_ease', 'gunning_fog_index', 
                       'smog_index', 'rix_score', 'dale_chall_score']

train_data[readability_columns] = train_data['readability_metrics'].apply(pd.Series)

# Displaying the first few records of readability analysis
train_data[readability_columns].head()


In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from textstat import syllable_count, difficult_words, lexicon_count, dale_chall_readability_score

def extract_sentence_features(text):

    # Tokenize words and sentences
    tokens = word_tokenize(text)
    total_syllables = syllable_count(text)
    long_word_count = sum(1 for token in tokens if len(token) > 6)
    sentence_list = sent_tokenize(text)
    paragraph_list = text.split('\n')

    # Extracting various metrics related to sentence structure and complexity
    sentence_metrics = {
        'avg_char_per_word': sum(len(token) for token in tokens) / len(tokens) if tokens else 0,
        'avg_syll_per_word': total_syllables / len(tokens) if tokens else 0,
        'unique_word_count': len(set(tokens)),
        'avg_words_per_sentence': len(tokens) / len(sentence_list) if sentence_list else 0,
        'total_words': len(tokens),
        'total_sentences': len(sentence_list),
        'avg_sentences_per_paragraph': len(sentence_list) / len(paragraph_list) if paragraph_list else 0,
        'difficult_word_count': difficult_words(text),
        'type_token_ratio': lexicon_count(text, removepunct=True) / len(tokens) if tokens else 0,
        'total_characters': sum(len(token) for token in tokens),
        'total_syllables': total_syllables,
        'total_paragraphs': len(paragraph_list),
        'long_word_count': long_word_count,
        'dale_chall_score': dale_chall_readability_score(text)
    }
    
    return sentence_metrics

# Applying the feature extraction to the dataset
train_data['sentence_metrics'] = train_data['essay'].apply(extract_sentence_features)

# Expanding the computed metrics into individual columns
sentence_columns = ['avg_char_per_word', 'avg_syll_per_word', 'unique_word_count', 
                    'avg_words_per_sentence', 'total_words', 'total_sentences', 
                    'avg_sentences_per_paragraph', 'difficult_word_count', 
                    'type_token_ratio', 'total_characters', 'total_syllables', 
                    'total_paragraphs', 'long_word_count', 'dale_chall_score']

train_data[sentence_columns] = train_data['sentence_metrics'].apply(pd.Series)

train_data[sentence_columns].head()

In [None]:

combined_features = ['flesch_kincaid', 'automated_readability', 'coleman_liau', 
                       'lix_score', 'flesch_reading_ease', 'gunning_fog_index', 
                       'smog_index', 'rix_score', 'dale_chall_score', 'avg_char_per_word', 'avg_syll_per_word', 'unique_word_count', 
                    'avg_words_per_sentence', 'total_words', 'total_sentences', 
                    'avg_sentences_per_paragraph', 'difficult_word_count', 
                    'type_token_ratio', 'total_characters', 'total_syllables', 
                    'total_paragraphs', 'long_word_count', 'dale_chall_score']

# Creating a DataFrame containing the combined feature set
combined_features_df = train_data[combined_features]
combined_features_df.head()
