In [1]:
!pip install syllapy
!pip install ety



## Imports

In [2]:
from collections import Counter

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import nltk
import syllapy
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import ety
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from transformers import (
    BertTokenizer,
    TFBertModel,
    XLMRobertaTokenizer,
    TFXLMRobertaModel,
)
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import (
    Embedding,
    Bidirectional,
    LSTM,
    Dense,
    Dropout,
    Input,
    Flatten,
    concatenate,
    BatchNormalization,
)
from keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model

# Enable the `progress_apply` method used later when processing excerpts.
tqdm.pandas()

# Download the NLTK data packages this notebook relies on.
for nltk_resource in ("wordnet", "punkt", "averaged_perceptron_tagger"):
    nltk.download(nltk_resource)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\focus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\focus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\focus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Read and Process Data

Below, I pre-process the text data along with performing feature extraction.  The following code takes text input, performs various operations on it, and returns a set of linguistic statistics and processed data.

The following code:

- Converts the input text to lowercase
- Tokenizes the text into sentences using the Natural Language Toolkit (`nltk`), and each sentence is further tokenized into words. POS tags are assigned to each word.
- For each token in the text (words and punctuation marks, as produced by the word tokenizer), the code does the following:
  - Determines the word's origin language (etymology).
  - Retrieves the full name of the POS tag from the `pos_mapping`.
  - Counts the number of syllables in the word.
  - Calculates the length of the word.
- Calculates various linguistic statistics, such as the mean syllable count, the number of sentences, the mean sentence length, the mean word length, and the total number of words.

In [3]:
# Location of the CLEAR corpus CSV, relative to the notebook's working directory.
corpus_csv_path = "./kaggle/input/clear-corpus-6-01-clear-corpus-6-01/CLEAR Corpus 6.01 - CLEAR Corpus 6.01.csv"

# Load the corpus; the "Excerpt" and "BT Easiness" columns are used below.
df = pd.read_csv(corpus_csv_path)

In [4]:
# Human-readable names for the POS tag codes returned by `nltk.pos_tag`.
# `process_text` looks tags up with `pos_mapping.get(pos, pos)`, so any tag
# that is missing from this table (presumably punctuation/symbol tags —
# TODO confirm) is kept under its raw tag code.
pos_mapping = {
    "CC": "Coordinating Conjunction",
    "CD": "Cardinal Digit",
    "DT": "Determiner",
    "EX": "Existential There",
    "FW": "Foreign Word",
    "IN": "Preposition or Subordinating Conjunction",
    "JJ": "Adjective",
    "JJR": "Adjective, Comparative",
    "JJS": "Adjective, Superlative",
    "LS": "List Item Marker",
    "MD": "Modal",
    "NN": "Noun, Singular or Mass",
    "NNS": "Noun, Plural",
    "NNP": "Proper Noun, Singular",
    "NNPS": "Proper Noun, Plural",
    "PDT": "Predeterminer",
    "POS": "Possessive Ending",
    "PRP": "Personal Pronoun",
    "PRP$": "Possessive Pronoun",
    "RB": "Adverb",
    "RBR": "Adverb, Comparative",
    "RBS": "Adverb, Superlative",
    "RP": "Particle",
    "TO": "to",
    "UH": "Interjection",
    "VB": "Verb, Base Form",
    "VBD": "Verb, Past Tense",
    "VBG": "Verb, Gerund or Present Participle",
    "VBN": "Verb, Past Participle",
    "VBP": "Verb, Non-3rd Person Singular Present",
    "VBZ": "Verb, 3rd Person Singular Present",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive Wh-pronoun",
    "WRB": "Wh-adverb",
}


# Memo of per-token lookups: token -> (origin language name, syllable count).
# Both lookups depend only on the token string, so repeated words across the
# corpus reuse the stored result instead of querying `ety`/`syllapy` again.
_TOKEN_FEATURE_CACHE = {}


def _token_features(token):
    """Return ``(origin_language_name, syllable_count)`` for one token, memoized."""
    cached = _TOKEN_FEATURE_CACHE.get(token)
    if cached is None:
        origins = ety.origins(token)
        # Use the first listed origin; tokens with no known origin get "unknown".
        origin_name = origins[0].language.name if origins else "unknown"
        cached = (origin_name, syllapy.count(token))
        _TOKEN_FEATURE_CACHE[token] = cached
    return cached


def _mean_or_zero(values):
    """Mean of ``values``, or 0.0 for an empty list (avoids NaN + RuntimeWarning)."""
    return np.mean(values) if values else 0.0


def process_text(text):
    """Extract per-token linguistic features and summary statistics from a text.

    The text is lower-cased, split into sentences, and each sentence is
    tokenized and POS-tagged. Every token (as produced by the tokenizer)
    contributes an origin language, a POS name, a syllable count and a length.

    Parameters
    ----------
    text : str
        The excerpt to analyse. ``None`` or a float NaN (e.g. a missing CSV
        cell) is treated as an empty string.

    Returns
    -------
    tuple
        ``(word_origins, origin_counts, word_pos, pos_counts, syllable_counts,
        mean_syllable_count, num_sentences, mean_sentence_length,
        mean_word_length, num_words, processed_excerpt)`` where

        - ``word_origins`` / ``word_pos`` / ``syllable_counts`` are per-token lists,
        - ``origin_counts`` / ``pos_counts`` are ``Counter`` objects over those lists,
        - the ``mean_*`` values are 0.0 when the text contains no tokens,
        - ``num_words`` is the total token count across all sentences,
        - ``processed_excerpt`` is the lower-cased text.
    """
    # Missing values would otherwise fail on `.lower()`; treat them as empty text.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""
    text = text.lower()

    word_origins = []
    word_pos = []
    syllable_counts = []
    sentence_lengths = []
    word_lengths = []

    sentences = sent_tokenize(text)

    for sentence in sentences:
        pos_tags = nltk.pos_tag(word_tokenize(sentence))
        sentence_lengths.append(len(pos_tags))
        for token, pos in pos_tags:
            origin, syllables = _token_features(token)
            word_origins.append(origin)
            # Tags absent from pos_mapping are kept under their raw tag code.
            word_pos.append(pos_mapping.get(pos, pos))
            syllable_counts.append(syllables)
            word_lengths.append(len(token))

    processed_excerpt = text
    origin_counts = Counter(word_origins)
    pos_counts = Counter(word_pos)
    mean_syllable_count = _mean_or_zero(syllable_counts)
    num_sentences = len(sentences)
    mean_sentence_length = _mean_or_zero(sentence_lengths)
    # Built-in sum keeps this an int even when there are no sentences.
    num_words = sum(sentence_lengths)
    mean_word_length = _mean_or_zero(word_lengths)

    return (
        word_origins,
        origin_counts,
        word_pos,
        pos_counts,
        syllable_counts,
        mean_syllable_count,
        num_sentences,
        mean_sentence_length,
        mean_word_length,
        num_words,
        processed_excerpt,
    )

In [5]:
# Names for the new columns, in the same order as the tuple that
# `process_text` returns.
engineered_columns = [
    "word_origins",
    "word_origin_counts",
    "pos",
    "pos_counts",
    "syllable_counts",
    "mean_syllable_count",
    "num_sentences",
    "mean_sentence_length",
    "mean_word_length",
    "num_words",
    "processed_excerpt",
]

# Run the feature extraction on every excerpt (with a progress bar) and
# expand each returned tuple into the columns above.
df[engineered_columns] = df["Excerpt"].progress_apply(
    lambda excerpt: pd.Series(process_text(excerpt))
)

100%|██████████| 4724/4724 [05:39<00:00, 13.92it/s]


## Bi-directional LSTM Model

Below, I prepare text data for the machine learning model.

- **Extract Engineered Features:** I extract the sentence features, Parts of Speech counts, and language origin counts engineered above.
- **Fit Tokenizer:** The Tokenizer is trained on preprocessed text excerpts. This step helps the Tokenizer build a vocabulary and assign a unique integer index to each word in the text.
- **Convert Text to Sequences:** The text sequences in the DataFrame are converted into sequences of integers using the Tokenizer.
- **Pad Sequences:** To ensure all sequences have the same length, I pad them with zeros to a maximum length.
- **Label Creation:** Labels for the data are created from the "BT Easiness" column.
- **Data Splitting:** The data is split into training and testing sets. I allocate 20% of the data for testing.
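- **Early Stopping:** I define early stopping criteria for model training: training stops once the validation loss has not improved for 5 consecutive epochs, and the best weights are restored.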

In [6]:
# Hyperparameters shared by the tokenizer, padding and model definition.
max_sequence_length = 300
max_words = 50000
dropout_rate = 0.25

# Start from the scalar per-excerpt statistics...
sentence_features = df[
    [
        "mean_syllable_count",
        "num_sentences",
        "mean_sentence_length",
        "mean_word_length",
    ]
]

# ...then append one column per POS name and one per origin language.
# Each Counter becomes a row; categories absent from an excerpt are filled with 0.
for counter_column in ("pos_counts", "word_origin_counts"):
    expanded_counts = pd.DataFrame(df[counter_column].tolist()).fillna(0)
    sentence_features = sentence_features.join(expanded_counts)

# Integer-encode the lower-cased excerpts and pad them to a fixed length.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df["processed_excerpt"])
sequences = tokenizer.texts_to_sequences(df["processed_excerpt"])
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Regression target.
labels = np.array(df["BT Easiness"])

# One call splits all three arrays consistently, holding out 20% for testing.
split_parts = train_test_split(
    data, sentence_features, labels, test_size=0.2, random_state=42, shuffle=True
)
(
    x_text_train,
    x_text_test,
    x_sentence_train,
    x_sentence_test,
    y_train,
    y_test,
) = split_parts

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

Define and train a model with text and continuous inputs

- **Model Definition:** 
  - Two input layers are defined: `text_input` for text data and `continuous_input` for the engineered feature data.
  - For the text input, 
    - I start with an Embedding layer.
    - A Bidirectional LSTM layer processes the embedded text data.
    - A LeakyReLU activation function is applied to the output of the LSTM layer.
    - A Dropout layer is added to reduce overfitting.
  - For the continuous engineered feature data, 
    - A BatchNormalization layer is applied, 
    - Then a Dense layer
    - ReLU activation
    - A Dropout layer
  - Both the processed text and continuous data are concatenated.
  - A Dense layer with 64 units and ReLU activation is applied to the concatenated features.
  - The final output layer is a Dense layer with 1 unit and a linear activation function for regression.
- **Model Training:** 
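  - The model is trained for up to 30 epochs with a batch size of 15; early stopping (monitored on the held-out test split, which is used as validation data) may end training sooner.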

In [7]:
# `max_sequence_length`, `max_words` and `dropout_rate` come from the
# data-preparation cell. They are deliberately not redefined here, so the
# model's input shapes cannot drift from the values used for padding/tokenizing.

# The number of engineered features depends on how many distinct POS names and
# origin languages the corpus produced, so read it from the data.
num_continuous_features = x_sentence_train.shape[1]

# --- Text branch: embedding -> BiLSTM -> LeakyReLU -> dropout ---
text_input = Input(name="text", shape=(max_sequence_length,))
text_embedding = Embedding(max_words, 128, input_length=max_sequence_length)(text_input)
text_lstm = Bidirectional(LSTM(64))(text_embedding)
text_leakyrelu = LeakyReLU(alpha=0.1)(text_lstm)
text_dropout = Dropout(dropout_rate)(text_leakyrelu)

# --- Engineered-feature branch: batch norm -> dense(ReLU) -> dropout ---
continuous_input = Input(name="sentence", shape=(num_continuous_features,))
continuous_input_bn = BatchNormalization(name="continuous_batch_norm")(continuous_input)
continuous_input_dense = Dense(32, activation="relu", name="continuous_dense")(
    continuous_input_bn
)
continuous_input_dropout = Dropout(0.2, name="continuous_dropout")(
    continuous_input_dense
)

# --- Merge both branches and regress a single value ---
merged = concatenate([text_dropout, continuous_input_dropout])
merged = Dense(64, activation="relu")(merged)
output = Dense(1, activation="linear")(merged)

model = Model(inputs=[text_input, continuous_input], outputs=output)

model.compile(loss="mean_squared_error", optimizer="adam")
model.summary()

# The dict keys must match the `name=` of the two Input layers above.
# NOTE(review): validation_data is the held-out test split, so early stopping
# is tuned on the same data that would be used for the final evaluation.
history = model.fit(
    {"text": x_text_train, "sentence": x_sentence_train},
    y_train,
    epochs=30,
    batch_size=15,
    validation_data=({"text": x_text_test, "sentence": x_sentence_test}, y_test),
    callbacks=[early_stopping],
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None, 300)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 300, 128)             6400000   ['text[0][0]']                
                                                                                                  
 sentence (InputLayer)       [(None, 118)]                0         []                            
                                                                                                  
 bidirectional (Bidirection  (None, 128)                  98816     ['embedding[0][0]']           
 al)                                                                                          

Plot the training and validation loss per epoch (the validation set here is the held-out test split).

In [None]:
# Per-epoch losses recorded by `model.fit`.
train_loss = history.history["loss"]
val_loss = history.history["val_loss"]

# Epochs are numbered from 1; early stopping may make this shorter than 30.
epochs = list(range(1, len(train_loss) + 1))

# `go` is plotly.graph_objects, imported in the imports cell.
fig = go.Figure()
for trace_name, loss_values, trace_color in (
    ("Training Loss", train_loss, "blue"),
    ("Validation Loss", val_loss, "red"),
):
    fig.add_trace(
        go.Scatter(
            x=epochs,
            y=loss_values,
            mode="lines",
            name=trace_name,
            line=dict(color=trace_color),
        )
    )
fig.update_layout(
    title="Training and Validation Loss",
    xaxis_title="Epochs",
    yaxis_title="Loss",
    showlegend=True,
    template="plotly_white",
)
fig.show()