In [1]:
!pip install syllapy
!pip install ety



## Imports

In [2]:
# Standard library
from collections import Counter

# Data manipulation and analysis libraries
import pandas as pd
import numpy as np

# Plotting and visualization library
import plotly.graph_objects as go

# Natural Language Processing (NLP) libraries
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize

# Phonetic analysis library
import syllapy

# Etymology analysis library
import ety

# Progress bar for loops
from tqdm import tqdm

# Deep Learning and Machine Learning libraries
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from keras.models import Model
from tensorflow.keras.layers import (
    LSTM,
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    LeakyReLU,
)
from tensorflow.keras.models import Sequential

# Register `progress_apply` on pandas objects so long-running applies show a
# tqdm progress bar.
tqdm.pandas()

# Set up NLTK resources (WordNet, sentence/word tokenizer, POS tagger).
# Recent NLTK releases (3.9+) look up "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead of the legacy "punkt" and
# "averaged_perceptron_tagger" packages, so both generations are downloaded to
# keep the notebook runnable regardless of the installed NLTK version.
for nltk_resource in (
    "wordnet",
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(nltk_resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\focus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\focus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\focus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Read and Preprocess Data

Below, I pre-process the text data along with performing feature extraction.  The following code takes text input, performs various operations on it, and returns a set of linguistic statistics and processed data.

The following code:

- Converts the input text to lowercase
- Tokenizes the text into sentences using the Natural Language Toolkit (`nltk`), and each sentence is further tokenized into words. POS tags are assigned to each word.
- For each word in the text, the code does the following:
  - Determines the word's origin language (etymology).
  - Retrieves the full name of the POS tag from the `pos_mapping`.
  - Counts the number of syllables in the word.
  - Calculates the length of the word.
- Calculates various linguistic statistics, such as the mean syllable count, the number of sentences, the mean sentence length, the mean word length, and the total number of words.

In [3]:
# Load the CLEAR Corpus CSV from the local Kaggle input folder into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="./kaggle/input/clear-corpus-6-01-clear-corpus-6-01/CLEAR Corpus 6.01 - CLEAR Corpus 6.01.csv"
)

In [4]:
# Lookup table from the POS tags produced by `nltk.pos_tag` to readable names.
# Written as (tag, full name) pairs in alphabetical tag order; tags that are
# not listed here are expected to be handled by the caller (e.g. via `.get`).
pos_mapping = dict(
    [
        ("CC", "Coordinating Conjunction"),
        ("CD", "Cardinal Digit"),
        ("DT", "Determiner"),
        ("EX", "Existential There"),
        ("FW", "Foreign Word"),
        ("IN", "Preposition or Subordinating Conjunction"),
        # Adjectives
        ("JJ", "Adjective"),
        ("JJR", "Adjective, Comparative"),
        ("JJS", "Adjective, Superlative"),
        ("LS", "List Item Marker"),
        ("MD", "Modal"),
        # Nouns
        ("NN", "Noun, Singular or Mass"),
        ("NNS", "Noun, Plural"),
        ("NNP", "Proper Noun, Singular"),
        ("NNPS", "Proper Noun, Plural"),
        ("PDT", "Predeterminer"),
        ("POS", "Possessive Ending"),
        # Pronouns
        ("PRP", "Personal Pronoun"),
        ("PRP$", "Possessive Pronoun"),
        # Adverbs
        ("RB", "Adverb"),
        ("RBR", "Adverb, Comparative"),
        ("RBS", "Adverb, Superlative"),
        ("RP", "Particle"),
        ("TO", "to"),
        ("UH", "Interjection"),
        # Verbs
        ("VB", "Verb, Base Form"),
        ("VBD", "Verb, Past Tense"),
        ("VBG", "Verb, Gerund or Present Participle"),
        ("VBN", "Verb, Past Participle"),
        ("VBP", "Verb, Non-3rd Person Singular Present"),
        ("VBZ", "Verb, 3rd Person Singular Present"),
        # Wh-words
        ("WDT", "Wh-determiner"),
        ("WP", "Wh-pronoun"),
        ("WP$", "Possessive Wh-pronoun"),
        ("WRB", "Wh-adverb"),
    ]
)


# Helper for process_text: mean that is well-defined for empty input
def _mean_or_zero(values):
    """Return the arithmetic mean of the list `values`, or 0.0 if it is empty."""
    # np.mean([]) emits a RuntimeWarning and returns NaN, which would otherwise
    # leak into the feature columns for empty excerpts.
    return np.mean(values) if values else 0.0


# Define a function to process text
def process_text(text):
    """Extract token-level linguistic features and summary statistics from text.

    The text is lowercased, split into sentences, and each sentence is
    tokenized and POS-tagged with NLTK. Every token returned by the tokenizer
    contributes to the features (NOTE: this includes punctuation tokens, so
    they count toward the word/syllable statistics as well).

    Parameters
    ----------
    text : str
        Raw excerpt to analyse.

    Returns
    -------
    tuple
        An 11-tuple, in this order:

        0. word_origins (list of str): origin language name of each token as
           reported by ``ety.origins``, or ``"unknown"`` when none is found.
        1. origin_counts (Counter): frequency of each origin language.
        2. word_pos (list of str): full POS name of each token, looked up in
           ``pos_mapping``; tags missing from the mapping are kept as-is.
        3. pos_counts (Counter): frequency of each POS name.
        4. syllable_counts (list): ``syllapy.count`` result per token.
        5. mean_syllable_count (float): mean of ``syllable_counts``.
        6. num_sentences (int): number of sentences found.
        7. mean_sentence_length (float): mean number of tokens per sentence.
        8. mean_word_length (float): mean number of characters per token.
        9. num_words (int): total number of tokens.
        10. processed_excerpt (str): the lowercased text.

        For text without any tokens (e.g. an empty string) the lists and
        counters are empty, the counts are 0 and the means are 0.0.
    """
    # Convert text to lowercase
    text = text.lower()

    # Per-token features, accumulated across all sentences
    word_origins = []
    word_pos = []
    syllable_counts = []
    word_lengths = []
    # Number of tokens in each sentence
    sentence_lengths = []

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    for sentence in sentences:
        # Tokenize the sentence into words and tag each token with its POS
        pos_tags = nltk.pos_tag(word_tokenize(sentence))
        sentence_lengths.append(len(pos_tags))

        for token, pos in pos_tags:
            # Origin language (etymology): first origin reported, if any
            origins = ety.origins(token)
            word_origins.append(origins[0].language.name if origins else "unknown")

            # Full POS name; unmapped tags fall back to the raw tag
            word_pos.append(pos_mapping.get(pos, pos))

            # Syllable count and character length of the token
            syllable_counts.append(syllapy.count(token))
            word_lengths.append(len(token))

    # Return the processed data and statistics
    return (
        word_origins,
        Counter(word_origins),
        word_pos,
        Counter(word_pos),
        syllable_counts,
        _mean_or_zero(syllable_counts),
        len(sentences),
        _mean_or_zero(sentence_lengths),
        _mean_or_zero(word_lengths),
        # int() keeps the count an integer even when there are no sentences
        # (np.sum([]) is the float 0.0)
        int(np.sum(sentence_lengths)),
        text,
    )

In [5]:
# Names of the features returned by process_text, in the order it returns them
feature_columns = [
    "word_origins",
    "word_origin_counts",
    "pos",
    "pos_counts",
    "syllable_counts",
    "mean_syllable_count",
    "num_sentences",
    "mean_sentence_length",
    "mean_word_length",
    "num_words",
    "processed_excerpt",
]

# Run process_text once per excerpt (with a progress bar), then expand the
# resulting tuples into columns with a single DataFrame construction. This
# avoids building a separate pd.Series for every row inside the apply.
excerpt_features = pd.DataFrame(
    df["Excerpt"].progress_apply(process_text).tolist(),
    columns=feature_columns,
    index=df.index,  # keep rows aligned with df when assigning below
)
df[feature_columns] = excerpt_features

100%|██████████| 4724/4724 [01:49<00:00, 43.08it/s]


## Bi-directional LSTM Model

Below, I prepare text data for the machine learning model.

- **Fit Tokenizer:** The Tokenizer is trained on preprocessed text excerpts. This step helps the Tokenizer build a vocabulary and assign a unique integer index to each word in the text.
- **Convert Text to Sequences:** The preprocessed excerpts in the DataFrame are converted into sequences of integer word indices using the Tokenizer.
- **Pad Sequences:** To ensure all sequences have the same length, each sequence is padded with zeros, or truncated, to a fixed length of 300 tokens.
- **Label Creation:** Labels for the data are created from the "BT Easiness" column.
- **Data Splitting:** The data is split into training and testing sets. I allocate 20% of the data for testing.
- **Early Stopping:** I define early stopping criteria for model training.

In [6]:
# Vectorization settings: vocabulary size cap for the Tokenizer and the fixed
# length every integer sequence is padded to
max_words = 50000
max_sequence_length = 300

# Regression targets taken from the "BT Easiness" column
labels = np.array(df["BT Easiness"])

# Build the vocabulary from the preprocessed excerpts, keeping at most
# `max_words` words when encoding
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df["processed_excerpt"])

# Encode every excerpt as a list of integer word indices, then bring all of
# them to the same length
sequences = tokenizer.texts_to_sequences(df["processed_excerpt"])
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Hold out 20% of the examples for testing (shuffled, fixed seed)
x_text_train, x_text_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Stop training once the validation loss has not improved for 5 epochs and
# roll back to the best weights seen
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor="val_loss",
)

Define and train the model

- **Model Definition:** 
  - A Sequential model is created.
  - The first layer is an Embedding layer, which is used for text data.
  - A Bidirectional LSTM layer with 64 units follows the Embedding layer. Bidirectional LSTM processes sequences in both forward and backward directions, capturing context effectively.
  - A LeakyReLU activation function with an alpha of 0.1 is applied to the output of the LSTM layer.
  - A Dropout layer with a rate of 0.25 is added to reduce overfitting.
  - Finally, a Dense layer with one unit and a linear activation function is used for regression.
- **Model Training:** 
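  - The model is trained for up to 30 epochs with a batch size of 50, using the test split as validation data. Early stopping halts training once the validation loss has not improved for 5 consecutive epochs and restores the best weights.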

In [7]:
# Bidirectional-LSTM regressor, declared as an ordered list of layers.
# NOTE: Sequential and the layer classes below must be imported in the imports
# cell (tensorflow.keras.models / tensorflow.keras.layers); without them this
# cell raises NameError on a fresh kernel.
model = Sequential(
    [
        # Token indices -> 128-dimensional embeddings
        Embedding(max_words, 128, input_length=max_sequence_length),
        # 64 LSTM units in each direction
        Bidirectional(LSTM(64)),
        LeakyReLU(alpha=0.1),
        # Regularization against overfitting
        Dropout(0.25),
        # Single linear unit: the predicted easiness score
        Dense(1, activation="linear"),
    ]
)
model.compile(optimizer="adam", loss="mean_squared_error")

# Display a summary of the model architecture
model.summary()

# Train the model; the test split doubles as validation data, and the early
# stopping callback can end training before the 30-epoch limit
history = model.fit(
    x_text_train,
    y_train,
    validation_data=(x_text_test, y_test),
    epochs=30,
    batch_size=50,
    callbacks=[early_stopping],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 128)          6400000   
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 128)               0         
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 6498945 (24.79 MB)
Trainable params: 6498945 (24.79 MB)
Non-trainable params: 0 (0.00 Byte)
________________

KeyboardInterrupt: 

Plot the training and validation loss for each epoch (the validation loss is computed on the test split).

In [None]:
# Per-epoch loss curves recorded by model.fit
train_loss = history.history["loss"]
val_loss = history.history["val_loss"]

# 1-based epoch numbers for the x-axis
epochs = [epoch_index + 1 for epoch_index in range(len(train_loss))]

# One line per curve: training loss in blue, validation loss in red
fig = go.Figure()
for curve_name, curve_values, curve_color in (
    ("Training Loss", train_loss, "blue"),
    ("Validation Loss", val_loss, "red"),
):
    fig.add_trace(
        go.Scatter(
            name=curve_name,
            x=epochs,
            y=curve_values,
            mode="lines",
            line={"color": curve_color},
        )
    )

# Title, axis labels, legend and theme
fig.update_layout(
    template="plotly_white",
    showlegend=True,
    title="Training and Validation Loss",
    xaxis_title="Epochs",
    yaxis_title="Loss",
)

# Display the figure
fig.show()