# Title

## Problem Description

## Imports and Utility Code

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import nltk
from collections import Counter
import syllapy
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import ety
from tqdm import tqdm

import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import (
    Embedding,
    Bidirectional,
    LSTM,
    Dense,
    Dropout,
    Input,
    Flatten,
    concatenate,
)
from keras import regularizers
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LeakyReLU
from keras.models import Model

# Register tqdm's pandas integration so Series.progress_apply is available.
tqdm.pandas()

# Fetch the NLTK data this notebook relies on. Newer NLTK releases look up
# "punkt_tab" and "averaged_perceptron_tagger_eng" for sentence/word
# tokenization and POS tagging, while older releases use the legacy names, so
# both generations are requested. With its default arguments nltk.download
# reports a failed download and returns False instead of raising.
for resource in (
    "wordnet",
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource)

## Exploratory Data Analysis

In [None]:
# Load the CLEAR corpus and keep only the rows marked as the Kaggle training
# split. The explicit .copy() makes the filtered frame independent of the raw
# corpus frame, so the feature columns added to it later are written to its
# own data rather than to a slice of the original.
df = pd.read_csv("CLEAR Corpus 6.01 - CLEAR Corpus 6.01.csv")
df = df[df["Kaggle split"] == "Train"].copy()

In [None]:
# Human-readable names for the part-of-speech tags returned by nltk.pos_tag.
# process_text looks tags up with pos_mapping.get(tag, tag), so any tag that
# has no entry here (presumably punctuation tags such as "," or "." -- verify)
# is passed through unchanged.
pos_mapping = {
    "CC": "Coordinating Conjunction",
    "CD": "Cardinal Digit",
    "DT": "Determiner",
    "EX": "Existential There",
    "FW": "Foreign Word",
    "IN": "Preposition or Subordinating Conjunction",
    "JJ": "Adjective",
    "JJR": "Adjective, Comparative",
    "JJS": "Adjective, Superlative",
    "LS": "List Item Marker",
    "MD": "Modal",
    "NN": "Noun, Singular or Mass",
    "NNS": "Noun, Plural",
    "NNP": "Proper Noun, Singular",
    "NNPS": "Proper Noun, Plural",
    "PDT": "Predeterminer",
    "POS": "Possessive Ending",
    "PRP": "Personal Pronoun",
    "PRP$": "Possessive Pronoun",
    "RB": "Adverb",
    "RBR": "Adverb, Comparative",
    "RBS": "Adverb, Superlative",
    "RP": "Particle",
    "TO": "to",
    "UH": "Interjection",
    "VB": "Verb, Base Form",
    "VBD": "Verb, Past Tense",
    "VBG": "Verb, Gerund or Present Participle",
    "VBN": "Verb, Past Participle",
    "VBP": "Verb, Non-3rd Person Singular Present",
    "VBZ": "Verb, 3rd Person Singular Present",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive Wh-pronoun",
    "WRB": "Wh-adverb",
}


def process_text(text):
    """Extract word- and sentence-level features from a single excerpt.

    The text is lowercased, split into sentences with ``sent_tokenize``, and
    each sentence is tokenized and POS-tagged with NLTK. For every token the
    function records its etymological origin (via ``ety``), its POS name
    (via ``pos_mapping``), its syllable count (via ``syllapy``) and its
    character length.

    Args:
        text: The raw excerpt as a string.

    Returns:
        An 11-tuple, in this order:

        * ``word_origins`` -- per-token origin language name, ``"unknown"``
          when ``ety`` returns no origin.
        * ``origin_counts`` -- ``Counter`` of ``word_origins``.
        * ``word_pos`` -- per-token POS name (raw tag if not in
          ``pos_mapping``).
        * ``pos_counts`` -- ``Counter`` of ``word_pos``.
        * ``syllable_counts`` -- per-token syllable count.
        * ``mean_syllable_count`` -- mean of ``syllable_counts``.
        * ``num_sentences`` -- number of sentences found.
        * ``mean_sentence_length`` -- mean number of tokens per sentence.
        * ``mean_word_length`` -- mean number of characters per token.
        * ``num_words`` -- total number of tokens across all sentences.
        * ``processed_excerpt`` -- the lowercased text.

        When the text yields no tokens, the three means are ``0.0`` (instead
        of NaN with a NumPy RuntimeWarning) and ``num_words`` is ``0``.
    """
    # NOTE(review): lowercasing happens before POS tagging, which may affect
    # how capitalised words (e.g. proper nouns) are tagged -- confirm intended.
    text = text.lower()

    word_origins = []
    word_pos = []
    syllable_counts = []
    sentence_lengths = []
    word_lengths = []

    sentences = sent_tokenize(text)

    for sentence in sentences:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(sentence))
        sentence_lengths.append(len(pos_tags))
        for token, pos in pos_tags:
            origins = ety.origins(token)
            # Only the first origin reported by ety is kept.
            word_origins.append(origins[0].language.name if origins else "unknown")
            word_pos.append(pos_mapping.get(pos, pos))
            syllable_counts.append(syllapy.count(token))
            word_lengths.append(len(token))

    def _mean_or_zero(values):
        # np.mean([]) returns NaN and emits a RuntimeWarning; use a neutral
        # 0.0 so empty excerpts do not inject NaNs into the feature columns.
        return np.mean(values) if values else 0.0

    return (
        word_origins,
        Counter(word_origins),
        word_pos,
        Counter(word_pos),
        syllable_counts,
        _mean_or_zero(syllable_counts),
        len(sentences),
        _mean_or_zero(sentence_lengths),
        _mean_or_zero(word_lengths),
        # One entry is appended per token, so this equals the sum of the
        # sentence lengths while always being a plain int (np.sum([]) is 0.0).
        len(word_lengths),
        text,
    )

In [None]:
# Column names, in the same order as the tuple returned by process_text.
feature_columns = [
    "word_origins",
    "word_origin_counts",
    "pos",
    "pos_counts",
    "syllable_counts",
    "mean_syllable_count",
    "num_sentences",
    "mean_sentence_length",
    "mean_word_length",
    "num_words",
    "processed_excerpt",
]

# Run the feature extractor over every excerpt (with a tqdm progress bar) and
# spread each returned tuple across the columns listed above.
df[feature_columns] = df["Excerpt"].progress_apply(
    lambda excerpt: pd.Series(process_text(excerpt))
)

## Models

In [None]:
# Hyperparameters shared by the model cells.
max_sequence_length = 300  # length every token-id sequence is padded/truncated to
max_words = 50000  # num_words limit handed to the Keras Tokenizer
dropout_rate = 0.25

# Numeric per-excerpt features computed by process_text.
sentence_feature_names = [
    "mean_syllable_count",
    "num_sentences",
    "mean_sentence_length",
    "mean_word_length",
]
sentence_features = df[sentence_feature_names]

# Turn the lowercased excerpts into fixed-length integer sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df["processed_excerpt"])
sequences = tokenizer.texts_to_sequences(df["processed_excerpt"])
data = pad_sequences(sequences, maxlen=max_sequence_length)

# Regression target.
labels = np.array(df["BT Easiness"])

# One shared 80/20 split so the text inputs, sentence features and labels
# stay row-aligned.
(
    x_text_train,
    x_text_test,
    x_sentence_train,
    x_sentence_test,
    y_train,
    y_test,
) = train_test_split(
    data,
    sentence_features,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best
# weights seen.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

### Bidirectional LSTM

In [None]:
# Text-only regressor: token ids -> 128-d embeddings -> bidirectional LSTM ->
# LeakyReLU -> dropout -> single linear output (the predicted easiness score).
modela = Sequential()
modela.add(Embedding(max_words, 128, input_length=max_sequence_length))
modela.add(Bidirectional(LSTM(64)))
modela.add(LeakyReLU(alpha=0.1))
# Use the shared dropout_rate hyperparameter rather than a duplicated literal,
# so changing it in the hyperparameter cell actually affects this model.
modela.add(Dropout(dropout_rate))
modela.add(Dense(1, activation="linear"))

modela.compile(
    loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"]
)
modela.summary()

# NOTE(review): the held-out split is also the validation set that drives
# early stopping, so val_loss is not an independent test estimate.
history = modela.fit(
    x_text_train,
    y_train,
    epochs=30,
    batch_size=15,
    validation_data=(x_text_test, y_test),
    callbacks=[early_stopping],
)

In [None]:
# Per-epoch losses recorded by Keras during the fit.
train_loss = history.history["loss"]
val_loss = history.history["val_loss"]

# Epoch numbers start at 1; materialised as a list rather than a range object.
epochs = list(range(1, len(train_loss) + 1))

# Draw both curves on one figure: training in blue, validation in red.
fig = go.Figure()
for curve, label, colour in (
    (train_loss, "Training Loss", "blue"),
    (val_loss, "Validation Loss", "red"),
):
    fig.add_trace(
        go.Scatter(
            x=epochs,
            y=curve,
            mode="lines",
            name=label,
            line={"color": colour},
        )
    )

fig.update_layout(
    title="Training and Validation Loss",
    xaxis_title="Epochs",
    yaxis_title="Loss",
    showlegend=True,
    template="plotly_white",
)
fig.show()

## Predictions

In [None]:
# Load the excerpts to score (replace with your new data file).
new_data = pd.read_csv("test.csv")
texts = new_data["excerpt"].to_numpy()

# Encode with the tokenizer fitted on the training excerpts and pad to the
# same length the model was trained on.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=max_sequence_length)

# The model emits one value per row with shape (n, 1); flatten to 1-D.
predictions = modela.predict(data)

# Submission frame: the 'id' column plus the predicted 'target'.
output = new_data[["id"]].assign(target=predictions.flatten())

# Write without the index so the CSV holds only 'id' and 'target'.
output.to_csv("submission.csv", index=False)

## Results and Analysis

## Conclusion

## References

http://etym.org/