<a href="https://colab.research.google.com/github/eve2024/Web_Programming/blob/main/final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, Attention
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier


Load Dataset & Preprocess Data

In [None]:
# Load dataset
# Layout handled here: a line starting with "$" names a correct word, and each
# following non-"$" line holds one misspelling of that word. A misspelling
# written on the "$" line itself (after the correct word) is accepted as well.
# NOTE(review): the one-misspelling-per-following-line layout is assumed from
# the "$" prefix convention -- confirm against the actual corpus file.
with open("/content/wikipedia.dat.txt", "r") as file:
    lines = file.readlines()

corrected_words = []
misspelled_words = []

# Correct word that subsequent misspelling lines belong to; None until the
# first usable "$" line has been seen.
current_word = None

for line in lines:
    text = line.strip()
    if not text:
        continue  # blank lines carry no data

    if text.startswith("$"):
        # The entry is the text between the leading "$" and any later "$".
        tokens = text.split("$")[1].split()
        if not tokens:
            # A bare "$" has no word to unpack; skip it instead of raising
            # ValueError, and do not attach later lines to a stale word.
            current_word = None
            continue
        current_word = tokens[0]
        if len(tokens) > 1:
            # Same-line layout: only the first token after the correct word
            # is used as its misspelling.
            corrected_words.append(current_word)
            misspelled_words.append(tokens[1])
    elif current_word is not None:
        # Continuation line: its first token is a misspelling of current_word.
        corrected_words.append(current_word)
        misspelled_words.append(text.split()[0])

# Combine the corrected and misspelled words into pairs (one row per misspelling)
data = pd.DataFrame({"corrected_word": corrected_words, "misspelled_word": misspelled_words})

# Data preprocessing: collect every character that occurs in either column
vocab = set()
for column in ("corrected_word", "misspelled_word"):
    for word in data[column].values:
        vocab.update(word)

# Sort before enumerating: string hashing is randomised per Python process, so
# iterating the raw set could assign different indices after a kernel restart.
# Indices start at 1, leaving 0 unused by any character.
word_to_index = {char: index + 1 for index, char in enumerate(sorted(vocab))}


FileNotFoundError: [Errno 2] No such file or directory: '/content/wikipedia.dat.txt'

Replace Words with Integers

In [None]:
# Encode each word of both columns as a list of character indices
def _to_indices(words):
    """Return, for every word, the list of ``word_to_index`` values of its characters."""
    encoded = []
    for text in words:
        encoded.append([word_to_index[symbol] for symbol in text])
    return encoded


indexed_corrected_words = _to_indices(data["corrected_word"].values)
indexed_misspelled_words = _to_indices(data["misspelled_word"].values)

# Bring every sequence to the length of the longest word in either column
max_seq_length = max(map(len, indexed_corrected_words + indexed_misspelled_words))
padded_corrected_words = pad_sequences(indexed_corrected_words, maxlen=max_seq_length)
padded_misspelled_words = pad_sequences(indexed_misspelled_words, maxlen=max_seq_length)


Split into Training and Testing Sets

In [None]:
# Hold out 20% of the pairs for testing: misspelled sequences are the inputs (X),
# corrected sequences are the targets (y). The fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_misspelled_words,
    padded_corrected_words,
    test_size=0.2,
    random_state=42,
)


Bidirectional LSTM

In [None]:
# Model Architecture
# Character-level sequence-to-sequence tagger: for every position of the padded
# misspelled word, predict the character index at the same position of the
# corrected word.

# Character indices start at 1, so the id range is 0..len(vocab); index 0 is
# the value pad_sequences fills with by default.
vocab_size = len(vocab) + 1

input_layer = Input(shape=(max_seq_length,))
# `input_length` is intentionally not passed: the Input layer already fixes the
# sequence length, and the argument is deprecated in current Keras releases.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=100)(input_layer)
# return_sequences=True keeps one output vector per character position.
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)
# Self-attention: the LSTM outputs serve as both query and value.
attention = Attention()
attention_output = attention([lstm_layer, lstm_layer])
# Per-position softmax over all character ids.
output_layer = Dense(vocab_size, activation='softmax')(attention_output)

# Compile model
# NOTE(review): padding positions (id 0) are not masked, so they count towards
# the loss and the accuracy metric -- consider mask_zero=True on the Embedding.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




Fit the Model

In [None]:
# Model training
# One fit call only: 10 epochs, batches of 64, with per-epoch metrics on the
# held-out split. Calling fit again on the same model would continue training
# from the current weights rather than start over.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not independent of the final test evaluation.
model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7ce7dac84fa0>

Evaluate Model Performance with Accuracy

In [None]:
# Evaluate model
# evaluate() returns the compiled loss followed by the compiled metrics
# (here: accuracy), computed on the held-out split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)

# The model emits one softmax distribution per character position, i.e. an
# array shaped (samples, sequence_length, classes). The predicted character id
# is therefore the argmax over the LAST axis; axis=1 would instead pick the
# position with the highest probability for each class.
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=-1)

Test Accuracy: 0.5071428418159485
