# CNN and Spelling Error Detection
---

### Loading essential packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Input
from sklearn.model_selection import train_test_split

### Making of Dataset

#### Reading misspelled words

In [2]:
with open("mis-spelled.txt", "r") as f:
    file1 = f.readlines()
# file1[0]

# removing the newline characters and correctly spelled words
misspelled = []
for i in file1:
    if i[0] != '$':
        misspelled.append(i[:-1])
# len(misspelled)

#### Reading correctly-spelled word

In [3]:
with open("correct-spelled.txt","r") as f:
  file2 = f.readlines()
# file2[0]

# removing the newline characters
correct = []
for i in file2:
  correct.append(i[:-1])
# len(correct)

#### Mixing as [correctly-spelled : misspelled :: 10 : 1]

In [4]:
p = 10000
q = 10*p

In [5]:
words = []
for i in range(p):
  words.append([misspelled[i],0])
for i in range(q):
  words.append([correct[i],1])

len(words)

110000

In [6]:
import random
random.shuffle(words)

In [7]:
# saving the dataset
with open("words.txt","w+") as f:
  for i in words:
    f.write(i[0] + "," + str(i[1]) + "\n")

### Splitting the data

In [8]:
word = []
marker = []

for i in words:
  word.append(i[0])
  marker.append(i[1])
    
# print(X[6], y[6])

In [9]:
x_train, x_test, y_train, y_test = train_test_split(word, marker)

### Manually tokenizing

In [10]:
count = 1
symbol2idx = {}
for k in x_train:
  for x in k:
    if symbol2idx.get(x.lower()) is None:
      symbol2idx[x] = count
      count += 1

for k in x_test:
  for x in k:
    if symbol2idx.get(x.lower()) is None:
      symbol2idx[x] = count
      count += 1

### Tokenized Set of Letters of Words

In [11]:
symbol2idx

{'d': 1,
 'a': 2,
 't': 3,
 'b': 4,
 'o': 5,
 'c': 6,
 'e': 7,
 'r': 8,
 'u': 9,
 'i': 10,
 'n': 11,
 'f': 12,
 'm': 13,
 'l': 14,
 's': 15,
 'p': 16,
 'v': 17,
 'h': 18,
 'z': 19,
 'y': 20,
 'g': 21,
 'k': 22,
 'w': 23,
 'q': 24,
 'x': 25,
 'j': 26,
 '_': 27,
 '-': 28,
 "'": 29,
 '.': 30}

### Getting the length of Largest Word

In [12]:
eq_train = []
for k in x_train:
  z = []
  for x in k:
    z.append(symbol2idx[x.lower()])
  eq_train.append(z)

len(eq_train)

82500

In [13]:
T = 1
for x in eq_train:
  if T < len(x):
    T = len(x)
    
T

31

In [14]:
eq_test = []
for k in x_test:
  z = []
  for x in k:
    z.append(symbol2idx[x.lower()])
  eq_test.append(z)

len(eq_test)

27500

In [15]:
for x in eq_test:
  if T < len(x):
    T = len(x)
    
T

31

### Making all the Lists Equal in Size

In [16]:
for i in range(len(eq_train)):
  while(len(eq_train[i])<T):
    eq_train[i].append(0)

for i in range(len(eq_test)):
  while(len(eq_test[i])<T):
    eq_test[i].append(0)

### Making of Model

In [17]:
V = count

In [18]:
x_train = np.array(eq_train)
x_test = np.array(eq_test)

In [19]:
i = Input(shape=(T, ))
x = Embedding(V, 20)(i)
x = Conv1D(1024, 7, activation="relu")(x)
x = MaxPooling1D(3)(x)
x = Conv1D(1024, 3, activation="relu")(x)
x = MaxPooling1D(3)(x)
x = Conv1D(1024, 1, activation="relu")(x)
x = Conv1D(1024, 1, activation="relu")(x)
x = Conv1D(1024, 1, activation="relu")(x)
x = Conv1D(1024, 1, activation="relu")(x)
x = MaxPooling1D(1)(x)
x = Dense(2048, activation="relu")(x)
x = Dense(2048, activation="relu")(x)
x = Dense(1, activation="sigmoid")(x)

model = Model(i, x)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [20]:
y_train = np.array(y_train)
y_test = np.array(y_test)

### Training of Model

In [21]:
my_model = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Result

In [22]:
# accuracy of trained words
my_model.history['accuracy']

[0.9085272550582886,
 0.9126848578453064,
 0.9178000092506409,
 0.920127272605896,
 0.921963632106781,
 0.925151526927948,
 0.9259818196296692,
 0.9277818202972412,
 0.9295212030410767,
 0.9305575489997864,
 0.9320605993270874,
 0.9317636489868164,
 0.9329212307929993,
 0.933624267578125,
 0.9334787726402283,
 0.9335393905639648,
 0.9356363415718079,
 0.935903012752533,
 0.9307818412780762,
 0.9331818222999573]

In [23]:
# accuracy of tested words
my_model.history["val_accuracy"]

[0.9100000262260437,
 0.9169636368751526,
 0.9172363877296448,
 0.9210545420646667,
 0.9203454256057739,
 0.9178181886672974,
 0.9235818386077881,
 0.9236727356910706,
 0.9249818325042725,
 0.9228727221488953,
 0.9235454797744751,
 0.9228727221488953,
 0.92372727394104,
 0.9241636395454407,
 0.9248181581497192,
 0.9228545427322388,
 0.9223454594612122,
 0.9237818121910095,
 0.9185818433761597,
 0.9234908819198608]

In [24]:
# loss of trained words
my_model.history['loss']

[0.2969233989715576,
 0.266635000705719,
 0.2498001605272293,
 0.24169543385505676,
 0.2351873517036438,
 0.22774413228034973,
 0.2221507579088211,
 0.21787495911121368,
 0.2134140580892563,
 0.20984987914562225,
 0.205118790268898,
 0.20651927590370178,
 0.20015107095241547,
 0.20086081326007843,
 0.20068475604057312,
 0.19932757318019867,
 0.19362248480319977,
 0.19373877346515656,
 0.21317680180072784,
 0.20116235315799713]

In [25]:
# loss of tested words
my_model.history['val_loss']

[0.2785904109477997,
 0.2617946267127991,
 0.2615647315979004,
 0.2510560154914856,
 0.24453827738761902,
 0.24883835017681122,
 0.23639553785324097,
 0.24426767230033875,
 0.23638227581977844,
 0.23861585557460785,
 0.2371080070734024,
 0.2520958185195923,
 0.24652400612831116,
 0.2634761333465576,
 0.25040218234062195,
 0.2432243824005127,
 0.24720682203769684,
 0.26243844628334045,
 0.25262191891670227,
 0.3734048306941986]