In [1]:
#import the libraries

from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np

Create Model

In [2]:
# Old model: a small convolutional binary classifier.
# Inputs are rescaled by 1/255, passed through four (3x3 conv + 2x2 max-pool)
# stages and one final 3x3 conv, then flattened into a single sigmoid unit.
# NOTE(review): this is an image-style architecture, while the cells below
# prepare word/definition text -- confirm this is still the intended model.
model = keras.Sequential()
model.add(layers.Rescaling(scale=1. / 255))

# Convolution stages that are each followed by max pooling.
for n_filters in (32, 64, 128, 128):
    model.add(layers.Conv2D(filters=n_filters, kernel_size=(3, 3), activation="relu"))
    model.add(layers.MaxPooling2D(pool_size=2))

# Last convolution has no pooling; its output feeds the classifier head.
model.add(layers.Conv2D(filters=256, kernel_size=(3, 3), activation="relu"))
model.add(layers.Flatten())
model.add(layers.Dense(units=1, activation="sigmoid"))

model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

DATA LOADING

In [3]:
# Load the three raw sources used to build the dataset:
#   * the reference dictionary (CSV),
#   * the first set of simplified definitions (Excel, sheet "Sheet1"),
#   * the second set of simplified definitions (JSON).
reference_dictionary_file = 'OPTED-Dictionary.csv'
child_friendly_file = 'ChildFriendlyDefinitions.xlsx'
simplified_json_file = 'data.json'

definitions = pd.read_csv(reference_dictionary_file)
simpDef1 = pd.read_excel(child_friendly_file, sheet_name='Sheet1')
simpDef2 = pd.read_json(simplified_json_file)


DATA PREPROCESSING (The cells from here through the `fullDataset.csv` export only need to be run once; they build the full dataset and save it as a CSV.)

In [4]:
import pandas as pd
def cleanDataframe(df):
    """Return a cleaned, de-duplicated version of a word/definition frame.

    Steps, in order:
      1. Strip bracketed/parenthesised/braced/angle-bracketed spans, digits and
         the listed symbol characters from every string cell.
      2. Drop rows that contain missing values.
      3. Lower-case the ``word`` column.
      4. Sort by ``word`` and keep the first row for each distinct word.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``word``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and a fresh 0..n-1 index.
    """
    # Raw string so the backslash escapes reach the regex engine untouched
    # (the non-raw form relies on invalid string escapes and emits warnings).
    # may need to take the '-' out
    illegal_pattern = r"\[(.*?)\]|[0-9!@#$%^&*?\/=+\-]|\((.*?)\)|\{(.*?)\}|\<(.*?)\>"

    # replace() and dropna() both return new frames, so `df` stays untouched.
    cleaned = df.replace(to_replace=illegal_pattern, value="", regex=True).dropna()

    # lower case everything
    cleaned = cleaned.assign(word=cleaned["word"].str.lower())

    # A stable sort keeps rows with equal words in their original order, so
    # keep='first' deterministically retains the earliest entry for each word.
    cleaned = cleaned.sort_values("word", ascending=True, kind="mergesort")
    cleaned = cleaned.drop_duplicates(subset="word", keep="first")

    # drop=True discards the old index directly; going through a temporary
    # 'index' column would clobber a caller column that is itself named 'index'.
    return cleaned.reset_index(drop=True)
    


ACTUAL DEFINITIONS

In [5]:
# Reference dictionary: discard the unused Count/POS columns, expose the
# Word/Definition columns under lower-case names, then run the shared cleaner.
definitions_filter = cleanDataframe(
    df=definitions
    .drop(columns=['Count', 'POS'])
    .assign(
        word=lambda frame: frame['Word'],
        definition=lambda frame: frame['Definition'],
    )
    .drop(columns=['Word', 'Definition'])
)
definitions_filter


Unnamed: 0,word,definition
0,'em,"""An obsolete or colloquial contraction of the ..."
1,'gainst,"""A contraction of Against."""
2,'mongst,"""See Amongst."""
3,'neath,"""An abbreviation of Beneath."""
4,'s,"""A contraction for is or for has."""
...,...,...
111468,zymotic,"""Of pertaining to or caused by fermentation."""
111469,zyophyte,"""Any plant of a proposed class or grand divisi..."
111470,zythem,"""See Zythum."""
111471,zythepsary,"""A brewery."""


SIMPLIFIED DEFINITIONS

Part A

In [6]:
#Filter simpDef1
simpDef1_Filter = simpDef1.drop(['Exemplar'], axis=1)
simpDef1_Filter['word'] = simpDef1_Filter['Word']
simpDef1_Filter['definition'] = simpDef1_Filter['Child Friendly Definition']
simpDef1_Filter = simpDef1_Filter.drop(
    ['Word', 'Child Friendly Definition'], axis=1)
simpDef1_Filter = cleanDataframe(simpDef1_Filter)
simpDef1_Filter


Unnamed: 0,word,definition
0,accessible,When something is accessible it means anyone c...
1,accommodate,You accommodate when you change something that...
2,accomplish,"If you accomplish something, you succeed in do..."
3,achieve,"If you achieve something, you succeed in doing..."
4,acre,An acre is a very large area of land about the...
...,...,...
162,value,The value of a place or thing is how much mone...
163,verify,"If you verify something, you make sure that it..."
164,vigilant,Someone who is vigilant pays careful attention...
165,visible,"When something is visible, you can see it."


Part B

In [7]:
# Simplified definitions, part B: no column renaming is done here, the frame
# loaded from JSON goes straight through the shared cleaner.
# Note the two names differ only by case: `simpDef2_filter` is the raw alias,
# `simpDef2_Filter` is the cleaned result used further down.
simpDef2_filter = simpDef2
simpDef2_Filter = cleanDataframe(df=simpDef2_filter)
simpDef2_Filter


Unnamed: 0,word,definition
0,'s,a suffix used to form the possessive of most s...
1,'tis,"shortened form of ""it is."""
2,'twas,"shortened form of ""it was."""
3,a,the first letter of the English alphabet.
4,a dime a dozen,plentiful and easy to get; common; cheap.
...,...,...
13907,zone,an area that is divided from other areas becau...
13908,zoo,"a place where living animals, especially wild ..."
13909,zoology,the science and study of animals.
13910,zoom,to move quickly while making a low humming sou...


Getting Dataset ready for training

In [8]:
# Combine all of the datasets into one frame so it can be written to a CSV.
# Doing this so the preprocessing does not have to be re-run every session
# (it takes up a lot of RAM).
def _pair_with_reference(simplified, reference):
    """Match simplified definitions to reference definitions by word.

    Parameters
    ----------
    simplified : pandas.DataFrame
        Frame with ``word`` and ``definition`` columns (simplified wording).
    reference : pandas.DataFrame
        Frame with ``word`` and ``definition`` columns (dictionary wording).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``simplified`` whose word also occurs in
        ``reference``, in the order of ``simplified``, with columns
        ``word``, ``definition`` and ``simplified_definition``.
    """
    # One non-null reference definition per word (the first one encountered).
    lookup = (
        reference[['word', 'definition']]
        .dropna(subset=['definition'])
        .drop_duplicates(subset='word', keep='first')
    )
    # An inner merge keeps only words present on both sides and preserves the
    # row order of the left (simplified) frame. It replaces a per-row boolean
    # scan of the whole dictionary plus a concat per match.
    paired = (
        simplified[['word', 'definition']]
        .rename(columns={'definition': 'simplified_definition'})
        .merge(lookup, on='word', how='inner')
    )
    return paired[['word', 'definition', 'simplified_definition']]


# Part A rows first, then part B rows, with a single continuous index.
df = pd.concat(
    [
        _pair_with_reference(simpDef1_Filter, definitions_filter),
        _pair_with_reference(simpDef2_Filter, definitions_filter),
    ],
    ignore_index=True,
)

df

Unnamed: 0,word,definition,simplified_definition
0,accessible,"""Easy of access or approach; approachable; as ...",When something is accessible it means anyone c...
1,accommodate,"""To render fit suitable or correspondent; to ...",You accommodate when you change something that...
2,accomplish,"""To complete as time or distance.""","If you accomplish something, you succeed in do..."
3,achieve,"""To finish; to kill.""","If you achieve something, you succeed in doing..."
4,acre,"""A piece of land containing square rods or ...",An acre is a very large area of land about the...
...,...,...,...
10913,zither,"""An instrument of music used in Austria and Ge...",a stringed instrument that has a flat sound bo...
10914,zodiac,"""An imaginary belt in the heavens or broad ...",an imaginary belt in the heavens that includes...
10915,zone,"""Circuit; circumference.""",an area that is divided from other areas becau...
10916,zoo,"""A combining form from Gr. zwo n an animal as ...","a place where living animals, especially wild ..."


In [9]:
# Write the combined dataset to fullDataset.csv; index=False leaves the
# DataFrame's row index out of the file.
df.to_csv("fullDataset.csv", index=False)

Training the Algorithm

In [None]:
# Checkpoint the model to disk whenever the monitored validation loss improves.
best_model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="convnet_from_scratch.keras",
    monitor="val_loss",
    save_best_only=True,
)
callbacks = [best_model_checkpoint]

# NOTE(review): `train_dataset` and `validation_dataset` are not defined in
# any cell visible above -- confirm where they are created before running.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=30,
    callbacks=callbacks,
)

Predicting/Plotting Data

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
accuracy, val_accuracy, loss, val_loss = (
    history.history[metric_name]
    for metric_name in ("accuracy", "val_accuracy", "loss", "val_loss")
)
epochs = range(1, len(accuracy) + 1)

# First figure: training (dots) vs. validation (line) accuracy.
accuracy_axes = plt.gca()
accuracy_axes.plot(epochs, accuracy, "bo", label="Training accuracy")
accuracy_axes.plot(epochs, val_accuracy, "b", label="Validation accuracy")
accuracy_axes.set_title("Training and validation accuracy")
accuracy_axes.legend()

# Second figure: training (dots) vs. validation (line) loss.
plt.figure()
loss_axes = plt.gca()
loss_axes.plot(epochs, loss, "bo", label="Training loss")
loss_axes.plot(epochs, val_loss, "b", label="Validation loss")
loss_axes.set_title("Training and validation loss")
loss_axes.legend()

plt.show()