In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [2]:
# Load the labelled dataset; later cells read its 'Expression' and 'Valid' columns.
df = pd.read_csv(filepath_or_buffer='expressions.csv')

In [3]:
# Keep only rows that have both an expression and a label, so features and
# targets stay aligned row-for-row.
labeled = df.dropna(subset=['Expression', 'Valid'])

# The tokenizer below works character by character and needs real strings;
# cast explicitly in case pandas inferred a numeric type for entries such as "42".
X_text = labeled['Expression'].astype(str)

# Binary target as a plain NumPy array for Keras.
y = labeled['Valid'].values

In [4]:
# Character-level vocabulary: every distinct character in the corpus gets its own integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts=X_text)

# Encode each expression as a list of character ids.
X_seq = tokenizer.texts_to_sequences(texts=X_text)

In [5]:
# Fixed model input length, in characters.
max_len = 30

# pad_sequences silently cuts anything longer than maxlen (from the front by
# default), so report how many expressions lose characters instead of hiding it.
n_too_long = sum(len(seq) > max_len for seq in X_seq)
if n_too_long:
    print(f"warning: {n_too_long} of {len(X_seq)} expressions exceed "
          f"max_len={max_len} and will be truncated")

# Right-pad every sequence with zeros up to max_len.
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post')

# +1 because token ids start at 1; id 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

In [6]:
# Hold out 20% of the data for the final evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across Restart & Run All.
set_random_seed(42)

# Character embedding -> two stacked LSTMs -> single sigmoid unit (probability of "valid").
# The input shape is declared with an explicit Input layer; Embedding's
# `input_length` argument is deprecated in Keras 3.
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=vocab_size, output_dim=32),
    LSTM(64, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.3),
    LSTM(32),                         # emit only the final state
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])



In [7]:
# Binary classification setup: Adam optimiser, cross-entropy loss, accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs in batches of 256, validating on 10% of the training split.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 133ms/step - accuracy: 0.6258 - loss: 0.6284 - val_accuracy: 0.8600 - val_loss: 0.3432
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 122ms/step - accuracy: 0.8734 - loss: 0.3298 - val_accuracy: 0.8941 - val_loss: 0.2771
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 117ms/step - accuracy: 0.9103 - loss: 0.2325 - val_accuracy: 0.9848 - val_loss: 0.0677
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 118ms/step - accuracy: 0.9877 - loss: 0.0484 - val_accuracy: 0.9800 - val_loss: 0.0756
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 115ms/step - accuracy: 0.9915 - loss: 0.0364 - val_accuracy: 0.9934 - val_loss: 0.0241
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 121ms/step - accuracy: 0.9975 - loss: 0.0125 - val_accuracy: 1.0000 - val_loss: 0.0034
Epoch 7/10

In [8]:
# Score the model on the held-out test split; the result unpacks into loss and accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)

# Report accuracy as a percentage with two decimals.
print(f"Test Accuracy: {accuracy*100:.2f}%")

[1m438/438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.9999 - loss: 0.0026
Test Accuracy: 99.99%


In [9]:
# Spot-check the trained model on a few hand-written expressions.
new_expressions = ['6-6*2', '4+2', '2==3', '2=2', '6===6','2+3-4*1']
new_seq = tokenizer.texts_to_sequences(new_expressions)

# The tokenizer is character-level with no OOV token, so characters it never
# saw during fitting are dropped and the encoded length shrinks. Flag that,
# and flag inputs that padding will truncate, because in both cases the model
# judges a different string than the one printed.
for expr, seq in zip(new_expressions, new_seq):
    if len(seq) != len(expr):
        print(f"warning: {expr!r} has characters unknown to the tokenizer; they were dropped")
    if len(seq) > max_len:
        print(f"warning: {expr!r} is longer than max_len={max_len} and will be truncated")

new_pad = pad_sequences(new_seq, maxlen=max_len, padding='post')
predictions = model.predict(new_pad)

# The model has a single sigmoid output per row; flatten so each `pred` is a
# scalar rather than a one-element array before thresholding at 0.5.
for expr, pred in zip(new_expressions, predictions.ravel()):
    print(f"{expr} -> {'Valid' if pred>=0.5 else 'Invalid'}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 763ms/step
6-6*2 -> Valid
4+2 -> Valid
2==3 -> Invalid
2=2 -> Invalid
6===6 -> Invalid
2+3-4*1 -> Valid


In [10]:
# Persist the trained network to disk.
model.save(filepath='expressions.keras')

In [15]:
import pickle

In [16]:
# Persist the fitted tokenizer so inference code can reproduce the same character ids.
# NOTE: pickle files execute code on load; only unpickle this file from a trusted source.
with open(file='tokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)