In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:


nltk.download('stopwords')
nltk.download('wordnet')

X = ["SELECT * FROM customers WHERE country = 'USA'",
     "spark.sql(\"SELECT COUNT(*) FROM orders WHERE order_date >= '2023-01-01'\")"]
y = ["sql", "pyspark"]

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word.lower() not in stop_words]

    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    processed_text = ' '.join(words)

    processed_text = re.sub(r'([.,!?()])', r' \1 ', processed_text)

    return processed_text

X = [preprocess_text(text) for text in X]

vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(X).toarray()

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_categorical, test_size=0.2, random_state=42)

def build_model(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    history = model.fit(X_train, y_train, epochs=1000, batch_size=64, validation_data=(X_test, y_test), verbose=2)
    accuracy = model.evaluate(X_test, y_test)[1]
    print(f"Acurácia no conjunto de teste: {accuracy}")

    new_data = [
        "SELECT * FROM customers WHERE country = 'USA'",
        "spark.sql(\"SELECT COUNT(*) FROM orders WHERE order_date >= '2023-01-01'\")",
        "df.select('*').collect()"
    ]

    new_data = [preprocess_text(text) for text in new_data]
    new_data_tfidf = vectorizer.transform(new_data).toarray()
    predictions = model.predict(new_data_tfidf)

    for i, prediction in enumerate(predictions):
        label = label_encoder.inverse_transform([np.argmax(prediction)])[0]
        print(f"Código  {i + 1}: Tipo previsto - {label}")
        print(f"Consulta identificada: {new_data[i]}")

model = build_model((X_train.shape[1],))
train_and_evaluate_model(model, X_train, X_test, y_train, y_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/1000
1/1 - 1s - loss: 0.6823 - accuracy: 0.6875 - val_loss: 0.6732 - val_accuracy: 0.6000 - 665ms/epoch - 665ms/step
Epoch 2/1000
1/1 - 0s - loss: 0.6651 - accuracy: 0.7500 - val_loss: 0.6802 - val_accuracy: 0.6000 - 35ms/epoch - 35ms/step
Epoch 3/1000
1/1 - 0s - loss: 0.6487 - accuracy: 0.8125 - val_loss: 0.6874 - val_accuracy: 0.6000 - 30ms/epoch - 30ms/step
Epoch 4/1000
1/1 - 0s - loss: 0.6330 - accuracy: 0.9375 - val_loss: 0.6948 - val_accuracy: 0.6000 - 29ms/epoch - 29ms/step
Epoch 5/1000
1/1 - 0s - loss: 0.6179 - accuracy: 1.0000 - val_loss: 0.7026 - val_accuracy: 0.6000 - 17ms/epoch - 17ms/step
Epoch 6/1000
1/1 - 0s - loss: 0.6032 - accuracy: 1.0000 - val_loss: 0.7112 - val_accuracy: 0.6000 - 24ms/epoch - 24ms/step
Epoch 7/1000
1/1 - 0s - loss: 0.5889 - accuracy: 1.0000 - val_loss: 0.7202 - val_accuracy: 0.6000 - 35ms/epoch - 35ms/step
Epoch 8/1000
1/1 - 0s - loss: 0.5746 - accuracy: 1.0000 - val_loss: 0.7288 - val_accuracy: 0.6000 - 31ms/epoch - 31ms/step
Epoch 9/1000
1