In [4]:
!pip install faker

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [5]:
# pip install faker pandas

import pandas as pd
import random
from faker import Faker

# Initialize Faker and set seed for reproducibility
fake = Faker()
random.seed(42)
Faker.seed(42)

def generate_query_dataset(samples_per_category=200, output_file='query_dataset.csv'):
    """
    Generates a synthetic dataset of queries with categories:
    0 - Code, 1 - Reason, 2 - Language
    """
    categories = {
        0: "code",
        1: "reason",
        2: "language"
    }

    # Patterns and vocabulary components
    components = {
        "code": {
            "patterns": [
                "How to implement {algorithm} in {language}?",
                "Fix {error} in {language} code",
                "Best practices for {framework} development",
                "Tutorial on {tech} authentication",
                "Debugging {language} memory leaks"
            ],
            "variables": {
                "algorithm": ["binary search", "quicksort", "DFS", "A* algorithm"],
                "language": ["Python", "Java", "C++", "JavaScript"],
                "error": ["seg fault", "null pointer", "syntax error"],
                "framework": ["Django", "React", "Spring Boot"],
                "tech": ["OAuth2", "JWT", "SSL"]
            }
        },
        "reason": {
            "patterns": [
                "Why does {phenomenon} occur?",
                "Explain {concept}",
                "What causes {event}?",
                "How does {process} work?",
                "Scientific explanation of {topic}"
            ],
            "variables": {
                "phenomenon": ["gravity", "tides", "aurora borealis"],
                "concept": ["quantum entanglement", "relativity", "evolution"],
                "event": ["earthquakes", "volcanic eruptions", "tornadoes"],
                "process": ["photosynthesis", "nuclear fission", "digestion"],
                "topic": ["climate change", "black holes", "sleep patterns"]
            }
        },
        "language": {
            "patterns": [
                "Translate '{phrase}' to {lang}",
                "How to say {phrase} in {lang}?",
                "{lang} grammar rules for {grammar}",
                "Pronunciation of {word} in {lang}",
                "Difference between {lang1} and {lang2}"
            ],
            "variables": {
                "phrase": ["Hello", "Thank you", "Where is the bathroom?"],
                "lang": ["Spanish", "French", "German", "Mandarin"],
                "grammar": ["past tense", "subjunctive mood", "articles"],
                "word": ["hello", "goodbye", "please"],
                "lang1": ["Spanish", "French"],
                "lang2": ["Italian", "Portuguese"]
            }
        }
    }

    dataset = []
    for category_id, category_name in categories.items():
        cat_data = components[category_name]
        for _ in range(samples_per_category):
            pattern = random.choice(cat_data["patterns"])
            replacements = {}
            for key in cat_data["variables"]:
                if f"{{{key}}}" in pattern:
                    replacements[key] = random.choice(cat_data["variables"][key])
            query = pattern.format(**replacements)
            dataset.append({"query": query, "category": category_id})

    df = pd.DataFrame(dataset)
    df.to_csv(output_file, index=False)
    print(f"Dataset generated with {len(df)} samples. Saved to {output_file}")


generate_query_dataset(samples_per_category=500, output_file='query_dataset.csv')

Dataset generated with 1500 samples. Saved to query_dataset.csv


In [6]:
# Install required packages
# pip install tensorflow sentence-transformers pandas scikit-learn

import tensorflow as tf
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Check GPU availability
print("GPU Available:", tf.config.list_physical_devices('GPU'))

# Load dataset
df = pd.read_csv('query_dataset.csv')
texts = df['query'].tolist()
labels = df['category'].values

# Generate embeddings using GPU
model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda')
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

# Create ANN model
def create_model(input_dim):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation='relu', input_shape=(input_dim,)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(3, activation='softmax')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Initialize and train model with GPU
with tf.device('/GPU:0') if tf.config.list_physical_devices('GPU') else tf.device('/CPU:0'):
    ann = create_model(embeddings.shape[1])

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True
    )

    history = ann.fit(
        X_train, y_train,
        epochs=50,
        batch_size=256,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=1
    )

# Evaluate model
test_loss, test_acc = ann.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_acc:.4f}")

# Generate predictions
y_pred = np.argmax(ann.predict(X_test), axis=1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the trained model
ann.save('query_classifier_model.keras')
print("Model saved as 'query_classifier_model.keras'")

# Example prediction function
def predict_query(text):
    embedding = model.encode([text], device='cuda')
    prediction = ann.predict(embedding)
    categories = {0: 'code', 1: 'reason', 2: 'language'}
    return categories[np.argmax(prediction)]



GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 743ms/step - accuracy: 0.5050 - loss: 1.0762 - val_accuracy: 0.9625 - val_loss: 0.9785
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9562 - loss: 0.9541 - val_accuracy: 0.9958 - val_loss: 0.7933
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9965 - loss: 0.7611 - val_accuracy: 0.9958 - val_loss: 0.5451
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9963 - loss: 0.5147 - val_accuracy: 1.0000 - val_loss: 0.2991
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9956 - loss: 0.2734 - val_accuracy: 1.0000 - val_loss: 0.1270
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 1.0000 - loss: 0.1111 - val_accuracy: 1.0000 - val_loss: 0.0395
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [10]:
# Test prediction
test_query = "what is your name "
print(f"\nPrediction for '{test_query}': {predict_query(test_query)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step

Prediction for 'what is your name ': language
