In [1]:
import pandas as pd

# Load the labelled queries. Later cells read the 'query' column (free text)
# and the 'category' column (class label), so the CSV must provide both.
data = pd.read_csv('queries.csv')


In [2]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Download NLTK data: the 'punkt' tokenizer models and the 'stopwords' corpus
# that preprocess_text() below relies on. Packages that are already present
# are reported as up-to-date instead of being fetched again.
# NOTE(review): recent NLTK releases look up tokenizer data under 'punkt_tab'
# rather than 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

# Preprocess function
def preprocess_text(text):
    """Normalize a raw query into a space-separated string of content words.

    The text is lower-cased, split with ``nltk.word_tokenize``, stripped of
    every token that is not purely alphabetic (numbers, punctuation, mixed
    tokens such as ``"2nd"``) and filtered against NLTK's English stop-word
    list.

    Args:
        text: The raw query. Anything that is not a ``str`` (for example the
            ``NaN`` pandas produces for an empty CSV cell) is treated as an
            empty query instead of raising ``AttributeError``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when no
        token survives.
    """
    if not isinstance(text, str):
        return ''

    # The stop-word list never changes between calls, so read it from the
    # corpus once and cache it on the function object; rebuilding the set for
    # every row of a DataFrame.apply() is wasted work.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    return ' '.join(
        word
        for word in nltk.word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    )

# Apply preprocessing
# NOTE: this overwrites the raw 'query' column in place; re-read the CSV to
# get the original, unprocessed text back.
data['query'] = data['query'].apply(preprocess_text)

# Vectorize text data
# NOTE(review): the vectorizer is fitted on every row, and the train/test
# split only happens afterwards. Vocabulary and IDF weights are therefore
# learned from rows that later end up in the test set, which can make the
# held-out scores look better than they are.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['query'])
y = data['category']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/closerlook/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/closerlook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Choose a model: logistic regression with scikit-learn's default settings.
model = LogisticRegression()
# Alternative classifier (SVC is imported above); uncomment to swap it in:
# model = SVC()

# Train the model on the training split only.
model.fit(X_train, y_train)


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted classifier on the held-out split.
y_pred = model.predict(X_test)

# Print the overall accuracy, then the per-class precision / recall / F1 table.
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    'Classification Report:',
    classification_report(y_true=y_test, y_pred=y_pred),
    sep='\n',
)


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

  ticket_buy       1.00      1.00      1.00        46
   trip_plan       1.00      1.00      1.00        54

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [6]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: four regularisation strengths x three solvers.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    solver=['newton-cg', 'lbfgs', 'liblinear'],
)

# Exhaustive 5-fold cross-validated search, run on the training split only.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)


Best Parameters: {'C': 0.1, 'solver': 'newton-cg'}
Best Score: 1.0


In [7]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Map the string categories to contiguous integer ids. The classification head
# is trained against integer class indices, and torch.tensor() cannot be built
# from strings such as 'ticket_buy'.
label_names = sorted(data['category'].unique())
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for name, idx in label2id.items()}

# Load pre-trained BERT model and tokenizer.
# The network is bound to `bert_model` (not `model`) so it does not silently
# replace the scikit-learn classifier that other cells refer to as `model`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Split texts and labels TOGETHER so that encoding i always belongs to label i.
# Tokenizing the whole column for both splits while taking the labels from a
# shuffled 80/20 split pairs texts with the wrong labels (and evaluates on
# training rows).
# NOTE(review): data['query'] has already been lower-cased and stripped of stop
# words by the preprocessing cell; BERT is normally fed the raw sentence, so
# consider reloading the unprocessed text here.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['query'].tolist(),
    data['category'].map(label2id).tolist(),
    test_size=0.2,
    random_state=42,
)

# Tokenize data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


# Create torch dataset
class QueryDataset(torch.utils.data.Dataset):
    """Pairs tokenizer output with integer class labels for the HF Trainer.

    Args:
        encodings: Mapping returned by the tokenizer (``input_ids``,
            ``attention_mask``, ...), each value holding one row per example.
        labels: Sequence of integer class ids, one per example and in the same
            order as ``encodings``.

    Raises:
        ValueError: If the number of encoded examples and labels differ.
    """

    def __init__(self, encodings, labels):
        if len(encodings['input_ids']) != len(labels):
            raise ValueError(
                f"encodings hold {len(encodings['input_ids'])} examples "
                f"but {len(labels)} labels were given"
            )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One tensor per tokenizer field plus the target under the 'labels' key.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = QueryDataset(train_encodings, train_labels)
test_dataset = QueryDataset(test_encodings, test_labels)

# Training arguments
num_train_epochs = 3
train_batch_size = 16

# Warm up over roughly the first 10% of the optimizer steps. A fixed 500-step
# warm-up is longer than the whole run on a dataset this small, so the learning
# rate would never reach its configured value.
# (Step count assumes a single device - TODO confirm for multi-GPU runs.)
steps_per_epoch = -(-len(train_dataset) // train_batch_size)  # ceil division
warmup_steps = (steps_per_epoch * num_train_epochs) // 10

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=64,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train and evaluate
trainer.train()
trainer.evaluate()


2024-07-28 13:19:38.809790: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


: 

In [12]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


# Reload the labelled queries from disk so this cell starts from the raw text;
# the 'query' and 'category' columns are used below.
data = pd.read_csv('queries.csv')


# NLTK resources needed by preprocess_text(): tokenizer models and stop words.
nltk.download('punkt')
nltk.download('stopwords')


def preprocess_text(text):
    """Normalize a raw query into a space-separated string of content words.

    The text is lower-cased, split with ``nltk.word_tokenize``, stripped of
    every token that is not purely alphabetic (numbers, punctuation, mixed
    tokens such as ``"2nd"``) and filtered against NLTK's English stop-word
    list.

    Args:
        text: The raw query. Anything that is not a ``str`` (for example the
            ``NaN`` pandas produces for an empty CSV cell) is treated as an
            empty query instead of raising ``AttributeError``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when no
        token survives.
    """
    if not isinstance(text, str):
        return ''

    # The stop-word list never changes between calls, so read it from the
    # corpus once and cache it on the function object; rebuilding the set for
    # every row of a DataFrame.apply() is wasted work.
    stop_words = getattr(preprocess_text, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        preprocess_text._stop_words = stop_words

    return ' '.join(
        word
        for word in nltk.word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    )


# Clean every query (lower-case, alphabetic tokens only, stop words removed).
data['query'] = data['query'].apply(preprocess_text)
y = data['category']

# Split the *text* before any feature extraction so the held-out rows stay
# unseen: fitting TF-IDF on the full column first would let test-set vocabulary
# and IDF statistics leak into training. Same test_size / random_state as
# before, so the rows that land in each split are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    data['query'], y, test_size=0.2, random_state=42
)

# Vectorize text data: learn vocabulary and IDF weights from the training rows
# only, then project the test rows into that same feature space.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full feature matrix under the train-fitted vocabulary, kept so code that
# still expects `X` keeps working. Do not use it for model selection.
X = vectorizer.transform(data['query'])

# Logistic regression with default settings; SVC() (imported above) can be
# swapped in here as an alternative classifier.
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))


def predict_category(query):
    """Classify one raw query string with the fitted vectorizer and model.

    The query goes through the same ``preprocess_text`` cleaning as the
    training data, is projected into feature space by the module-level
    ``vectorizer`` and is then scored by the module-level ``model``.

    Args:
        query: The raw query text.

    Returns:
        The first (and only) element of ``model.predict`` for this query.
    """
    cleaned = preprocess_text(query)
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

import joblib

# Save the model and vectorizer
# Both artifacts are needed at inference time: a query has to be transformed by
# this same fitted vectorizer before the model can score it (see
# predict_category). joblib files are pickle-based, so only load them back from
# a trusted location.
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


# Smoke test: classify one in-domain query end to end.
new_query = "I want to buy a plane ticket"
predicted_category = predict_category(new_query)
print(f'The predicted category for the query "{new_query}" is "{predicted_category}".')


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

  ticket_buy       1.00      1.00      1.00        46
   trip_plan       1.00      1.00      1.00        54

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

The predicted category for the query "I want to buy a plane ticket" is "ticket_buy".


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/closerlook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/closerlook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Example usage
# Out-of-domain probe: the classifier only knows the categories it was trained
# on, so it has to answer with one of them even for an unrelated request (the
# recorded run labels this one "ticket_buy").
# NOTE(review): relies on predict_category, model and vectorizer from the
# training cell above being defined in the kernel.
new_query = "i want to buy vegetables"
predicted_category = predict_category(new_query)
print(f'The predicted category for the query "{new_query}" is "{predicted_category}".')

The predicted category for the query "i want to buy vegetables" is "ticket_buy".
