In [46]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [47]:
df = pd.read_csv("vehicle_queries.csv")
df.head()

Unnamed: 0,query,generalized
0,Recommend a vehicle for long-distance travel,yes
1,Can I get a discount on this bus?,no
2,Show me some more details for this application,no
3,I want a bus for school transportation,yes
4,I want to buy a bus for transporting my staff,yes


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   query        80000 non-null  object
 1   generalized  80000 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [49]:
df.describe()

Unnamed: 0,query,generalized
count,80000,80000
unique,20,2
top,I want to buy a truck,yes
freq,4103,40095


In [50]:
# Seperating feature and target
X = df['query']
y = df['generalized']

In [51]:
le = LabelEncoder()
y = le.fit_transform(y)

In [52]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
tfidf = TfidfVectorizer(stop_words="english")
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf= tfidf.transform(X_test)

In [54]:
model = DecisionTreeClassifier()
model.fit(X_train_tf, y_train)

In [55]:
y_pred = model.predict(X_test_tf)

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7883
           1       1.00      1.00      1.00      8117

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000



In [57]:
model.score(X_train_tf, y_train)

1.0

In [58]:
model.score(X_test_tf, y_test)

1.0

In [59]:
# Query
query = 'show me buses'

# Transform data
tf_query= tfidf.transform([query.lower()])
pred = model.predict(tf_query)

# Display
response_data = {"Query Type":list(le.inverse_transform(pred))}
print(response_data)

{'Query Type': ['yes']}


In [60]:
import pickle
with open('model.h5', 'wb') as file:
    pickle.dump(model, file)
