In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("Smart_Search_keywords.csv", header=0, names=['search', 'vehicle'])
df.head()

Unnamed: 0,search,vehicle
0,buy school bus,Buses
1,tata star bus ultra,Buses
2,tata ultra t16,Buses
3,Bus,Buses
4,Truck,Trucks


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908 entries, 0 to 907
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   search   908 non-null    object
 1   vehicle  908 non-null    object
dtypes: object(2)
memory usage: 14.3+ KB


In [4]:
df.describe()

Unnamed: 0,search,vehicle
count,908,908
unique,843,4
top,tata magic,Trucks
freq,3,397


In [5]:
# Seperating feature and target
X = df['search']
y = df['vehicle']

In [6]:
le = LabelEncoder()
y = le.fit_transform(y)

In [7]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
tfidf = TfidfVectorizer(stop_words="english")
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf= tfidf.transform(X_test)

In [9]:
model = DecisionTreeClassifier()
model.fit(X_train_tf, y_train)

In [10]:
y_pred = model.predict(X_test_tf)

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94        17
           1       1.00      0.97      0.98        60
           2       0.96      1.00      0.98        85
           3       1.00      1.00      1.00        20

    accuracy                           0.98       182
   macro avg       0.99      0.96      0.97       182
weighted avg       0.98      0.98      0.98       182



In [12]:
model.score(X_train_tf, y_train)

0.9986225895316805

In [13]:
le.inverse_transform(y_pred)

array(['Trucks', 'Vans', 'Trucks', 'Trucks', 'Trucks', 'Trucks', 'Trucks',
       'SCV & Pickups', 'Trucks', 'Trucks', 'Trucks', 'Trucks',
       'SCV & Pickups', 'SCV & Pickups', 'Trucks', 'Vans', 'Buses',
       'Vans', 'SCV & Pickups', 'SCV & Pickups', 'Trucks',
       'SCV & Pickups', 'SCV & Pickups', 'Vans', 'Vans', 'Trucks',
       'Trucks', 'Trucks', 'Trucks', 'Trucks', 'SCV & Pickups', 'Vans',
       'SCV & Pickups', 'SCV & Pickups', 'Trucks', 'Trucks', 'Trucks',
       'Buses', 'Trucks', 'Vans', 'SCV & Pickups', 'Trucks',
       'SCV & Pickups', 'SCV & Pickups', 'Trucks', 'SCV & Pickups',
       'Trucks', 'SCV & Pickups', 'Trucks', 'Trucks', 'Buses', 'Trucks',
       'SCV & Pickups', 'Buses', 'SCV & Pickups', 'SCV & Pickups',
       'SCV & Pickups', 'SCV & Pickups', 'Buses', 'Trucks', 'Trucks',
       'SCV & Pickups', 'Trucks', 'Trucks', 'SCV & Pickups',
       'SCV & Pickups', 'Buses', 'Trucks', 'Vans', 'Trucks', 'Vans',
       'Buses', 'Trucks', 'Buses', 'Trucks', 'Vans', 'T