In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.metrics import balanced_accuracy_score

# **Intent Classification**

Intent classification (sometimes called intent recognition) is a technique used in natural language processing (NLP). Intent classification analyzes the intentions behind texts and categorizes them.

Some of the common techniques for intent classification are regex and machine learning algorithms.

This project will use labelled data from Airline Travel Information System with customer queries and intents as labels. We will use a supervised learning method Support Vector Machines (SVMs).

![](https://miro.medium.com/max/1400/1*CadlfKHwHualA9od3SM-ew.png)

# **EDA**

In [2]:
# load data and print first 10 rows
def load_data(path):
    df = pd.read_csv(path)
    df.columns = ['label', 'query']
    return df

df = load_data(os.path.join('..','data','external','atis-intent','atis_intents_test.csv'))
df.to_csv(os.path.join('..','data','external','atis-intent','atis_intents_test_.csv'))

In [3]:
# print shape of our data
print("There are {} rows and {} columns".format(df.shape[0], df.shape[1]))

There are 799 rows and 2 columns


In [4]:
# explore unique labels
print(df.label.unique())

['atis_airfare' 'atis_flight' 'atis_ground_service' 'atis_airline'
 'atis_flight_time' 'atis_quantity' 'atis_abbreviation' 'atis_aircraft']


In [5]:
# explore which labels are the most and least common
df.label.value_counts()

atis_flight            631
atis_airfare            48
atis_airline            38
atis_ground_service     36
atis_abbreviation       33
atis_aircraft            9
atis_quantity            3
atis_flight_time         1
Name: label, dtype: int64

In [6]:
# drop rows with multiple labels
df = df[df["label"].str.contains("#")==False]
df.label.value_counts()

atis_flight            631
atis_airfare            48
atis_airline            38
atis_ground_service     36
atis_abbreviation       33
atis_aircraft            9
atis_quantity            3
atis_flight_time         1
Name: label, dtype: int64

# **Preprocessing**

In [9]:
!pip install spacy
!pip install markupsafe==2.0.1 --user
!python -m spacy download en_core_web_lg

Collecting spacy
  Downloading spacy-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.7-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.7/128.7 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4
  Downloading pydantic-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting langcodes<4.0.0,>=3.2.0
  Using cached langcodes-3.3.0-py3-none-any.whl (181 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.6-cp39-cp39-manylinux_2_17_x86

Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [11]:
# import libraries and load spacy English model
import spacy
import numpy as np

nlp = spacy.load("en_core_web_lg")
print("Number of vectors: {}".format(nlp.vocab.vectors_length))

Number of vectors: 300


In [12]:
# load training and test datasets
training_df = load_data(os.path.join('..','data','external','atis-intent','atis_intents_train.csv'))
training_df = training_df[training_df["label"].str.contains("#")==False]

sen_train = training_df['query'].tolist()
labels_train = training_df['label'].tolist()

test_df = load_data(os.path.join('..','data','external','atis-intent','atis_intents_test.csv'))
test_df = test_df[test_df["label"].str.contains("#")==False]
sen_test = test_df['query'].tolist()
labels_test = test_df['label'].tolist()

In [13]:
train_size = len(labels_train)
test_size = len(labels_test)


print('Train data has {} rows and test data has {} rows'.format(train_size, test_size))

Train data has 4833 rows and test data has 799 rows


In [14]:
def encode_sentences(sentences):
    # Calculate number of sentences
    n_sentences = len(sentences)
    X = np.zeros((n_sentences, 300))

    # Iterate over the sentences
    for idx, sentence in enumerate(sentences):
        # Pass each sentence to the nlp object to create a document
        doc = nlp(sentence)
        # Save the document's .vector attribute to the corresponding row in     
        # X
        X[idx, :] = doc.vector
    return X

train_X = encode_sentences(sen_train)
test_X = encode_sentences(sen_test)

## Visulize the encodings

In [15]:
!pip install -U scikit-learn



In [17]:
from sklearn.manifold import TSNE
import re


X = train_X
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)



In [18]:
df = pd.DataFrame(X_tsne, index=sen_train, columns=['x', 'y'])
df["sentence"]=sen_train
df["intent"]= labels_train

In [20]:
!pip install altair

Collecting altair
  Downloading altair-4.2.0-py3-none-any.whl (812 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.8/812.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting toolz
  Downloading toolz-0.12.0-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: toolz, altair
Successfully installed altair-4.2.0 toolz-0.12.0


In [21]:
import altair as alt
alt.Chart(df).mark_circle(size=60).encode(
    x='x',
    y='y',
    color='intent',
    tooltip=['sentence']
).interactive()

  for col_name, dtype in df.dtypes.iteritems():


# **Label Encoding**

In [22]:
# encode labels i.e turn text labels into integers
from sklearn.preprocessing import LabelEncoder

# instantiate label encoder object
le = LabelEncoder()

labels_test = le.fit_transform(labels_test)
labels_train = le.fit_transform(labels_train)

In [25]:
!pip install lazypredict


Collecting lazypredict
  Downloading lazypredict-0.2.9-py2.py3-none-any.whl (12 kB)
Collecting pytest==5.4.3
  Downloading pytest-5.4.3-py3-none-any.whl (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.1/248.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xgboost==1.1.1
  Downloading xgboost-1.1.1-py3-none-manylinux2010_x86_64.whl (127.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.6/127.6 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting six==1.15.0
  Downloading six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting pandas==1.0.5
  Downloading pandas-1.0.5.tar.gz (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install build dependencies[0m did 

[?25h

In [26]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)


NameError: name 'LazyClassifier' is not defined

# **Intent classification with SVM**

In [None]:
# Import SVC
from sklearn.svm import SVC
# Train the model
clfs = SVC(C=1)
clfs.fit(train_X, labels_train)

In [None]:
print('Validation on the train set results:')
balanced_accuracy_score(labels_train,clf.predict(train_X))

In [None]:
# Validate model on test set
print('Validation on the test set results:')
balanced_accuracy_score(labels_test,clf.predict(test_X))

Model accuracy on test set is 1% higher than on train set most likely due to the small size of test set. 