# Text Classifier

In [1]:
# Import necessary components
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop-word list from NLTK, stored as a set.
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
# NOTE(review): presumably switches cufflinks/plotly to offline (no-account) rendering — confirm.
cufflinks.go_offline()
# Sets the cufflinks configuration options (world-readable plots, 'pearl' theme).
cufflinks.set_config_file(world_readable=True, theme='pearl')

In [2]:
# Load the labelled utterances (columns: statement, type).
# Forward slashes are used on purpose: the previous back-slashed literal relied
# on '\d' being an invalid (deprecated) string escape and only resolved on
# Windows, whereas '/' is accepted as a separator on every platform.
df = pd.read_csv('../datasets/data_new.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210121 entries, 0 to 210120
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   statement  210121 non-null  object
 1   type       210121 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


In [3]:
# Number of rows per label.
df['type'].value_counts()

question     130655
statement     78479
command         932
fact_req         55
Name: type, dtype: int64

In [4]:
# Plain-text dump of the raw frame (pandas elides the middle rows).
print(df)

                                               statement       type
0                and confirm that address for me please     command
1       it's from Birmingham to em London Euston please   statement
2                                    the 8th of October   statement
3                   i'd like to leave on the 7:33 train   statement
4           there's the 7:33 from Birmingham New Street   statement
...                                                  ...        ...
210116                            ey yo can i get a fact   fact_req
210117                            you got a fact for me?   fact_req
210118                                ONE FACT PLEASE!!!   fact_req
210119               i could really use a fact right now   fact_req
210120                 i want a fact from you good madam   fact_req

[210121 rows x 2 columns]


In [5]:
# Text preprocessing
df = df.reset_index(drop=True)
# Raw strings are required here: '\[', '\]' and '\|' are regex escapes, not
# valid Python string escapes, and non-raw literals containing them trigger
# invalid-escape warnings. The compiled patterns are unchanged.
# Symbols that clean_text replaces with a single space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        Normalise one statement for tokenisation.

        text: a string

        return: the lowercased string with every character matched by
                REPLACE_BY_SPACE_RE replaced by a single space. Nothing else
                is altered: other punctuation, digits and stop words are kept.
    """
    return REPLACE_BY_SPACE_RE.sub(' ', text.lower())


# Normalise every statement, then strip all runs of digits.
df['statement'] = df['statement'].apply(clean_text)
# regex=True is passed explicitly: the default for Series.str.replace is
# changing to False, under which '\d+' would be matched literally and no
# digits would be removed. The raw string avoids the invalid '\d' escape.
df['statement'] = df['statement'].str.replace(r'\d+', '', regex=True)



The default value of regex will change from True to False in a future version.



In [6]:
# Plain-text dump of the frame after cleaning, to eyeball the effect of the preprocessing.
print(df)

                                               statement       type
0                and confirm that address for me please     command
1       it's from birmingham to em london euston please   statement
2                                     the th of october   statement
3                      i'd like to leave on the : train   statement
4              there's the : from birmingham new street   statement
...                                                  ...        ...
210116                            ey yo can i get a fact   fact_req
210117                            you got a fact for me?   fact_req
210118                                one fact please!!!   fact_req
210119               i could really use a fact right now   fact_req
210120                 i want a fact from you good madam   fact_req

[210121 rows x 2 columns]


In [7]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Length (in tokens) that every statement is padded / truncated to.
MAX_SEQUENCE_LENGTH = 250
# Size of each word-embedding vector.
EMBEDDING_DIM = 100
# The backslash in the filter set is written as '\\' so the literal contains no
# invalid '\]' escape; the resulting filter characters are exactly the same.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['statement'].values)
# Mapping word -> integer index built from the fitted corpus.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


Found 89276 unique tokens.


In [8]:
# Turn each statement into a list of word indices, then pad/truncate all of
# them to MAX_SEQUENCE_LENGTH so they form one rectangular array.
sequences = tokenizer.texts_to_sequences(df['statement'].values)
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)


Shape of data tensor: (210121, 250)


In [9]:
# One-hot encode the label column: one output column per distinct type.
label_frame = pd.get_dummies(df['type'])
Y = label_frame.to_numpy()
print('Shape of label tensor:', Y.shape)


Shape of label tensor: (210121, 4)


In [10]:
# Hold out 10% of the rows for testing.
# The split is stratified on the label because the classes are extremely
# imbalanced (the rarest has only a few dozen rows): a purely random split can
# leave the test set with almost no examples of a rare class. Stratifying keeps
# each class's share the same in both partitions; split sizes are unchanged.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42, stratify=df['type']
)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(189108, 250) (189108, 4)
(21013, 250) (21013, 4)


In [12]:
# Embedding -> spatial dropout -> LSTM -> softmax classifier.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One softmax unit per one-hot label column, derived from the label tensor
# instead of a hard-coded 4 so the model follows the data if classes change.
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 2
batch_size = 256

# NOTE(review): with epochs=2 this callback can never stop training early,
# because patience=3 needs three consecutive non-improving epochs. Raise
# `epochs` (or lower `patience`) if early stopping is meant to be active.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

# 10% of the training rows are held back by Keras for validation.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/2
 74/665 [==>...........................] - ETA: 29:18 - loss: 0.7627 - accuracy: 0.6224

KeyboardInterrupt: 

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Score the held-out split; evaluate() yields the loss first, then the
# accuracy metric the model was compiled with.
accr = model.evaluate(X_test, Y_test)
test_loss = accr[0]
test_accuracy = accr[1]
report_template = 'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'
print(report_template.format(test_loss, test_accuracy))