<a href="https://colab.research.google.com/github/ciepielajan/Multi-Class-Classification-NLP/blob/main/Detecting_intentions_CNN_KerasClassifier_i_RandomizedSearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Pobranie danych

In [1]:
# https://drive.google.com/file/d/1fI6EXyD9TMTC1jzdu206ljXOGNjdHprq/view?usp=sharing
!gdown --id "1fI6EXyD9TMTC1jzdu206ljXOGNjdHprq"

Downloading...
From: https://drive.google.com/uc?id=1fI6EXyD9TMTC1jzdu206ljXOGNjdHprq
To: /content/user_intent.zip
  0% 0.00/271k [00:00<?, ?B/s]100% 271k/271k [00:00<00:00, 29.2MB/s]


In [2]:
!unzip "user_intent.zip"

Archive:  user_intent.zip
  inflating: train.csv               
  inflating: __MACOSX/._train.csv    
  inflating: validation.csv          


In [3]:
import pandas as pd
import numpy as np
from datetime import datetime

data_set_train = pd.read_csv("train.csv")
data_set_valid = pd.read_csv("validation.csv")

print(data_set_train.shape)
print(data_set_valid.shape)

print(data_set_train.columns)
print(data_set_valid.columns)

(13784, 2)
(700, 2)
Index(['text', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


#### Podstawowy process text

In [4]:
import re
def process_text(sentence):
    sentence = re.sub('[A-Za-z0-9]+@[a-zA-z].[a-zA-Z]+', '', sentence)  # maile
    sentence = re.sub('(http[s]*:[/][/])[a-zA-Z0-9]+', '', sentence)  # linki
    sentence = re.sub(r"<[^>]+>", " ", sentence) # remove html tag
    sentence = re.sub(r"[^a-zA-Z\s]", "", sentence)  # remove punctuations and numbers   
    sentence = re.sub(r"\b[A-Za-z]{1}\b", "", sentence)  # remove single characters

    sentence = re.sub("^\s+|\s+$", "", sentence, flags=re.UNICODE) # Remove spaces both in the BEGINNING and in the END of a string:
    sentence = " ".join(re.split("\s+", sentence, flags=re.UNICODE))  # Remove ONLY DUPLICATE spaces:
    sentence = sentence.lower()
    
    return sentence

In [5]:
data_set_train["clean_text"] = data_set_train["text"].apply(lambda x: process_text(x))

#### `LabelEncoder` oraz `to_categorical`

In [6]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical 


labelencoder = LabelEncoder()
data_set_train["labelencoder"] = labelencoder.fit_transform(data_set_train["label"])

dummy_y = to_categorical(data_set_train["labelencoder"], dtype ="float32")

data_set_train[["clean_text","label","labelencoder"]].head(5)

Unnamed: 0,clean_text,label,labelencoder
0,find cinema nearest for films,SearchScreeningEvent,6
1,give the current series two stars,RateBook,4
2,find the good girl at movie house,SearchScreeningEvent,6
3,please make reservations for three at kosher t...,BookRestaurant,1
4,what is the forecast for here one second from now,GetWeather,2


In [7]:
dummy_y.shape

(13784, 7)

In [8]:
id_intention = 6
print("Sprawdzenie poprawności LabelEncoder i to_categorical \n")
print("Label - ", data_set_train["label"].iloc[id_intention])
print("LabelEncoder - ", data_set_train["labelencoder"].iloc[id_intention])
print()
print("to_categorical - ", dummy_y[id_intention])
print()
print("return to LabelEncoder - ",np.argmax(dummy_y[id_intention], axis=-1))
print("return to Label - ",labelencoder.inverse_transform([np.argmax(dummy_y[id_intention], axis=-1)]))

Sprawdzenie poprawności LabelEncoder i to_categorical 

Label -  BookRestaurant
LabelEncoder -  1

to_categorical -  [0. 1. 0. 0. 0. 0. 0.]

return to LabelEncoder -  1
return to Label -  ['BookRestaurant']


#### Zdefiniowanie X i y

In [9]:
X = data_set_train["clean_text"]
y = dummy_y

In [10]:
X.shape, y.shape

((13784,), (13784, 7))

#### Podział zbioru 

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((11027,), (2757,), (11027, 7), (2757, 7))

#### `Tokenizer` i `pad_sequences`

In [13]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [14]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

maxlen = 7
vocab_size = len(tokenizer.word_index) + 1

X_train = pad_sequences(X_train, padding="post", truncating="post", maxlen=maxlen)
X_test = pad_sequences(X_test, padding="post", truncating="post", maxlen=maxlen)


#### `KerasClassifier` i `RandomizedSearchCV`

In [15]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [16]:
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPooling1D, Dropout, Conv1D

In [17]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(GlobalMaxPooling1D())
    # model.add(Dense(10, activation='relu'))
    model.add(Dense(7, activation="softmax")) # sigmoid / softmax
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [18]:
from keras.wrappers.scikit_learn import KerasClassifier  
from sklearn.model_selection import RandomizedSearchCV

# Main settings
epochs = 20
embedding_dim = 100
maxlen = 7   #100   #7
output_file = 'output.txt'  # data/

# Run grid search for each source (yelp, amazon, imdb)
# for source, frame in df.groupby('source'):
# print('Running grid search for data set :', source)
sentences = data_set_train["clean_text"].values  
y = dummy_y
# sentences = df['sentence'].values
# y = df['label'].values

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, batch_size=64,
                        verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=10, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100, score=0.948, total=  26.4s
[CV] vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.4s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100, score=0.958, total=  26.6s
[CV] vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   53.0s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100, score=0.961, total=  26.1s
[CV] vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.3min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=7, kernel_size=3, embedding_dim=100, score=0.959, total=  25.7s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100, score=0.950, total=  29.8s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100, score=0.959, total=  29.9s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100, score=0.967, total=  29.4s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, embedding_dim=100, score=0.959, total=  29.8s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  3.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100, score=0.950, total=  26.0s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  4.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100, score=0.961, total=  26.3s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100, score=0.963, total=  26.2s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, embedding_dim=100, score=0.962, total=  26.5s
[CV] vocab_size=9462, num_filters=32, maxlen=7, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=7, kernel_size=5, embedding_dim=100, score=0.945, total=  25.6s
[CV] vocab_size=9462, num_filters=32, maxlen=7, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=7, kernel_size=5, embedding_dim=100, score=0.961, total=  25.8s
[CV] vocab_size=9462, num_filters=32, maxlen=7, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, m

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  8.8min finished


In [19]:
grid_result.best_score_

0.959083691239357

In [20]:
grid_result.best_params_

{'embedding_dim': 100,
 'kernel_size': 7,
 'maxlen': 7,
 'num_filters': 64,
 'vocab_size': 9462}

In [21]:
# {'embedding_dim': 100,
#  'kernel_size': 7,
#  'maxlen': 7,
#  'num_filters': 64,
#  'vocab_size': 9462}
# 0.9552150815725327

In [22]:
# {'embedding_dim': 100,
#  'kernel_size': 3,
#  'maxlen': 15,
#  'num_filters': 64,
#  'vocab_size': 9462}
# 0.9813313335180283

In [23]:
from keras.wrappers.scikit_learn import KerasClassifier  
from sklearn.model_selection import RandomizedSearchCV

# Main settings
epochs = 20
embedding_dim = 100
maxlen = 15   #100   #7
output_file = 'output.txt'  # data/

# Run grid search for each source (yelp, amazon, imdb)
# for source, frame in df.groupby('source'):
# print('Running grid search for data set :', source)
sentences = data_set_train["clean_text"].values  
y = dummy_y
# sentences = df['sentence'].values
# y = df['label'].values

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, batch_size=64,
                        verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=10, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100, score=0.981, total=  37.9s
[CV] vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.9s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100, score=0.986, total=  37.8s
[CV] vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100, score=0.983, total=  38.0s
[CV] vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=15, kernel_size=3, embedding_dim=100, score=0.983, total=  38.1s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.5min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100, score=0.980, total=  31.6s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.1min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100, score=0.986, total=  31.2s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.6min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100, score=0.981, total=  31.0s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.1min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=5, embedding_dim=100, score=0.985, total=  31.6s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.6min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100, score=0.978, total=  32.7s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100, score=0.985, total=  32.8s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100, score=0.982, total=  32.4s
[CV] vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=15, kernel_size=7, embedding_dim=100, score=0.986, total=  32.1s
[CV] vocab_size=9462, num_filters=128, maxlen=15, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=128, maxlen=15, kernel_size=5, embedding_dim=100, score=0.978, total=  41.6s
[CV] vocab_size=9462, num_filters=128, maxlen=15, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=128, maxlen=15, kernel_size=5, embedding_dim=100, score=0.986, total=  41.4s
[CV] vocab_size=9462, num_filters=128, maxlen=15, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, nu

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 11.8min finished


In [24]:
grid_result.best_score_

0.9841365814208984

In [25]:
grid_result.best_params_

{'embedding_dim': 100,
 'kernel_size': 5,
 'maxlen': 15,
 'num_filters': 128,
 'vocab_size': 9462}

In [26]:
from keras.wrappers.scikit_learn import KerasClassifier  
from sklearn.model_selection import RandomizedSearchCV

# Main settings
epochs = 20
embedding_dim = 100
maxlen = 25   #100   #7
output_file = 'output.txt'  # data/

# Run grid search for each source (yelp, amazon, imdb)
# for source, frame in df.groupby('source'):
# print('Running grid search for data set :', source)
sentences = data_set_train["clean_text"].values  
y = dummy_y
# sentences = df['sentence'].values
# y = df['label'].values

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, batch_size=64,
                        verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=10, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100, score=0.978, total=  44.3s
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.3s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100, score=0.986, total=  43.9s
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100, score=0.983, total=  43.7s
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=5, embedding_dim=100, score=0.986, total=  44.1s
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100, score=0.981, total= 1.1min
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.0min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100, score=0.987, total= 1.1min
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  5.0min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100, score=0.985, total= 1.0min
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  6.1min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, embedding_dim=100, score=0.987, total= 1.0min
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  7.1min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100, score=0.981, total=  40.6s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.8min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100, score=0.985, total=  40.5s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100, score=0.983, total=  40.5s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, embedding_dim=100, score=0.984, total=  40.4s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=5, embedding_dim=100, score=0.979, total=  39.0s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=5, embedding_dim=100, score=0.985, total=  38.9s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_fil

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 15.6min finished


In [27]:
grid_result.best_score_

0.9849102348089218

In [28]:
grid_result.best_params_

{'embedding_dim': 100,
 'kernel_size': 7,
 'maxlen': 25,
 'num_filters': 128,
 'vocab_size': 9462}

In [29]:
from keras.wrappers.scikit_learn import KerasClassifier  
from sklearn.model_selection import RandomizedSearchCV

# Main settings
epochs = 20
embedding_dim = 100
maxlen = 35   #100   #7
output_file = 'output.txt'  # data/

# Run grid search for each source (yelp, amazon, imdb)
# for source, frame in df.groupby('source'):
# print('Running grid search for data set :', source)
sentences = data_set_train["clean_text"].values  
y = dummy_y
# sentences = df['sentence'].values
# y = df['label'].values

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[embedding_dim],
                  maxlen=[maxlen])
model = KerasClassifier(build_fn=create_model,
                        epochs=epochs, batch_size=64,
                        verbose=False)
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=10, n_iter=5)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)


Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100, score=0.980, total=  48.0s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   48.0s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100, score=0.986, total=  48.2s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100, score=0.985, total=  48.1s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.4min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=7, embedding_dim=100, score=0.985, total=  48.0s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100, score=0.983, total=  38.7s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.9min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100, score=0.986, total=  38.4s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.5min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100, score=0.986, total=  38.3s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  5.1min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, embedding_dim=100, score=0.984, total=  38.3s
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.8min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100, score=0.980, total=  55.1s
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  6.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100, score=0.987, total=  55.0s
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100, score=0.984, total=  54.9s
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=3, embedding_dim=100, score=0.985, total=  54.9s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, embedding_dim=100, score=0.979, total=  43.6s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, embedding_dim=100, score=0.986, total=  43.8s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, embedding_dim=100 
[CV]  vocab_size=9462, nu

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 16.9min finished


In [30]:
grid_result.best_score_

0.984813392162323

In [31]:
grid_result.best_params_

{'embedding_dim': 100,
 'kernel_size': 3,
 'maxlen': 35,
 'num_filters': 32,
 'vocab_size': 9462}

#### Predykcja na zbiorze validacyjnym

In [32]:
# oczysczenie danych
data_set_valid["clean_text"] = data_set_valid["text"].apply(lambda x: process_text(x))

# labelencoder 
data_set_valid["labelencoder"] = labelencoder.fit_transform(data_set_valid["label"])

# tokenizacja weg przetrenowanego już tokenizera
X_validate = tokenizer.texts_to_sequences(data_set_valid["clean_text"])

# pad sequel
X_validate = pad_sequences(X_validate, padding="post", truncating="post", maxlen=maxlen)
X_validate

array([[ 16,  42,  53, ...,   0,   0,   0],
       [250, 176,  21, ...,   0,   0,   0],
       [ 16,   9,   1, ...,   0,   0,   0],
       ...,
       [ 27,   1,  84, ...,   0,   0,   0],
       [ 16,   9,   1, ...,   0,   0,   0],
       [ 44,  40,   4, ...,   0,   0,   0]], dtype=int32)

In [33]:
dummy_y_valid = data_set_valid["labelencoder"].values
dummy_y_valid[:5]

array([6, 3, 2, 0, 6])

In [34]:
# Sprawdzenie rozmiaru zbiorów validacyjnego
X_validate.shape, dummy_y_valid.shape

((700, 35), (700,))

In [35]:
predicted_lstm_val = np.argmax(model.predict(X_validate), axis=-1)
predicted_lstm_val[:5]

AttributeError: ignored

#### Rozkodowanie przewidzianych i prawidłowych etykiet

In [None]:
y_pred = labelencoder.inverse_transform(predicted_lstm_val)
y_pred = pd.Series(y_pred)
y_pred.head()

In [None]:
y_val = labelencoder.inverse_transform(dummy_y_valid)
y_val = pd.Series(y_val)
y_val.head()

#### `Confusion matrix`

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
classes = np.unique(y_val)

print('Accuracy:', round(accuracy_score(y_val, y_pred),2))
print('F1_score:', round(f1_score(y_val, y_pred, average='weighted'),2))

print(classification_report(y_val, y_pred))
# Plot confusion matrix
cm = confusion_matrix(y_val, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, cbar=False)
ax.set(xlabel='Pred', ylabel='True', xticklabels=classes, yticklabels=classes, title='Confusion matrix')
plt.yticks(rotation=0)
plt.xticks(rotation=90)

#### Zapoznanie się z błędnymi predykcjami 

In [None]:
indexes = []
for i, phrase in enumerate(y_pred):
  if phrase == 'SearchCreativeWork':
    if y_val[i] == 'SearchScreeningEvent':
      indexes.append(i)
print(indexes)

In [None]:
for i in indexes:
  print(f"----------------------------\nTekst komendy:\n{data_set_valid['text'][i]}")
  print(f"Oczyszczona komenda:\n{data_set_valid['clean_text'][i]}")
  print(f'True category: {y_val[i]}')
  print(f'Predicted category: {y_pred[i]}')
  print("\n")

Wnioski:
> 