<a href="https://colab.research.google.com/github/ciepielajan/Multi-Class-Classification-NLP/blob/main/Detecting_intentions_CNN_KerasClassifier_i_RandomizedSearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Pobranie danych

In [1]:
# https://drive.google.com/file/d/1fI6EXyD9TMTC1jzdu206ljXOGNjdHprq/view?usp=sharing
!gdown --id "1fI6EXyD9TMTC1jzdu206ljXOGNjdHprq"

Downloading...
From: https://drive.google.com/uc?id=1fI6EXyD9TMTC1jzdu206ljXOGNjdHprq
To: /content/user_intent.zip
  0% 0.00/271k [00:00<?, ?B/s]100% 271k/271k [00:00<00:00, 39.2MB/s]


In [2]:
!unzip "user_intent.zip"

Archive:  user_intent.zip
  inflating: train.csv               
  inflating: __MACOSX/._train.csv    
  inflating: validation.csv          


In [3]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load the training and validation splits extracted from the archive.
data_set_train, data_set_valid = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "validation.csv")
)

# Show both shapes first, then both column indexes (train before validation).
print(
    data_set_train.shape,
    data_set_valid.shape,
    data_set_train.columns,
    data_set_valid.columns,
    sep="\n",
)

(13784, 2)
(700, 2)
Index(['text', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


#### Podstawowy process text

In [4]:
import re
def process_text(sentence):
    """Normalize a raw utterance into lowercase, letters-only, single-spaced text.

    Steps, in order: drop e-mail addresses, drop http(s) URLs, replace HTML tags
    with a space, drop every character that is not an ASCII letter or whitespace,
    drop one-letter words, trim and collapse whitespace, lowercase.

    Args:
        sentence: Text to clean. Anything that is not a ``str`` (for example a
            missing value such as ``None`` or ``NaN``) is treated as empty text.

    Returns:
        The cleaned string; ``""`` when nothing is left.
    """
    if not isinstance(sentence, str):
        return ""

    # E-mails: the whole address is removed, including dots in the local part
    # and a multi-label domain (the old pattern only matched a fragment).
    sentence = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]+", "", sentence)
    # Links: remove the complete URL up to the next whitespace, not just the
    # scheme and the first alphanumeric run.
    sentence = re.sub(r"https?://\S+", "", sentence, flags=re.IGNORECASE)
    # HTML tags become a space so the words around them stay separated.
    sentence = re.sub(r"<[^>]+>", " ", sentence)
    # Punctuation, digits and any non-ASCII characters are dropped.
    sentence = re.sub(r"[^a-zA-Z\s]", "", sentence)
    # Single-letter words are dropped.
    sentence = re.sub(r"\b[A-Za-z]\b", "", sentence)

    # str.split() with no argument trims both ends and collapses whitespace runs.
    return " ".join(sentence.split()).lower()

In [5]:
# Store the cleaned version of every training utterance in a new column.
data_set_train["clean_text"] = data_set_train["text"].apply(process_text)

#### `LabelEncoder` oraz `to_categorical`

In [6]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical 


# Map every intent name to an integer class id and keep it next to the label.
labelencoder = LabelEncoder()
labelencoder.fit(data_set_train["label"])
data_set_train["labelencoder"] = labelencoder.transform(data_set_train["label"])

# One-hot encode the class ids (one float32 column per class).
dummy_y = to_categorical(data_set_train["labelencoder"], dtype="float32")

data_set_train.loc[:, ["clean_text", "label", "labelencoder"]].head()

Unnamed: 0,clean_text,label,labelencoder
0,find cinema nearest for films,SearchScreeningEvent,6
1,give the current series two stars,RateBook,4
2,find the good girl at movie house,SearchScreeningEvent,6
3,please make reservations for three at kosher t...,BookRestaurant,1
4,what is the forecast for here one second from now,GetWeather,2


In [7]:
# Dimensions of the one-hot target matrix.
np.shape(dummy_y)

(13784, 7)

In [8]:
# Sanity check on one row: label -> class id -> one-hot -> class id -> label.
id_intention = 6
one_hot_row = dummy_y[id_intention]
decoded_id = np.argmax(one_hot_row, axis=-1)

print("Sprawdzenie poprawności LabelEncoder i to_categorical \n")
print("Label - ", data_set_train["label"].iat[id_intention])
print("LabelEncoder - ", data_set_train["labelencoder"].iat[id_intention])
print()
print("to_categorical - ", one_hot_row)
print()
print("return to LabelEncoder - ", decoded_id)
print("return to Label - ", labelencoder.inverse_transform([decoded_id]))

Sprawdzenie poprawności LabelEncoder i to_categorical 

Label -  BookRestaurant
LabelEncoder -  1

to_categorical -  [0. 1. 0. 0. 0. 0. 0.]

return to LabelEncoder -  1
return to Label -  ['BookRestaurant']


#### Zdefiniowanie X i y

In [9]:
# Features are the cleaned utterances; targets are the one-hot encoded intents.
X, y = data_set_train["clean_text"], dummy_y

In [10]:
# Shapes of the features and of the targets, in that order.
tuple(part.shape for part in (X, y))

((13784,), (13784, 7))

#### Podział zbioru 

In [11]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### `Tokenizer` i `pad_sequences`

In [13]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [14]:
# tokenizer = Tokenizer(num_words=5000)
# tokenizer.fit_on_texts(X_train)

# X_train = tokenizer.texts_to_sequences(X_train)
# X_test = tokenizer.texts_to_sequences(X_test)

# maxlen = 7
# vocab_size = len(tokenizer.word_index) + 1

# X_train = pad_sequences(X_train, padding="post", truncating="post", maxlen=maxlen)
# X_test = pad_sequences(X_test, padding="post", truncating="post", maxlen=maxlen)


#### `KerasClassifier` i `RandomizedSearchCV`

In [15]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [16]:
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPooling1D, Dropout, Conv1D

In [17]:
def create_model(num_filters, kernel_size, vocab_size, embedding_dim, maxlen, num_classes=7):
    """Build and compile a 1-D convolutional text classifier.

    Architecture: Embedding -> Conv1D (ReLU) -> GlobalMaxPooling1D -> Dense (softmax),
    compiled with categorical cross-entropy, the Adam optimizer and accuracy as metric.

    Args:
        num_filters: Number of convolution filters.
        kernel_size: Width of the convolution window.
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Length of each embedding vector.
        maxlen: Length of the (padded) input sequences.
        num_classes: Number of units in the softmax output layer. Defaults to 7,
            the value that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(Conv1D(num_filters, kernel_size, activation='relu'))
    # Keep only the strongest response of each filter over the whole sequence.
    model.add(GlobalMaxPooling1D())
    # Softmax pairs with the one-hot targets and categorical cross-entropy below.
    model.add(Dense(num_classes, activation="softmax"))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [18]:
%%time

# df z wynikami wszystkich RandomizedSearchCV
wyniki = pd.DataFrame()
wyniki

from keras.wrappers.scikit_learn import KerasClassifier  
from sklearn.model_selection import RandomizedSearchCV

for i  in [7,15,25,35]:  #7,15
  # Main settings
  epochs = 20
  embedding_dim = 100
  maxlen = i

  # Run grid search for each source (yelp, amazon, imdb)
  # for source, frame in df.groupby('source'):
  # print('Running grid search for data set :', source)
  sentences = data_set_train["clean_text"].values  
  y = dummy_y

  # Train-test split
  sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

  # Tokenize words
  tokenizer = Tokenizer(num_words=5000)
  tokenizer.fit_on_texts(sentences_train)
  X_train = tokenizer.texts_to_sequences(sentences_train)
  X_test = tokenizer.texts_to_sequences(sentences_test)

  # Adding 1 because of reserved 0 index
  vocab_size = len(tokenizer.word_index) + 1

  # Pad sequences with zeros
  X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
  X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

  # # Parameter grid for grid search
  # param_grid = dict(num_filters=[32, 64, 128],
  #                   kernel_size=[3, 5, 7],
  #                   vocab_size=[vocab_size],
  #                   embedding_dim=[embedding_dim],
  #                   maxlen=[maxlen])
  # model = KerasClassifier(build_fn=create_model,
  #                         epochs=epochs, batch_size=64,
  #                         verbose=False)

  # Parameter grid for grid search
  param_grid = dict(num_filters=[32, 64, 128],
                    kernel_size=[3, 5, 7],
                    vocab_size=[vocab_size],
                    embedding_dim=[100,200],
                    maxlen=[maxlen],
                    epochs = [20],
                    batch_size=[16,32,64] )

  model = KerasClassifier(build_fn=create_model,
                          verbose=False)
  

  grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=10, n_iter=5)  #n_liter odpowiada za ilość radomize serach
  # grid_result = grid.fit(X_train, y_train)

  from keras.callbacks import EarlyStopping
  # Define early stopping
  early_stopping = EarlyStopping(monitor='val_loss', patience=5)
  grid_result = grid.fit(X_train, y_train, callbacks=[early_stopping])  #validation_split=0.2

  # Evaluate testing set
  test_accuracy = grid.score(X_test, y_test)
  grid_result.best_params_.update({"best_score_":grid_result.best_score_})
  wyniki = wyniki.append(grid_result.best_params_, ignore_index=True)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32, score=0.945, total=  46.4s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.4s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32, score=0.958, total=  44.7s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32, score=0.962, total=  43.8s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=5, epochs=20, embedding_dim=100, batch_size=32, score=0.961, total=  43.3s
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.0min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32, score=0.946, total= 1.4min
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.4min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32, score=0.962, total= 1.4min
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  5.8min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32, score=0.966, total= 1.5min
[CV] vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.3min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=7, kernel_size=3, epochs=20, embedding_dim=200, batch_size=32, score=0.959, total= 1.4min
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.8min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32, score=0.944, total=  41.1s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  9.5min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32, score=0.959, total=  41.4s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32 
[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32, score=0.961, total=  41.2s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32 
[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=3, epochs=20, embedding_dim=100, batch_size=32, score=0.957, total=  42.9s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, epochs=20, embedding_dim=100, batch_size=64 
[CV]  vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, epochs=20, embedding_dim=100, batch_size=64, score=0.946, total=  22.9s
[CV] vocab_size=9462, num_filters=64, maxlen=7, kernel_size=7, epochs=20, embedding_dim=100, batch_size=64 
[CV]  vocab_size=9462, n

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 22.0min finished


Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64, score=0.979, total=  57.9s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   57.9s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64, score=0.986, total=  56.6s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.9min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64, score=0.983, total=  57.0s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.9min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=5, epochs=20, embedding_dim=200, batch_size=64, score=0.985, total=  56.7s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.8min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.979, total=  27.9s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.3min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.985, total=  27.6s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.983, total=  28.0s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  5.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.986, total=  28.2s
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.981, total= 1.5min
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.1min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.985, total= 1.4min
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 
[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.981, total= 1.4min
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 
[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.983, total= 1.4min
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=200, batch_size=16 
[CV]  vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=200, batch_size=16, score=0.979, total= 2.8min
[CV] vocab_size=9462, num_filters=64, maxlen=15, kernel_size=3, epochs=20, embedding_dim=200, batch_size=16 
[CV]  vocab_size

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 25.7min finished


Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.981, total=  53.0s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.0s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.987, total=  52.5s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.985, total=  55.7s
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.988, total=  54.7s
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.6min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.979, total= 1.2min
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.8min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.986, total= 1.2min
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  6.1min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.984, total= 1.3min
[CV] vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  7.3min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=25, kernel_size=7, epochs=20, embedding_dim=100, batch_size=32, score=0.986, total= 1.2min
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  8.6min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.979, total= 1.4min
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 10.0min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.985, total= 1.4min
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 
[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.983, total= 1.4min
[CV] vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16 
[CV]  vocab_size=9462, num_filters=64, maxlen=25, kernel_size=3, epochs=20, embedding_dim=100, batch_size=16, score=0.985, total= 1.5min
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=200, batch_size=32 
[CV]  vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=200, batch_size=32, score=0.980, total= 1.7min
[CV] vocab_size=9462, num_filters=32, maxlen=25, kernel_size=7, epochs=20, embedding_dim=200, batch_size=32 
[CV]  vocab_size

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 27.7min finished


Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.980, total=  35.0s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.0s remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.986, total=  35.2s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.985, total=  34.5s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=3, epochs=20, embedding_dim=100, batch_size=64, score=0.984, total=  34.2s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.3min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64, score=0.981, total=  39.8s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.0min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64, score=0.984, total=  40.3s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.7min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64, score=0.986, total=  41.4s
[CV] vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.3min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=32, maxlen=35, kernel_size=5, epochs=20, embedding_dim=100, batch_size=64, score=0.983, total=  40.7s
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  5.0min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32, score=0.980, total= 3.2min
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  8.2min remaining:    0.0s


[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32, score=0.986, total= 3.2min
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32 
[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32, score=0.985, total= 3.2min
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32 
[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=5, epochs=20, embedding_dim=200, batch_size=32, score=0.985, total= 3.2min
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=7, epochs=20, embedding_dim=200, batch_size=32 
[CV]  vocab_size=9462, num_filters=128, maxlen=35, kernel_size=7, epochs=20, embedding_dim=200, batch_size=32, score=0.982, total= 3.7min
[CV] vocab_size=9462, num_filters=128, maxlen=35, kernel_size=7, epochs=20, embedding_dim=200, batch_size=32 
[CV]  vo

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 39.2min finished


CPU times: user 3h 31min 7s, sys: 10min 37s, total: 3h 41min 45s
Wall time: 2h 2min 24s


In [None]:
# About the EarlyStopping error: https://stackoverflow.com/a/56490322/3810512

In [19]:
# Best parameters and cross-validation score of each search run (one row per run).
wyniki

Unnamed: 0,batch_size,best_score_,embedding_dim,epochs,kernel_size,maxlen,num_filters,vocab_size
0,64.0,0.960148,100.0,20.0,7.0,7.0,64.0,9462.0
1,32.0,0.983653,100.0,20.0,3.0,15.0,128.0,9462.0
2,32.0,0.985007,100.0,20.0,7.0,25.0,32.0,9462.0
3,32.0,0.984523,200.0,20.0,7.0,35.0,128.0,9462.0


In [20]:
from time import time

In [21]:
# Record the current wall-clock time (seconds since the epoch).
# NOTE(review): none of the visible cells reads `start` afterwards.
start = time()

In [22]:
# Display the recorded timestamp.
start

1616496440.3714688

# robocze

`Randomized Search Cross Validation`

Metoda sprawdza losowo wybrane kombinacje hiperparametrów. Jest przydatna, gdy hiperparametrów jest wiele, a więc przestrzeń wyszukiwania jest duża. Można ją zastosować, jeśli mamy wcześniejsze przekonanie o tym, jakie powinny być hiperparametry.

In [None]:
%%time

# df z wynikami wszystkich RandomizedSearchCV
wyniki = pd.DataFrame()
wyniki

from keras.wrappers.scikit_learn import KerasClassifier  
from sklearn.model_selection import RandomizedSearchCV


# Main settings
# epochs = 20
# embedding_dim = 100
# maxlen = 7

# Run grid search for each source (yelp, amazon, imdb)
# for source, frame in df.groupby('source'):
# print('Running grid search for data set :', source)
sentences = data_set_train["clean_text"].values  
y = dummy_y

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[100,200],
                  maxlen=[7,15],
                  epochs = [10,20],
                  batch_size=[16,32,64] )

model = KerasClassifier(build_fn=create_model,
                        verbose=False)

grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=4, verbose=10, n_iter=5)
# grid_result = grid.fit(X_train, y_train)


from keras.callbacks import EarlyStopping
# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=epochs_to_wait_for_improve)
grid_result = grid.fit(X_train, y_train, callbacks=[early_stopping])

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)
grid_result.best_params_.update({"best_score_":grid_result.best_score_})
wyniki = wyniki.append(grid_result.best_params_, ignore_index=True)

In [None]:
# Best parameters and cross-validation score found by the randomized search.
wyniki

`Grid Search Cross Validation`

Tworzy siatkę nad przestrzenią wyszukiwania i ocenia model dla wszystkich możliwych kombinacji hiperparametrów w tej przestrzeni. Zaletą jest prostota i wyczerpujący charakter przeszukiwania. Z drugiej strony może to być zbyt kosztowne obliczeniowo, jeśli przestrzeń poszukiwań jest duża (np. bardzo wiele hiperparametrów).

Przykład do zastosowania: https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/


In [None]:
%%time

# df z wynikami wszystkich RandomizedSearchCV
wyniki = pd.DataFrame()
wyniki

from keras.wrappers.scikit_learn import KerasClassifier  
from sklearn.model_selection import GridSearchCV


# Main settings
# epochs = 20
# embedding_dim = 100
# maxlen = 7

# Run grid search for each source (yelp, amazon, imdb)
# for source, frame in df.groupby('source'):
# print('Running grid search for data set :', source)
sentences = data_set_train["clean_text"].values  
y = dummy_y

# Train-test split
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Tokenize words
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Parameter grid for grid search
param_grid = dict(num_filters=[32, 64, 128],
                  kernel_size=[3, 5, 7],
                  vocab_size=[vocab_size],
                  embedding_dim=[100,200],
                  maxlen=[7,15],
                  epochs = [10,20],
                  batch_size=[16,32,64]
                  )


model = KerasClassifier(build_fn=create_model,verbose=False)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, verbose=10)
grid_result = grid.fit(X_train, y_train)

# Evaluate testing set
test_accuracy = grid.score(X_test, y_test)
grid_result.best_params_.update({"best_score_":grid_result.best_score_})
wyniki = wyniki.append(grid_result.best_params_, ignore_index=True)

In [None]:
# Best parameters and cross-validation score found by the grid search.
wyniki

#### Predykcja na zbiorze walidacyjnym