In [1]:
import string
import re
import pandas as pd
import numpy as np
import os
import json
from typing import Dict
import matplotlib.pyplot as plt
import argparse

from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.models import load_model

# Widen pandas' display limits so long rows/columns are rendered without
# truncation when frames are inspected in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# "No column-width limit" used to be spelled -1; that spelling was deprecated
# in pandas 1.0 and is rejected by newer releases, where None is the supported
# value.  Older pandas only accepts an int, so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

Using TensorFlow backend.


In [2]:
class Utils:
    """Text-cleaning and data-shaping helpers for the character-level classifier."""

    # Character vocabulary: 'a'..'z' -> 0..25 and ' ' -> 26.
    # NOTE(review): id 0 ('a') is also the default padding value used by Keras'
    # `pad_sequences`, so padding and 'a' are indistinguishable to the model.
    # Shifting the ids would invalidate already-trained models and requires a
    # matching vocabulary-size change elsewhere, so it is only flagged here.
    CHAR_DICT = {char: idx for idx, char in enumerate(string.ascii_lowercase)}
    CHAR_DICT[' '] = 26

    # Compiled once; raw strings avoid the invalid-escape warning for `\s`.
    _WHITESPACE = re.compile(r'\s')
    _NOT_IN_VOCAB = re.compile(r'[^a-z ]')

    def __init__(self):
        pass

    @staticmethod
    def get_data(path: str, **kwargs) -> pd.DataFrame:
        """Read a CSV file into a DataFrame; `kwargs` are forwarded to `pd.read_csv`."""
        return pd.read_csv(path, **kwargs)

    @staticmethod
    def split_word(word: str) -> list:
        """Return the characters of `word` as a list."""
        return list(word)

    @staticmethod
    def clean_sentence(sentence: str) -> str:
        """Lower-case `sentence` and reduce it to the `CHAR_DICT` alphabet.

        Every whitespace character (tab, newline, non-breaking space, ...) is
        turned into a plain space, then everything that is not `a`-`z` or a
        space is removed.  The result therefore only contains keys of
        `CHAR_DICT`; previously tabs/newlines survived cleaning and made the
        later `CHAR_DICT[...]` lookups fail with `KeyError`.

        :param sentence: raw text.
        :return: cleaned text (not stripped; runs of spaces are kept).
        """
        lowered = sentence.lower()
        spaced = Utils._WHITESPACE.sub(' ', lowered)
        return Utils._NOT_IN_VOCAB.sub('', spaced)

    @classmethod
    def clean_external_data(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Drop '[START]' marker rows and clean the remaining `text` values.

        :param data: frame with a `text` column; it is not modified.
        :return: a new frame whose index is reset to 0..n-1; the previous
                 index is kept in an `index` column (`reset_index()` default).
        """
        kept = data[data['text'] != '[START]'].copy()
        kept['text'] = kept['text'].apply(cls.clean_sentence)
        return kept.reset_index()

    @classmethod
    def make_conversation_pair(cls, data: pd.DataFrame) -> pd.DataFrame:
        """Collapse a sequence of chat turns into (human, robot) text pairs.

        Rows are scanned in order.  Text of rows whose `source` is "human" is
        accumulated on the human side, every other row on the robot side; as
        soon as both sides are non-empty a pair is emitted and both buffers
        are cleared.

        NOTE(review): consecutive turns of the same speaker are concatenated
        without any separator (e.g. "hi" + "there" -> "hithere").  Kept as-is
        because changing it alters the training data.

        :param data: frame with `source` and `text` columns.
        :return: frame with `human` and `robot` columns, one row per pair,
                 indexed 0..n-1 (empty, with the same columns, if no pair
                 was formed).
        """
        # Collect plain records and build the frame once: appending to a
        # DataFrame inside the loop is quadratic and `DataFrame.append` no
        # longer exists in pandas 2.x.
        pairs = []
        human_talks = ""
        robot_talks = ""

        # Positional iteration, so the result does not depend on the frame
        # having a 0..n-1 index.
        for source, text in zip(data['source'], data['text']):
            if source == "human":
                human_talks += text
            else:
                robot_talks += text

            if human_talks and robot_talks:
                pairs.append({"human": human_talks, "robot": robot_talks})
                human_talks = ""
                robot_talks = ""

        return pd.DataFrame(pairs, columns=["human", "robot"])


def process_data(local_data_path: str, external_data_path: str, path_to_save: str,
                 max_external_length: int = 30) -> pd.DataFrame:
    """Load, clean and merge the local and external datasets.

    The local CSV must provide a `CLIENT` column; the external CSV must
    provide `source` and `text` columns.  External chat turns are paired into
    (human, robot) rows and renamed to (`CLIENT`, `ACTIVITY`) before being
    stacked under the local rows.  Finally every `CLIENT` value is encoded as
    a list of `Utils.CHAR_DICT` ids.

    :param local_data_path: path of the local CSV file.
    :param external_data_path: path of the external conversation CSV file.
    :param path_to_save: currently unused; kept for interface compatibility.
    :param max_external_length: external pairs whose human text is longer
        than this many characters are dropped (default 30, the value that
        used to be hard-coded).
    :return: merged frame; `CLIENT` holds lists of int ids and the `index`
        column holds each row's index from before the merge.
    """
    print("Data processing is going on...")

    # Loading data from all sources
    local_data = Utils.get_data(local_data_path)
    external_data = Utils.get_data(external_data_path)

    # Local utterances: clean, then split into characters.
    local_data['CLIENT'] = local_data['CLIENT'].apply(lambda text: list(Utils.clean_sentence(text)))

    # External chat log: normalise the speaker tag, clean the text and pair
    # human turns with robot turns.
    external_data['source'] = external_data['source'].apply(Utils.clean_sentence)
    cleaned_external_data = Utils.clean_external_data(external_data)
    conversation_pairs = Utils.make_conversation_pair(cleaned_external_data)

    short_enough = conversation_pairs['human'].map(len) <= max_external_length
    external_chat_df_filtered = conversation_pairs[short_enough].rename(
        index=str, columns={"human": "CLIENT", "robot": 'ACTIVITY'})
    external_chat_df_filtered['CLIENT'] = external_chat_df_filtered['CLIENT'].apply(
        lambda text: list(text.lower()))

    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # equivalent that works on every pandas version.
    all_df = pd.concat([local_data, external_chat_df_filtered], sort=False).reset_index()

    # Characters outside the vocabulary (e.g. a tab that survived cleaning)
    # are skipped instead of aborting the whole run with a KeyError.
    all_df['CLIENT'] = all_df['CLIENT'].apply(
        lambda chars: [Utils.CHAR_DICT[ch] for ch in chars if ch in Utils.CHAR_DICT])

    return all_df


In [3]:


# Size of the character vocabulary, i.e. the number of entries in
# Utils.CHAR_DICT.  Despite the name, this counts characters, not words.
MAX_NB_WORDS = len(Utils.CHAR_DICT)


def _save_history_plot(history: Dict, metric: str, title: str, path: str) -> None:
    """Plot one training metric (and its `val_` counterpart, if recorded) to `path`."""
    # A dedicated figure per metric: drawing on the implicit pyplot figure made
    # the accuracy image also contain the loss curves.
    fig, ax = plt.subplots()
    ax.set_title(title)
    ax.plot(history[metric], label='train')
    val_metric = 'val_' + metric
    if val_metric in history:
        # These curves come from `validation_split`, not from the test set.
        ax.plot(history[val_metric], label='validation')
    ax.legend()
    fig.savefig(path)
    plt.close(fig)


def train_and_evaluate(data: pd.DataFrame, args: Dict):
    """Train the character-level LSTM classifier, evaluate it and save the artefacts.

    :param data: frame with a `CLIENT` column (lists of int character ids)
        and an `ACTIVITY` column (class labels).
    :param args: settings dictionary.  Keys read here: `sequence_length`,
        `train_test_ratio`, `embedding_size`, `keep_prob`, `train_steps`
        (passed to Keras as the number of epochs), `batch_size`,
        `validation_split`, `base_dir`, `output_dir`.  The optimizer is Adam
        with Keras defaults; `learning_rate` and `num_layers` are not
        consulted by this function.
    :return: None.  Side effects: writes the label -> index mapping to
        `resources/activities2dummies.json` (relative to the working
        directory) and writes the `loss` and `accuracy` figures plus
        `lstm_model.h5` into `<base_dir>/<output_dir>`.
    """
    x = pad_sequences(data['CLIENT'], maxlen=args['sequence_length'])
    print('Shape of data tensor:', x.shape)

    # One-hot encode once and reuse it for the targets and the label mapping.
    # The float cast keeps the targets numeric on pandas versions where
    # get_dummies returns booleans.
    activity_dummies = pd.get_dummies(data['ACTIVITY'])
    y = activity_dummies.values.astype('float32')
    print('Shape of label tensor:', y.shape)

    activity_dict = {activity: idx for idx, activity in enumerate(activity_dummies.columns)}

    os.makedirs('resources', exist_ok=True)
    with open('resources/activities2dummies.json', 'w') as fp:
        json.dump(activity_dict, fp)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=args['train_test_ratio'], random_state=42)
    print(x_train.shape, y_train.shape)
    print(x_test.shape, y_test.shape)

    # Keras dropout layers expect the fraction to DROP, whereas the setting is
    # a keep probability; convert it (and clamp to the valid [0, 1] range).
    drop_rate = min(1.0, max(0.0, 1.0 - args['keep_prob']))

    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, args['embedding_size'], input_length=x.shape[1]))
    model.add(SpatialDropout1D(drop_rate))
    model.add(LSTM(args['sequence_length'], dropout=drop_rate, recurrent_dropout=drop_rate))
    # Size the output layer from the one-hot targets so the two always agree
    # (`unique()` also counts NaN, which get_dummies ignores).
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    history = model.fit(x_train, y_train,
                        epochs=args['train_steps'], batch_size=args['batch_size'],
                        validation_split=args['validation_split'],
                        )

    # The held-out test split was previously created but never used.
    test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=args['batch_size'], verbose=0)
    print('Test loss:', test_loss, '- test accuracy:', test_accuracy)

    output_dir = os.path.join(args['base_dir'], args['output_dir'])
    os.makedirs(output_dir, exist_ok=True)

    # Older Keras records the metric as 'acc', newer releases as 'accuracy'.
    accuracy_key = 'acc' if 'acc' in history.history else 'accuracy'
    _save_history_plot(history.history, 'loss', 'Loss', os.path.join(output_dir, "loss"))
    _save_history_plot(history.history, accuracy_key, 'Accuracy', os.path.join(output_dir, 'accuracy'))

    model.save(os.path.join(output_dir, 'lstm_model.h5'))  # HDF5 file

    print("Model training completed.")



In [4]:
if __name__ == '__main__':
    # Command-line entry point: parse the settings, build the dataset and
    # train the model.
    # NOTE: the directory/file options are required, so executing this cell
    # inside a notebook kernel (where no such options are on the command
    # line) makes argparse print the usage message and exit.  Run the code as
    # a script to supply them.
    parser = argparse.ArgumentParser()

    # Directory paths
    parser.add_argument('--base_dir',
                        help='Root directory',
                        required=True
                        )
    parser.add_argument('--data_dir',
                        help='Relative path of data directory',
                        required=True
                        )
    parser.add_argument('--local_file_name',
                        help='File name of the local dataset (CSV)',
                        required=True
                        )
    parser.add_argument('--external_file_name',
                        help='File name of the external conversation dataset (CSV)',
                        required=True
                        )
    parser.add_argument('--resources_dir',
                        help='Resources directory path',
                        required=True
                        )
    parser.add_argument('--output_dir',
                        help='Directory to write checkpoints and export models',
                        required=True
                        )

    # Embeddings
    parser.add_argument("--embedding_size",
                        type=int,
                        default=32,
                        help="Word embedding size. (For glove, use 50 | 100 | 200 | 300)"
                        )

    # Model structure
    parser.add_argument("--sequence_length",
                        type=int,
                        default=30,
                        help="LSTM network length"
                        )
    parser.add_argument("--num_layers",
                        type=int,
                        default=1,
                        help="LSTM network depth"
                        )

    # Train params
    parser.add_argument("--learning_rate",
                        type=float,
                        default=1e-2,
                        help="Learning rate."
                        )
    parser.add_argument("--batch_size",
                        type=int,
                        default=16,
                        help="Batch size."
                        )
    parser.add_argument("--keep_prob",
                        type=float,
                        default=1,
                        help="Dropout keep prob."
                        )
    parser.add_argument('--train_steps',
                        help='Steps to run the training job for',
                        type=int,
                        default=300
                        )
    # This value is handed to train_test_split as `test_size`, where a float
    # must lie strictly between 0 and 1; the former default of 1 made every
    # run with default settings fail.
    parser.add_argument('--train_test_ratio',
                        help='Fraction of the data held out as test set (0 < value < 1)',
                        default=0.2,
                        type=float
                        )
    parser.add_argument('--validation_split',
                        help='Validation done on percentage of train data ',
                        default=0.1,
                        type=float
                        )

    args = parser.parse_args()
    arguments = vars(args)

    print(arguments)

    data_root = os.path.join(arguments['base_dir'], arguments['data_dir'])
    local_file_path = os.path.join(data_root, arguments['local_file_name'])
    external_file_path = os.path.join(data_root, arguments['external_file_name'])
    resources_dir = os.path.join(arguments['base_dir'], arguments['resources_dir'])

    df = process_data(local_file_path, external_file_path, resources_dir)
    train_and_evaluate(data=df, args=arguments)


usage: ipykernel_launcher.py [-h] --base_dir BASE_DIR --data_dir DATA_DIR
                             --local_file_name LOCAL_FILE_NAME
                             --external_file_name EXTERNAL_FILE_NAME
                             --resources_dir RESOURCES_DIR --output_dir
                             OUTPUT_DIR [--embedding_size EMBEDDING_SIZE]
                             [--sequence_length SEQUENCE_LENGTH]
                             [--num_layers NUM_LAYERS]
                             [--learning_rate LEARNING_RATE]
                             [--batch_size BATCH_SIZE] [--keep_prob KEEP_PROB]
                             [--train_steps TRAIN_STEPS]
                             [--train_test_ratio TRAIN_TEST_RATIO]
                             [--validation_split VALIDATION_SPLIT]
ipykernel_launcher.py: error: the following arguments are required: --base_dir, --data_dir, --local_file_name, --external_file_name, --resources_dir, --output_dir


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Interactive demo: load the trained model and classify sentences typed by
# the user until they enter "quit"/"exit" (or send EOF / interrupt).

# Location of the trained model.  The original machine-specific path is kept
# as the default; set the LSTM_MODEL_PATH environment variable to override it.
MODEL_PATH = os.environ.get(
    'LSTM_MODEL_PATH',
    '/home/soumyadip/Desktop/PycharmProjects/intent-activity-classifier/output/lstm_model.h5')
EXIT_COMMANDS = ('quit', 'exit')

with open('resources/activities2dummies.json', 'r') as fp:
    activities_dict = json.load(fp)

# class index -> activity label
reverse_dict = {v: k for k, v in activities_dict.items()}

model = load_model(MODEL_PATH)
# Pad to the length the network was built with; fall back to the former
# hard-coded 30 if the model does not report a fixed input length.
max_len = model.input_shape[1] or 30

while True:
    try:
        user_input = input("You:  ")
    except (EOFError, KeyboardInterrupt):
        # Gives the loop a way to end instead of running forever.
        break
    if user_input.strip().lower() in EXIT_COMMANDS:
        break

    user_input = Utils.clean_sentence(user_input)
    # Skip characters outside the vocabulary rather than failing with KeyError.
    user_input = [Utils.CHAR_DICT[ch] for ch in user_input if ch in Utils.CHAR_DICT]
    if not user_input:
        # Nothing left to classify (blank line or only symbols); ask again.
        continue

    y_prob = model.predict(pad_sequences([user_input], maxlen=max_len))
    y_class = y_prob.argmax(axis=-1)
    print("Bot:  " + reverse_dict[int(y_class[0])])