<a href="https://colab.research.google.com/github/ekunnii/chatbot_feeder/blob/master/notebooks/OpenNMT_feedback_copying.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Copying model with OpenNMT



In [0]:
import os
import pandas as pd
import re


In [0]:
# Directory holding the feedback6k corpus. Set FEEDBACK_DATA_DIR to use a
# different location; the default is the path this notebook was authored with.
DATASET = os.environ.get('FEEDBACK_DATA_DIR', '/home/ryan/projects/OpenNMT-py/data/feedback6k')
# The processing cells open and write corpus files by bare file name, so the
# working directory has to be the corpus directory.
os.chdir(DATASET)
os.listdir()

['processed_test_fb.txt',
 'processed_train_fb_b.txt.atok',
 'response_valid_fb.txt.atok',
 'test_fb.txt',
 'processed_train_fb_a.txt.atok',
 'valid_fb.txt',
 'processed_valid_fb.txt.atok',
 'response_train_fb_a.txt.atok',
 'processed_train_fb_a.txt',
 'train_fb_b.txt',
 'response_train_fb.txt.atok',
 'response_test_fb.txt.atok',
 'processed_test_fb.txt.atok',
 'processed_train_fb.txt',
 'response_train_fb_b.txt',
 'response_train_fb_b.txt.atok',
 'processed_train_fb_b.txt',
 'processed_train_fb.txt.atok',
 'train_fb.txt',
 'processed_valid_fb.txt',
 'response_train_fb.txt',
 'response_valid_fb.txt',
 'train_fb_a.txt',
 'response_test_fb.txt',
 'response_train_fb_a.txt']

In [0]:
# Export the 'response' column of every JSON-lines feedback file to a
# one-response-per-row file named response_<original name>.
# NOTE(review): to_csv applies CSV quoting by default, so responses that
# contain commas or double quotes are written wrapped in double quotes --
# confirm this is wanted for the line-based tokenizer/NMT input.
file_list = [
    'train_fb.txt',
    'train_fb_a.txt',
    'train_fb_b.txt',
    'test_fb.txt',
    'valid_fb.txt',
]

for file_name in file_list:
    response_file = 'response_' + file_name
    df = pd.read_json(file_name, lines=True)
    df['response'].to_csv(response_file, header=False, index=False)

In [0]:
# A response containing any of these cues is treated as feedback that wraps a
# suggested reply (e.g. 'you could have said ...').
_FEEDBACK_CUE_RE = re.compile(
    'you could|you should|said|saying|say|tell|told|admit|ask|answer|talk')

# Lead-in phrases, removed wherever they occur in the sentence.
_LEAD_IN_RE = re.compile('you could have|you should have|you could|you should')

# Reporting verb at the very start of the sentence. The bare verbs need a word
# boundary so that e.g. "says" is left alone; the remaining alternatives
# already require a trailing space.
_REPORTING_VERB_RE = re.compile(
    r"^(?:said|saying|say)\b"
    r"|^tell |^told |^admit |^asked |^ask |^answer |^answered |^talked |^talk ")

# A leading function word is only dropped when it is a whole word: it must not
# be followed by a word character or an apostrophe. Without this guard
# "nothing" became "hing", "mexican" became "xican" and "that's" became "s".
_WORD_END = r"(?![\w'’])"
_FILLER_RE = re.compile(r"^(?:about|me|that)" + _WORD_END)
_CONDITION_RE = re.compile(r"^(?:if|whether|not)" + _WORD_END)

# Second-person -> first-person substitutions, applied in this order.
_SUBJECT_SWAPS = (
    ("you are ", "i am "),
    ("your ", "my "),
    ("you've ", "i've "),
    ("you were", "i was"),
)


def _extract_suggestion(sentence):
    """Return the reply suggested inside a feedback sentence ('' if nothing is left)."""
    text = _LEAD_IN_RE.sub('', sentence).strip()
    text = _REPORTING_VERB_RE.sub('', text).strip()
    text = _FILLER_RE.sub('', text).strip()

    # "A or B" offers alternatives: keep the longest one (the first on ties).
    text = max((piece.strip() for piece in text.split(' or ')), key=len)

    text = _CONDITION_RE.sub('', text).strip()

    for old, new in _SUBJECT_SWAPS:
        text = text.replace(old, new).strip()
    text = re.sub("^you ", 'i ', text).strip()

    # Drop curly quotes anywhere, then trim leftover punctuation at the edges.
    # The arguments to lstrip/rstrip are character sets, not patterns.
    text = re.sub(r"[“”]", '', text).strip()
    text = text.lstrip(':|,"\'-. ')
    text = text.rstrip('"|\' ')
    return text


def processed_feedback(file_name):
    """Rewrite feedback responses as the plain reply they suggest.

    Reads ``file_name`` as JSON lines, processes its ``response`` column and
    writes the result, one response per row with no header or index, to
    ``processed_<file_name>``. Because the prefix is prepended to the whole
    string, ``file_name`` is expected to be a bare name relative to the
    current working directory.

    A response is rewritten only if it contains one of the feedback cues
    ("you could", "said", "tell", ...). Responses without a cue, non-string
    entries (e.g. missing values) and responses whose rewrite would be empty
    are passed through unchanged.

    Args:
        file_name: name of the JSON-lines feedback file to process.
    """
    df = pd.read_json(file_name, lines=True)

    processed_responses = []
    for sentence in df['response'].to_list():
        # Missing values come back as None/NaN; re.search would raise on them.
        if not isinstance(sentence, str) or not _FEEDBACK_CUE_RE.search(sentence):
            processed_responses.append(sentence)
            continue
        # Fall back to the original sentence when stripping leaves nothing.
        processed_responses.append(_extract_suggestion(sentence) or sentence)

    df['response'] = processed_responses
    processed_file = "_".join(["processed", file_name])
    df['response'].to_csv(processed_file, header=False, index=False)

# Run processed_feedback on every file in file_list; each call writes
# processed_<file_name> in the current working directory.
for file_name in file_list:
    processed_feedback(file_name)

In [0]:
# Root of the OpenNMT-py checkout. Set OPENNMT_REPO to use a different
# location; the default is the path this notebook was authored with.
REPO = os.environ.get('OPENNMT_REPO', "/home/ryan/projects/OpenNMT-py/")
# The shell cells refer to data/... and tools/... relative to the repo root.
os.chdir(REPO)

## Data preprocessing

In [0]:
%%bash
# For the response_* files and then the processed_* files:
#   1. delete the last line of every non-test file in place,
#   2. tokenize every file with the Moses-style tokenizer into <file>.atok.
# NOTE: step 1 edits the files in place, so every re-run of this cell removes
# one more trailing line from the train/valid files.
for prefix in response processed; do
  for f in data/feedback6k/"$prefix"*.txt; do
    # Single quotes keep the sed script ("$ d" = delete last line) away from
    # shell expansion; "$f" is quoted so unusual file names stay one argument.
    if [[ "$f" != *"test"* ]]; then sed -i '$ d' "$f"; fi
  done
  for f in data/feedback6k/"$prefix"*.txt; do
    perl tools/tokenizer.perl -a -no-escape -l en -q < "$f" > "$f.atok"
  done
done

In [0]:
# Build the identity ("copy input to output") dataset: source and target are
# the same tokenized response files for both the train and validation splits.
# The preprocessed data is saved under the prefix data/feedback6k.atok.low.
!onmt_preprocess \
-train_src data/feedback6k/response_train_fb.txt.atok \
-train_tgt data/feedback6k/response_train_fb.txt.atok \
-valid_src data/feedback6k/response_valid_fb.txt.atok \
-valid_tgt data/feedback6k/response_valid_fb.txt.atok \
-save_data data/feedback6k.atok.low \
-lower \
-dynamic_dict \
-overwrite

In [0]:
# Build the response -> regex-processed dataset: the source is the raw
# tokenized response, the target is the output of processed_feedback.
# The preprocessed data is saved under the prefix data/regex_feedback6k.atok.low.
!onmt_preprocess \
-train_src data/feedback6k/response_train_fb.txt.atok \
-train_tgt data/feedback6k/processed_train_fb.txt.atok \
-valid_src data/feedback6k/response_valid_fb.txt.atok \
-valid_tgt data/feedback6k/processed_valid_fb.txt.atok \
-save_data data/regex_feedback6k.atok.low \
-lower \
-dynamic_dict \
-overwrite

## Model summary

```
NMTModel(
  (encoder): RNNEncoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(12960, 500, padding_idx=1)
        )
      )
    )
    (rnn): LSTM(500, 500, num_layers=2, dropout=0.3)
  )
  (decoder): InputFeedRNNDecoder(
    (embeddings): Embeddings(
      (make_embedding): Sequential(
        (emb_luts): Elementwise(
          (0): Embedding(12962, 500, padding_idx=1)
        )
      )
    )
    (dropout): Dropout(p=0.3, inplace=False)
    (rnn): StackedLSTM(
      (dropout): Dropout(p=0.3, inplace=False)
      (layers): ModuleList(
        (0): LSTMCell(1000, 500)
        (1): LSTMCell(500, 500)
      )
    )
    (attn): GlobalAttention(
      (linear_in): Linear(in_features=500, out_features=500, bias=False)
      (linear_out): Linear(in_features=1000, out_features=500, bias=False)
    )
  )
  (generator): Sequential(
    (0): Linear(in_features=500, out_features=12962, bias=True)
    (1): Cast()
    (2): LogSoftmax()
  )
)
```

## seq2seq with copy attention


### copy input to output

In [0]:
# Train on the identity dataset with copy attention (-copy_attn); checkpoints use the prefix feedback6k_model.
!onmt_train -data data/feedback6k.atok.low -save_model feedback6k_model -gpu_ranks 0 -copy_attn

In [0]:
# Decode the validation responses with the step-5000 copy-attention checkpoint,
# then score the predictions against the same responses with multi-bleu.
!onmt_translate -gpu 0 -model feedback6k_model_step_5000.pt -src data/feedback6k/response_valid_fb.txt.atok -tgt data/feedback6k/response_valid_fb.txt.atok -replace_unk -verbose -output response_valid_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/response_valid_fb.txt.atok < response_valid_fb.pred.atok

In [0]:
# Same evaluation as the validation cell, run on the test responses.
!onmt_translate -gpu 0 -model feedback6k_model_step_5000.pt -src data/feedback6k/response_test_fb.txt.atok -tgt data/feedback6k/response_test_fb.txt.atok -replace_unk -verbose -output response_test_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/response_test_fb.txt.atok < response_test_fb.pred.atok

### Convert feedback to regex feedback

In [0]:
!onmt_train -data data/regex_feedback6k.atok.low -save_model regex_feedback6k_model -gpu_ranks 0 -copy_attn

In [10]:
# Decode the validation responses with the step-15000 regex copy-attention
# checkpoint and score against the regex-processed references with multi-bleu.
!onmt_translate -gpu 0 -model models/regex_copy/regex_feedback6k_model_step_15000.pt -src data/feedback6k/response_valid_fb.txt.atok -tgt data/feedback6k/processed_valid_fb.txt.atok -replace_unk -verbose -output processed_valid_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/processed_valid_fb.txt.atok < processed_valid_fb.pred.atok

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PRED 166: " chocolate chip , or sugar cookie , or peanut butter would have been good answers . "
PRED SCORE: -0.6953
GOLD 166: peanut butter would have been good answers .
GOLD SCORE: -inf

SENT 167: ['no', 'i', 'don', "'t", 'have', 'a', 'license', '.']
PRED 167: no i don 't have a license .
PRED SCORE: -0.0000
GOLD 167: no i don 't have a license .
GOLD SCORE: -150.7128

SENT 168: ['oh', 'that', 'sounds', 'dangerous']
PRED 168: oh that sounds dangerous
PRED SCORE: -0.0000
GOLD 168: oh that sounds dangerous
GOLD SCORE: -inf

SENT 169: ['that', "'s", 'cool', 'what', 'do', 'you', 'like', 'to', 'do', '?']
PRED 169: that 's cool what do you like to do ?
PRED SCORE: -0.0003
GOLD 169: that 's cool what do you like to do ?
GOLD SCORE: -inf

SENT 170: ['what', 'are', 'you', 'going', 'to', 'have', 'for', 'lunch', '?']
PRED 170: what are you going to have for lunch ?
PRED SCORE: -0.0000
GOLD 170: what are you going to have for lunc

In [9]:
# Same evaluation as the validation cell, run on the test split.
!onmt_translate -gpu 0 -model models/regex_copy/regex_feedback6k_model_step_15000.pt -src data/feedback6k/response_test_fb.txt.atok -tgt data/feedback6k/processed_test_fb.txt.atok -replace_unk -verbose -output processed_test_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/processed_test_fb.txt.atok < processed_test_fb.pred.atok

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PRED 167: " i have never ridden a horse but i like them , "
PRED SCORE: -0.0086
GOLD 167: " i have never ridden a horse but i like them , "
GOLD SCORE: -inf

SENT 168: ['"', 'again', ',', 'no', 'more', 'talk', 'about', 'sleep', '!', '"']
PRED 168: " again , no more talk about sleep ! "
PRED SCORE: -0.0009
GOLD 168: " again , no more talk about sleep ! "
GOLD SCORE: -inf

SENT 169: ['i', 'just', 'don', "'t", 'like', 'most', 'mexican', 'food.', 'it', "'s", 'too', 'spicy', '.']
PRED 169: i just don 't like most mexican food. it 's too spicy .
PRED SCORE: -0.0000
GOLD 169: i just don 't like most mexican food. it 's too spicy .
GOLD SCORE: -290.0051

SENT 170: ['how', 'are', 'you', '?']
PRED 170: how are you ?
PRED SCORE: -0.0000
GOLD 170: how are you ?
GOLD SCORE: -inf

SENT 171: ['i', 'don', "'t", 'have', 'a', 'favorite', 'song', '.']
PRED 171: i don 't have a favorite song .
PRED SCORE: -0.0000
GOLD 171: i don 't have a fa

## seq2seq with attention (no copy attention)

### copy input to output

In [0]:
# Baseline: train on the identity dataset without -copy_attn; checkpoints use the prefix feedback6k_model_nocopy.
!onmt_train -data data/feedback6k.atok.low -save_model feedback6k_model_nocopy -gpu_ranks 0 

In [0]:
# Decode the validation responses with the step-5000 no-copy checkpoint,
# then score the predictions against the same responses with multi-bleu.
!onmt_translate -gpu 0 -model feedback6k_model_nocopy_step_5000.pt -src data/feedback6k/response_valid_fb.txt.atok -tgt data/feedback6k/response_valid_fb.txt.atok -replace_unk -verbose -output nocopy_response_valid_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/response_valid_fb.txt.atok < nocopy_response_valid_fb.pred.atok

In [6]:
# Same evaluation as the validation cell, run on the test responses.
!onmt_translate -gpu 0 -model feedback6k_model_nocopy_step_5000.pt -src data/feedback6k/response_test_fb.txt.atok -tgt data/feedback6k/response_test_fb.txt.atok -replace_unk -verbose -output nocopy_response_test_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/response_test_fb.txt.atok < nocopy_response_test_fb.pred.atok

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PRED 167: " i have never ridden a horse but i like them , "
PRED SCORE: -0.1435
GOLD 167: " i have never ridden a horse but i like them , "
GOLD SCORE: -0.1435

SENT 168: ['"', 'again', ',', 'no', 'more', 'talk', 'about', 'sleep', '!', '"']
PRED 168: " again , no more talk about sleep ! "
PRED SCORE: -0.2265
GOLD 168: " again , no more talk about sleep ! "
GOLD SCORE: -0.2265

SENT 169: ['i', 'just', 'don', "'t", 'like', 'most', 'mexican', 'food.', 'it', "'s", 'too', 'spicy', '.']
PRED 169: i just don 't like most mexican food. it 's too spicy .
PRED SCORE: -1.0338
GOLD 169: i just don 't like most mexican food. it 's too spicy .
GOLD SCORE: -1.0338

SENT 170: ['how', 'are', 'you', '?']
PRED 170: how are you ?
PRED SCORE: -0.0002
GOLD 170: how are you ?
GOLD SCORE: -0.0002

SENT 171: ['i', 'don', "'t", 'have', 'a', 'favorite', 'song', '.']
PRED 171: i don 't have a favorite song .
PRED SCORE: -0.0035
GOLD 171: i don 't ha

### Convert feedback to regex feedback


In [0]:
# Baseline: train the response -> regex-processed model without -copy_attn; checkpoints are saved under models/regex_nocopy/.
!onmt_train -data data/regex_feedback6k.atok.low -save_model models/regex_nocopy/regex_nocopy_feedback6k_model -gpu_ranks 0 

In [0]:
# Decode the validation responses with the step-15000 regex no-copy checkpoint
# and score against the regex-processed references with multi-bleu.
!onmt_translate -gpu 0 -model models/regex_nocopy/regex_nocopy_feedback6k_model_step_15000.pt -src data/feedback6k/response_valid_fb.txt.atok -tgt data/feedback6k/processed_valid_fb.txt.atok -replace_unk -verbose -output regex_nocopy_processed_valid_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/processed_valid_fb.txt.atok < regex_nocopy_processed_valid_fb.pred.atok

In [0]:
# Same evaluation as the validation cell, run on the test split.
!onmt_translate -gpu 0 -model models/regex_nocopy/regex_nocopy_feedback6k_model_step_15000.pt -src data/feedback6k/response_test_fb.txt.atok -tgt data/feedback6k/processed_test_fb.txt.atok -replace_unk -verbose -output regex_nocopy_processed_test_fb.pred.atok
!perl tools/multi-bleu.perl data/feedback6k/processed_test_fb.txt.atok < regex_nocopy_processed_test_fb.pred.atok

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
PRED 167: " i have never ridden a horse but i like them , "
PRED SCORE: -0.0044
GOLD 167: " i have never ridden a horse but i like them , "
GOLD SCORE: -0.0044

SENT 168: ['"', 'again', ',', 'no', 'more', 'talk', 'about', 'sleep', '!', '"']
PRED 168: " again , no more talk about sleep ! "
PRED SCORE: -0.0101
GOLD 168: " again , no more talk about sleep ! "
GOLD SCORE: -0.0101

SENT 169: ['i', 'just', 'don', "'t", 'like', 'most', 'mexican', 'food.', 'it', "'s", 'too', 'spicy', '.']
PRED 169: i just don 't like most mexican food. it 's too spicy .
PRED SCORE: -0.2436
GOLD 169: i just don 't like most mexican food. it 's too spicy .
GOLD SCORE: -0.2436

SENT 170: ['how', 'are', 'you', '?']
PRED 170: how are you ?
PRED SCORE: -0.0000
GOLD 170: how are you ?
GOLD SCORE: -0.0000

SENT 171: ['i', 'don', "'t", 'have', 'a', 'favorite', 'song', '.']
PRED 171: i don 't have a favorite song .
PRED SCORE: -0.0095
GOLD 171: i don 't ha