# Prepare data

Download lang-8-20111007-2.0.zip https://sites.google.com/site/naistlang8corpora/

In [4]:
%%sh

cd data
unzip lang-8-20111007-2.0.zip

Archive:  lang-8-20111007-2.0.zip
   creating: lang-8-20111007-2.0/
  inflating: lang-8-20111007-2.0/lang-8-20111007-L1-v2.dat  
   creating: __MACOSX/
   creating: __MACOSX/lang-8-20111007-2.0/
  inflating: __MACOSX/lang-8-20111007-2.0/._lang-8-20111007-L1-v2.dat  
  inflating: lang-8-20111007-2.0/README  
  inflating: __MACOSX/lang-8-20111007-2.0/._README  
  inflating: __MACOSX/._lang-8-20111007-2.0  


### Convert the scripts from python2 to python3

In [60]:
%%sh

cd data/scripts
2to3 -w m2_scripts/sort_m2.py
2to3 -w m2_scripts/get_num_lines.py
2to3 -w m2_scripts/convert_m2_to_parallel.py
2to3 -w lang-8_scripts/langidfilter.py
2to3 -w nltk_scripts/word-tokenize.py

--- m2_scripts/sort_m2.py	(original)
+++ m2_scripts/sort_m2.py	(refactored)
@@ -32,7 +32,7 @@
 
 	for lines in dataset_lines:
 		if len(lines) == 0:
-			print line
+			print(line)
 	sortedset = sorted(dataset_lines, key=lambda x: (len(x), -len(x[0].split())))
 	for lines in sortedset:
 		if len(lines) - 1 >= min_annots :	# len(lines) = sentence + annots


RefactoringTool: Skipping optional fixer: buffer
RefactoringTool: Skipping optional fixer: idioms
RefactoringTool: Skipping optional fixer: set_literal
RefactoringTool: Skipping optional fixer: ws_comma
RefactoringTool: Refactored m2_scripts/sort_m2.py
RefactoringTool: Files that were modified:
RefactoringTool: m2_scripts/sort_m2.py


### Install dependencies

In [56]:
!pip install langid nltk

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [18]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### run main processing script

In [1]:
%load_ext autoreload

%autoreload 2

In [3]:
%%sh

cd data
./prepare_data.sh

Process is terminated.


In [2]:
import os
BASE_DIR = os.getcwd()

DATA_DIR = f'{BASE_DIR}/data'
MODEL_DIR = f'{BASE_DIR}/models'
SCRIPTS_DIR = f'{BASE_DIR}/scripts'
SOFTWARE_DIR = f'{BASE_DIR}/software'

NUCLE_TAR = f'{DATA_DIR}/release3.3.tar.bz2'
LANG8V2 = f'{DATA_DIR}/lang-8-20111007-2.0/lang-8-20111007-L1-v2.dat'

# path to scripts directories
M2_SCRIPTS = f'{DATA_DIR}/scripts/m2_scripts'
MOSES_SCRIPTS = f'{DATA_DIR}scripts/moses_scripts'
LANG8_SCRIPTS = f'{DATA_DIR}/scripts/lang-8_scripts'
NLTK_SCRIPTS = f'{DATA_DIR}/scripts/nltk_scripts'

REPLACE_UNICODE = f'{MOSES_SCRIPTS}/replace-unicode-punctuation.perl'
REMOVE_NON_PRINT = f'{MOSES_SCRIPTS}/remove-non-printing-char.perl'
NORMALIZE_PUNCT = f'{MOSES_SCRIPTS}/normalize-punctuation.perl'

TOKENIZE = f'{NLTK_SCRIPTS}/word-tokenize.py'

def make_directory(pathname):
    import os
    if os.path.exists(pathname):
        print(f'Folder already exists at {pathname}, skipping..')
    else:
        print(f'Folder {pathname} doesn\'t exist, creating..')
        os.makedirs(pathname)


TEMP = f'{DATA_DIR}/tmp2'
make_directory(TEMP)

/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/datascripts/moses_scripts/normalize-punctuation.perl


Folder already exists at /home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2, skipping..


In [4]:
# NUCLE
#########

import tarfile
import os

with tarfile.open(NUCLE_TAR, "r:bz2") as tar:
    tar.extractall(TEMP)

# echo "[`date`] Preparing NUCLE data..." >&2
# bzip2 -dc $NUCLE_TAR | tar xvf - -C tmp/
NUCLE_DIR = f'{TEMP}/release3.3'
NUCLE_DEV_DIR = f'{NUCLE_DIR}/nucle-dev'
NUCLE_TRAIN_DIR = f'{NUCLE_DIR}/nucle-train'

make_directory(NUCLE_DEV_DIR)
make_directory(NUCLE_TRAIN_DIR)

import data.scripts.m2_scripts.sort_m2 as sort_m2
import data.scripts.m2_scripts.get_num_lines as get_num_lines

NUCLE_SORT_M2 = f'{TEMP}/nucle.sort.m2'
NUCLE_SPLIT = f'{TEMP}/nucle.split'

sort_m2.collect_lines(f'{NUCLE_DIR}/data/conll14st-preprocessed.m2', 
                      NUCLE_SORT_M2, 
                      1,
                      True)

split_file_names = get_num_lines.split_lines(NUCLE_SORT_M2,
                                             NUCLE_SPLIT,
                                             numparts=4,
                                             shuffle=True)

files_to_cat = split_file_names + [f'{NUCLE_SORT_M2}.rem']

import shutil

with open(f'{NUCLE_DEV_DIR}/nucle-dev.m2', 'wb') as wfd:
    with open(f'{TEMP}/nucle.split.1.m2', 'rb') as fd:
        shutil.copyfileobj(fd, wfd)

with open(f'{TEMP}/nucle.combined.m2', 'wb') as wfd:
    for f in files_to_cat:
        with open(f, 'rb') as fd:
            shutil.copyfileobj(fd, wfd)
            
split_file_names = get_num_lines.split_lines(f'{TEMP}/nucle.combined.m2',
                                             f'{TEMP}/nucle-train',
                                             numparts=1)


with open(f'{NUCLE_TRAIN_DIR}/nucle-train.m2', 'wb') as wfd:
    with open(split_file_names[0], 'rb') as fd:
        shutil.copyfileobj(fd, wfd)
            
from data.scripts.m2_scripts.convert_m2_to_parallel import convert 

convert(f'{NUCLE_TRAIN_DIR}/nucle-train.m2',
       f'{NUCLE_TRAIN_DIR}/nucle-train.tok.src',
       f'{NUCLE_TRAIN_DIR}/nucle-train.tok.trg',)

convert(f'{NUCLE_DEV_DIR}/nucle-dev.m2',
       f'{NUCLE_DEV_DIR}/nucle-dev.tok.src',
       f'{NUCLE_DEV_DIR}/nucle-dev.tok.trg',)

# removing empty target sentence pairs
with open(f'{NUCLE_DEV_DIR}/nucle-dev.non_empty.tok.src', 'w') as wf1, open(f'{NUCLE_DEV_DIR}/nucle-dev.non_empty.tok.trg', 'w') as wf2:
    with open(f'{NUCLE_DEV_DIR}/nucle-dev.tok.src') as rf1, open(f'{NUCLE_DEV_DIR}/nucle-dev.tok.trg') as rf2:
        for rf1_line, rf2_line in zip(rf1, rf2):
            if (rf1_line is not "\n" and rf2_line is not "\n"):
                wf1.write(rf1_line)
                wf2.write(rf2_line)
    

['nucle.split.1.m2', 'nucle.split.2.m2', 'nucle.sort.m2', 'release3.3', 'nucle.split.3.m2', 'nucle.split.4.m2', '.ipynb_checkpoints', 'nucle.combined.m2', 'nucle.sort.m2.rem']
['nucle.split.1.m2', 'nucle.split.2.m2', 'nucle.sort.m2', 'release3.3', 'nucle.split.3.m2', 'nucle.split.4.m2', '.ipynb_checkpoints', 'nucle.combined.m2', 'nucle.sort.m2.rem']
Folder already exists at /home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2/release3.3/nucle-dev, skipping..
Folder already exists at /home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2/release3.3/nucle-train, skipping..


['/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2/nucle.split.1.m2', '/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2/nucle.split.2.m2', '/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2/nucle.split.3.m2', '/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2/nucle.split.4.m2', '/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/tmp2/nucle.sort.m2.rem']


In [105]:
# LANG-8 v2
#############
# # Preparation of Lang-8 data
# echo "[`date`] Preparing Lang-8 data... (NOTE:Can take several hours, due to LangID.py filtering...)" >&2
L2='English' 				 # Learning language, i.e. extract only English learners text

LANG8_DIR = f'{DATA_DIR}/lang-8'
make_directory(LANG8_DIR)

Folder already exists at /home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/lang-8, skipping..


In [None]:

python3.6 $LANG8_SCRIPTS/extract.py -i $LANG8V2 -o $DATA_DIR/tmp/ -l2 $L2
cat $DATA_DIR/tmp/lang-8-20111007-L1-v2.dat.processed | perl -p -e 's@\[sline\].*?\[\\/sline\]@@sg' | sed 's/\[\\\/sline\]//g' | sed 's/\[\\\/f-[a-zA-Z]*\]//g' | sed 's/\[f-[a-zA-Z]*\]//g' | sed 's/rŠëyËb¢{//g' > $DATA_DIR/tmp/lang-8.$L2.cleanedup
rm $DATA_DIR/tmp/lang-8-20111007-L1-v2.dat.processed
python3.6 $LANG8_SCRIPTS/langidfilter.py $DATA_DIR/tmp/lang-8.$L2.cleanedup > $DATA_DIR/tmp/lang-8.$L2.extracted
rm $DATA_DIR/tmp/lang-8.$L2.cleanedup
python3.6 $LANG8_SCRIPTS/get_parallel.py -i $DATA_DIR/tmp/lang-8.$L2.extracted -o lang-8 -d $DATA_DIR/tmp/lang-8/

for EXT in src trg; do
    cat $DATA_DIR/tmp/lang-8/lang-8.$EXT | $REPLACE_UNICODE | $REMOVE_NON_PRINT | sed  's/\\"/\"/g' | sed 's/\\t/ /g' | $NORMALIZE_PUNCT | python3.6  $TOKENIZER  > $DATA_DIR/lang-8/lang-8.tok.$EXT
done

# Preparing the concatenated training data.
mkdir -p $DATA_DIR/concat-train
cat $DATA_DIR/nucle-train/nucle-train.tok.src $DATA_DIR/lang-8/lang-8.tok.src > $DATA_DIR/concat-train/concat-train.tok.src
cat $DATA_DIR/nucle-train/nucle-train.tok.trg $DATA_DIR/lang-8/lang-8.tok.trg > $DATA_DIR/concat-train/concat-train.tok.trg
mkdir -p $DATA_DIR/concat-train/cleaned/
$MOSES_SCRIPTS/clean-corpus-n.perl $DATA_DIR/concat-train/concat-train.tok src trg $DATA_DIR/concat-train/cleaned/concat-train.clean.tok 1 80


ln -s concat-train/cleaned/concat-train.clean.tok.src train.tok.src
ln -s concat-train/cleaned/concat-train.clean.tok.trg train.tok.trg
ln -s nucle-dev/nucle-dev.non_empty.tok.src dev.tok.src
ln -s nucle-dev/nucle-dev.non_empty.tok.trg dev.tok.trg
ln -s nucle-dev/nucle-dev.tok.src dev.all.tok.src
ln -s nucle-dev/nucle-dev.m2 dev.all.m2

# Download le models

In [None]:
%%sh

cd models
./download.sh

# Download external software required

In [62]:
%%sh

cd software
./download.sh

Downloading Fairseq from https://github.com/pytorch/fairseq (rev:388c520be21752cacb9fe3b1712038f32e0e9a5f)
Archive:  388c520be21752cacb9fe3b1712038f32e0e9a5f.zip
388c520be21752cacb9fe3b1712038f32e0e9a5f
   creating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/
  inflating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/.gitignore  
  inflating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/CONTRIBUTING.md  
  inflating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/LICENSE  
  inflating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/PATENTS  
  inflating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/README.md  
  inflating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/distributed_train.py  
  inflating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/eval_lm.py  
   creating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/examples/
   creating: fairseq-py-388c520be21752cacb9fe3b1712038f32e0e9a5f/examples/language_model/
  inflating:

--2020-09-15 01:41:56--  https://github.com/shamilcm/fairseq-py/archive/388c520be21752cacb9fe3b1712038f32e0e9a5f.zip
Resolving github.com (github.com)... 52.74.223.119
Connecting to github.com (github.com)|52.74.223.119|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/shamilcm/fairseq-py/zip/388c520be21752cacb9fe3b1712038f32e0e9a5f [following]
--2020-09-15 01:41:57--  https://codeload.github.com/shamilcm/fairseq-py/zip/388c520be21752cacb9fe3b1712038f32e0e9a5f
Resolving codeload.github.com (codeload.github.com)... 13.250.162.133
Connecting to codeload.github.com (codeload.github.com)|13.250.162.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘388c520be21752cacb9fe3b1712038f32e0e9a5f.zip’

     0K .......... .......... .......... .......... ..........  484K
    50K .......... .......... .......... .......... .......... 20.5M
   100K .......... .......... ......

# Preprocess for training

In [63]:
%%sh

cd training
./preprocess.sh

Namespace(alignfile=None, destdir='processed/bin', joined_dictionary=False, nwordssrc=30000, nwordstgt=30000, only_source=False, output_format='binary', padding_factor=8, source_lang='src', srcdict=None, target_lang='trg', testpref=None, tgtdict=None, thresholdsrc=0, thresholdtgt=0, trainpref='processed/train', validpref='processed/dev')
| [src] Dictionary: 29693 types
| [src] processed/train.src: 1298756 sents, 20448818 tokens, 0.0% replaced by <unk>
| [src] Dictionary: 29693 types
| [src] processed/dev.src: 5448 sents, 154850 tokens, 0.00969% replaced by <unk>
| [trg] Dictionary: 29796 types
| [trg] processed/train.trg: 1298756 sents, 23896744 tokens, 0.0% replaced by <unk>
| [trg] Dictionary: 29796 types
| [trg] processed/dev.trg: 5448 sents, 153243 tokens, 0.00718% replaced by <unk>
| Wrote preprocessed data to processed/bin


+ source ../paths.sh
++++ dirname ../paths.sh
+++ cd ..
+++ pwd
++ BASE_DIR=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018
++ DATA_DIR=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data
++ MODEL_DIR=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/models
++ SCRIPTS_DIR=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/scripts
++ SOFTWARE_DIR=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/software
+ src_ext=src
+ trg_ext=trg
+ train_data_prefix=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/train
+ dev_data_prefix=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/dev
+ dev_data_m2=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/data/dev.all.m2
+ SUBWORD_NMT=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/software/subword-nmt
+ FAIRSEQPY=/home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/software/fairseq-py
+ mkdir -p models/bpe_model
+ bpe_operations=30000
+ '[' '!' -d /home/ec2-user/SageMaker/grip-grammar/mlconvgec2018/software/subword-nmt

# Create container for training and deployment

In [1]:
%%sh

cd ..
chmod +x create_container.sh 

./create_container.sh grip-grammar-fairseq-six

Getting from region ap-southeast-1 and account 868408476013
Login Succeeded
Login Succeeded
Sending build context to Docker daemon  8.208GB
Step 1/22 : FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04
10.0-cudnn7-devel-ubuntu18.04: Pulling from nvidia/cuda
171857c49d0f: Pulling fs layer
419640447d26: Pulling fs layer
61e52f862619: Pulling fs layer
c118dad7e37a: Pulling fs layer
f3015ef64b84: Pulling fs layer
4c97ef225f71: Pulling fs layer
9bb8cad92ae2: Pulling fs layer
40a3f698c08c: Pulling fs layer
19b13b8eb432: Pulling fs layer
0e5a493eaab5: Pulling fs layer
73a33e481892: Pulling fs layer
4c97ef225f71: Waiting
19b13b8eb432: Waiting
9bb8cad92ae2: Waiting
40a3f698c08c: Waiting
c118dad7e37a: Waiting
73a33e481892: Waiting
f3015ef64b84: Waiting
61e52f862619: Verifying Checksum
61e52f862619: Download complete
419640447d26: Download complete
171857c49d0f: Verifying Checksum
171857c49d0f: Download complete
171857c49d0f: Pull complete
c118dad7e37a: Verifying Checksum
c118dad7e37a: Download comp

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



lol


# Setup sagemaker session

In [5]:
!pip install --upgrade sagemaker

Collecting sagemaker
  Downloading sagemaker-2.15.1.tar.gz (307 kB)
[K     |████████████████████████████████| 307 kB 11.0 MB/s eta 0:00:01
Collecting google-pasta
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting smdebug-rulesconfig==0.1.5
  Using cached smdebug_rulesconfig-0.1.5-py2.py3-none-any.whl (6.2 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-2.15.1-py2.py3-none-any.whl size=433611 sha256=2e0405fb61c3543fb335e788f0cd549766acda54417f4ac0aae38550a9d337aa
  Stored in directory: /home/ec2-user/.cache/pip/wheels/90/fe/7c/2226226666ad0da8a21d77e8c5093c71245cb38f548a24627f
Successfully built sagemaker
Installing collected packages: google-pasta, smdebug-rulesconfig, sagemaker
  Attempting uninstall: smdebug-rulesconfig
    Found existing installation: smdebug-rulesconfig 0.1.4
    Uninstalling smdebug-rulesconfig-0.1.4:
      Successfully uninst

In [6]:
!pip list

Package                            Version            
---------------------------------- -------------------
alabaster                          0.7.12             
anaconda-client                    1.7.2              
anaconda-project                   0.8.3              
argh                               0.26.2             
asn1crypto                         1.3.0              
astroid                            2.4.2              
astropy                            4.0                
atomicwrites                       1.3.0              
attrs                              19.3.0             
autopep8                           1.4.4              
autovizwidget                      0.16.0             
awscli                             1.18.149           
Babel                              2.8.0              
backcall                           0.1.0              
backports.shutil-get-terminal-size 1.0.0              
bcrypt                             3.2.0              
beautifuls

In [1]:
import sagemaker

sagemaker_session = sagemaker.Session()
region =  sagemaker_session.boto_session.region_name
account = sagemaker_session.boto_session.client('sts').get_caller_identity().get('Account')

bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/grip-grammar-training/processed/bin'

role = sagemaker.get_execution_role()

In [4]:
print(sagemaker.__version__)

1.72.1


### Upload data to S3 bucket

In [74]:
inputs = sagemaker_session.upload_data(path='training/processed/bin', bucket=bucket, key_prefix=prefix)

In [71]:
hyperparameters = {
    "encoder-embed-dim": 500,
    "decoder-embed-dim": 500,
    "decoder-out-embed-dim": 500,
    "dropout": 0.2,
    "clip-norm": 0.1,
    "lr": 0.25,
    "min-lr": 1e-4,
    "encoder-layers": '[(1024,3)] * 7',
    "decoder-layers": '[(1024,3)] * 7',
    "momentum": 0.99,
    "max-epoch": 100,
    "batch-size": 96,
    "seed": 1000,
    "arch": "fconv",
    'no-save': "False",
    'no-epoch-checkpoints': "False",
    "nbest": 12,
    "beam": 12
}

In [72]:
from sagemaker.estimator import Estimator

algorithm_name = "grip-grammar-fairseq-six"
image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, algorithm_name)

estimator = Estimator(image_uri=image,
                     role=role,
                     instance_count=1,
                     instance_type='ml.p3.8xlarge',
                     volume_size=64,
                     output_path='s3://{}/output'.format(bucket),
                     sagemaker_session=sagemaker_session,
                     hyperparameters=hyperparameters)

In [None]:
estimator.fit(inputs=inputs)

2020-09-25 08:36:40 Starting - Starting the training job...
2020-09-25 08:36:42 Starting - Launching requested ML instances......
2020-09-25 08:37:53 Starting - Preparing the instances for training......
2020-09-25 08:39:09 Downloading - Downloading input data
2020-09-25 08:39:09 Training - Downloading the training image..............[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mStarting the training.[0m
[34m{'encoder-layers': '[(1024,3)] * 7', 'beam': '12', 'seed': '1000', 'lr': '0.25', 'clip-norm': '0.1', 'encoder-embed-dim': '500', 'max-epoch': '100', 'nbest': '12', 'no-save': 'False', 'decoder-embed-dim': '500', 'momentum': '0.99', 'batch-size': '96', 'decoder-layers': '[(1024,3)] * 7', 'dropout': '0.2', 'decoder-out-embed-dim': '500', 'arch': 'fconv', 'min-lr': '0.0001', 'no-epoch-checkpoints': 'False'}[0m
[34m['--encoder-layers', '[(1024,3)] * 7', '--beam', '12', '--seed', '1000', '--lr',

In [2]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

class JSONPredictor(Predictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super(JSONPredictor, self).__init__(endpoint_name, sagemaker_session, JSONSerializer(), JSONDeserializer())

In [28]:
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c5.9xlarge', predictor_cls=JSONPredictor)

NameError: name 'estimator' is not defined

In [11]:
import html

text_input = "It is becoming even more convini@@ ent than talking thr@@ ought the phone since it is free . We discuss every thing we like and we have more personal space ."

result = predictor.predict(text_input)
#  Some characters are escaped HTML-style requiring to unescape them before printing
print(html.unescape(result))

NameError: name 'predictor' is not defined

trying to load the model from uri

In [3]:
uri = "s3://sagemaker-ap-southeast-1-868408476013/output/grip-grammar-fairseq-six-2020-09-25-08-36-40-383/output/model.tar.gz"
sagemaker.s3.S3Downloader.list(uri, sagemaker_session)
trained_model_location = uri

## Already have a model and looking to simply import it?

In [4]:
from sagemaker import Model

algorithm_name = "grip-grammar-fairseq-six"
image_uri = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, algorithm_name)

model = Model(image_uri=image_uri,
              role=role,
              model_data=trained_model_location,
              predictor_cls=JSONPredictor,
             )

In [None]:
predictor = model.deploy(initial_instance_count=1, instance_type='ml.c5.9xlarge')

----

In [17]:
import html
import json
text_input = ["I walk to the store and I bought milk.", "I walk to the store and I bought milk .", "the quick brown fox jumps over the lazy dog.", "I will eat fish for dinner and drank milk."]

result = predictor.predict(text_input)

# print(html.unescape(result))
print(result)
# a  = result.split("\n")
# print(len(a))
# for _ in a:
#     print(_)


ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from model with message "<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<title>500 Internal Server Error</title>
<h1>Internal Server Error</h1>
<p>The server encountered an internal error and was unable to complete your request. Either the server is overloaded or there is an error in the application.</p>
". See https://ap-southeast-1.console.aws.amazon.com/cloudwatch/home?region=ap-southeast-1#logEventViewer:group=/aws/sagemaker/Endpoints/grip-grammar-fairseq-six-2020-10-17-03-01-57-063 in account 868408476013 for more information.

In [69]:
result[0]

{'query': 'I walk to the store and I bought milk.',
 'result': 'I walk to the store and I bought milk .',
 'edits': [{'o_start': 8,
   'o_end': 9,
   'o_str': 'milk.',
   'c_start': 8,
   'c_end': 10,
   'c_str': 'milk .',
   'type': 'R:ORTH'}]}

# Simply want to use the endpoint?

In [19]:
result.split('\n')

['0 ||| It is becoming even more convenient than talking to the phone since it is free . We discuss every thing we like and we have more personal space . ||| F0= -0.19564148783683777 ||| None',
 '0 ||| It is becoming even more convenient than talking through the phone since it is free . We discuss every thing we like and we have more personal space . ||| F0= -0.232662633061409 ||| None',
 '0 ||| It is becoming even more convenient than talking thr ought the phone since it is free . We discuss every thing we like and we have more personal space . ||| F0= -0.2348337322473526 ||| None',
 '0 ||| It is becoming even more convenient than talking to the phone since it is free . We discussed every thing we like and we have more personal space . ||| F0= -0.26183924078941345 ||| None',
 '0 ||| It is becoming even more convenient than talking to the phone since it is free . We discuss every thing we like , and we have more personal space . ||| F0= -0.26989850401878357 ||| None',
 '0 ||| It is bec

In [25]:
!pip install kenlm

Collecting kenlm
  Downloading kenlm-0.tar.gz (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 16.0 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: kenlm
  Building wheel for kenlm (setup.py) ... [?25ldone
[?25h  Created wheel for kenlm: filename=kenlm-0.0.0-cp36-cp36m-linux_x86_64.whl size=2044675 sha256=f8f026fa7124de64f7e54d622fcc6040a2aeaa3f3e240d5217c3611935bc9419
  Stored in directory: /home/ec2-user/.cache/pip/wheels/5d/bb/b0/efb1d66f2fac70174e06e0fdec1eaf6e81c52418c3678189cb
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.0.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [28]:
import ..fairseq.nbestreranker.augmenter

SyntaxError: invalid syntax (<ipython-input-28-565682efb64f>, line 1)