In [1]:
import numpy as np
import os, torch, subprocess, csv, re
torch.cuda.is_available()

True

## Prepare Files needed for this lab

Copy the code files for the tagger by Liu et al. from the input directory to the working directory. Create directories for outputs and input files.

In [2]:
# Copy over code files from github repository 
!cp -r /kaggle/input/liutagger/* ./
!mkdir -p checkpoint
!cp /kaggle/input/trainedtaggermodel/* ./checkpoint/
!mkdir ./inputs
!cp /kaggle/input/my-tagger-test/my_test.txt ./inputs/my_test.txt
!mkdir ./outputs
!ls 

LICENSE			   docs        model		 seq_wc.py
README.md		   eval_w.py   outputs		 train_w.py
__notebook_source__.ipynb  eval_wc.py  requirements.txt  train_wc.py
checkpoint		   inputs      seq_w.py


Function for writing the system output one gets from running the tagger program.

In [3]:
def writeLogOut(logTxt, filePath):
    """Write captured program output to a UTF-8 text file.

    Args:
        logTxt: Log contents as ``str``, or as ``bytes``/``bytearray`` (for
            example the return value of ``subprocess.check_output``), which
            is decoded as UTF-8 first.
        filePath: Destination path; an existing file is overwritten.
    """
    if isinstance(logTxt, (bytes, bytearray)):
        # Replace undecodable bytes instead of raising: one stray byte should
        # not discard the log of a long training run.
        logTxt = logTxt.decode('utf-8', errors='replace')
    # Write with the same encoding used for decoding, independent of the locale.
    with open(filePath, 'w', encoding='utf-8') as f:
        f.write(logTxt)

In [4]:
################################################################################
# Command template for one training run of the tagger.
# Placeholders: {0} = directory holding the data files, {1} = training file
# name, {2} = number of epochs.
template = ['python -W ignore train_wc.py',
            '--emb_file /kaggle/input/glove6b100dtxt/glove.6B.100d.txt', 
            '--train_file {0}/{1}', 
            '--dev_file {0}/valid.txt', 
            '--test_file {0}/test.txt', 
            '--checkpoint checkpoint/ner_ --epoch {2}', 
            '--caseless --fine_tune --high_way --co_train --least_iters 100']
inPath = '/kaggle/input/conll003-englishversion'
infile = 'train.txt'
# Pass `infile` itself so the variable and the command cannot drift apart.
cmd = ' '.join(template).format(inPath, infile, 30)
print(cmd)

python -W ignore train_wc.py --emb_file /kaggle/input/glove6b100dtxt/glove.6B.100d.txt --train_file /kaggle/input/conll003-englishversion/train.txt --dev_file /kaggle/input/conll003-englishversion/valid.txt --test_file /kaggle/input/conll003-englishversion/test.txt --checkpoint checkpoint/ner_ --epoch 30 --caseless --fine_tune --high_way --co_train --least_iters 100


Running the baseline run  

Note: this is done three times to get a range of performance numbers

In [5]:
# Three baseline runs. The training call and the log write below are commented
# out, so executing this cell only prints the run headers.
for n in range(3):
    print('=== Run %d ==='%n)
    #baseline = subprocess.check_output(cmd, shell=True)
    #writeLogOut(baseline, './outputs/baseline_%d.log'%n)

=== Run 0 ===
=== Run 1 ===
=== Run 2 ===


<a href="outputs/baseline_0.log">baseline_0.log</a>, 
<a href="outputs/baseline_1.log">baseline_1.log</a>,
<a href="outputs/baseline_2.log">baseline_2.log</a>  

<a href="checkpoint/ner_cwlm_lstm_crf.json">ner_cwlm_lstm_crf.json</a>,
<a href="checkpoint/ner_cwlm_lstm_crf.model">ner_cwlm_lstm_crf.model</a>

## Runs of partial files

### Splitting of Input Files
Read in the training file

In [6]:
# Read the training file into an array holding one string per line
# (splitting on '\n' leaves a final '' entry when the file ends with a newline)
with open('/kaggle/input/conll003-englishversion/train.txt', 'r') as f:
    raw = np.array(f.read().split('\n'))

Produce the split files: the first file has the first 1/10 of the documents, the second file has the first 2/10, and so on, until the last file contains the entire content.

In [7]:
docStarts = np.where(raw=='-DOCSTART- -X- -X- O')[0] # indices of the lines that open each document
docEnds = np.hstack([docStarts[1:],raw.size]) # each document ends where the next one starts; the last ends at the end of `raw`
nDocs = docStarts.size # num of total documents in training file
nFiles = 10 # number of cumulative split files to produce

# Cumulative document count per split file: round(k/nFiles * nDocs) for k = 1..nFiles
docsPerFile = np.rint(np.arange(1,nFiles+1)/nFiles*nDocs).astype(int)

In [8]:
# Emit the cumulative split files train00.txt, train01.txt, ...: each one holds
# every line from the start of `raw` up to the end of its last document.
for n, docCount in enumerate(docsPerFile):
    lastLine = docEnds[:docCount][-1]  # end index of the final document included
    splitPath = os.path.join('inputs', 'train%02d.txt' % n)
    with open(splitPath, 'w') as splitFile:
        splitFile.write('\n'.join(raw[:lastLine]))

################################################################################

Look at the number of lines in each of the input files

In [9]:
!wc -l inputs/train*.txt

   20342 inputs/train00.txt
   42425 inputs/train01.txt
   62177 inputs/train02.txt
   84506 inputs/train03.txt
  108704 inputs/train04.txt
  133690 inputs/train05.txt
  155268 inputs/train06.txt
  178879 inputs/train07.txt
  198281 inputs/train08.txt
  219554 inputs/train09.txt
 1203826 total


### Running Partially Split Files

Run training on the split files as well

In [10]:
# Partial-file runs read their training data from the split directory.
inPath = './inputs'
# Ten initially empty log entries, indexed by split-file number.
log = [''] * 10

In [11]:
# The loop below is disabled by wrapping it in a string literal: running this
# cell trains nothing and only echoes the string as the cell output.
'''
for n in range(5,9):
    inFile = 'train%02d.txt'%n
    cmd = ' '.join(template).format(inPath,inFile,30)
    print('==== Running File %d of %d ===='%(n,len(log)))
    print(cmd)
    log[n] = subprocess.check_output(cmd, shell=True)
    
    writeLogOut(log[n], './outputs/part%02d.log'%n)
print("==== Done ====")
'''

'\nfor n in range(5,9):\n    inFile = \'train%02d.txt\'%n\n    cmd = \' \'.join(template).format(inPath,inFile,30)\n    print(\'==== Running File %d of %d ====\'%(n,len(log)))\n    print(cmd)\n    log[n] = subprocess.check_output(cmd, shell=True)\n    \n    writeLogOut(log[n], \'./outputs/part%02d.log\'%n)\nprint("==== Done ====")\n'

### Download links for output logs from runs of partial files

This allows for results to be produced and retrieved over many different runs instead of all at once.

<a href="outputs/part00.log">part00.log</a>,
<a href="outputs/part01.log">part01.log</a>, 
<a href="outputs/part02.log">part02.log</a>, 
<a href="outputs/part03.log">part03.log</a>, 
<a href="outputs/part04.log">part04.log</a>,  
<a href="outputs/part05.log">part05.log</a>, 
<a href="outputs/part06.log">part06.log</a>, 
<a href="outputs/part07.log">part07.log</a>, 
<a href="outputs/part08.log">part08.log</a>, 
<a href="outputs/part09.log">part09.log</a>  

## Remapping All Entities to Generic

In [12]:
def remapFiles(filePathIn, filePathOut):
    """Copy a space-delimited tagged file, collapsing every entity type to ``ENT``.

    Each non-empty line is split on single spaces and its last field is
    treated as the entity tag: a tag such as ``B-PER`` or ``I-LOC`` becomes
    ``B-ENT`` / ``I-ENT``, while tags without a ``-`` (e.g. ``O``) are left
    unchanged. The output line is the first three fields followed by the
    remapped tag. Empty lines are copied through, and every output line ends
    with a single newline.

    Args:
        filePathIn: Path of the file to read.
        filePathOut: Path of the file to write; overwritten if it exists.
    """
    remap = re.compile(r'([\w]\-)(.+)')

    # Lines are handled as plain text rather than through the csv module:
    # csv's default quoting treats '"' tokens as quote characters (merging or
    # re-quoting fields) and its writer emits '\r\n' line endings.
    # The with-block closes both files even if an error occurs midway.
    with open(filePathIn, 'r') as fIn, open(filePathOut, 'w') as fOut:
        for line in fIn:
            fields = line.rstrip('\n').split(' ')
            if fields == ['']:
                # Blank separator line: keep it as-is.
                fOut.write('\n')
                continue
            fields = fields[:3] + [remap.sub(r'\g<1>ENT', fields[-1])]
            fOut.write(' '.join(fields) + '\n')

In [13]:
!mkdir ./inputs/remapped
remapFiles('/kaggle/input/conll003-englishversion/train.txt',
           './inputs/remapped/train.txt')
remapFiles('/kaggle/input/conll003-englishversion/test.txt',
           './inputs/remapped/test.txt')
remapFiles('/kaggle/input/conll003-englishversion/valid.txt',
           './inputs/remapped/valid.txt')

Check the remapped files have the same number of lines as originals

In [14]:
!wc -l /kaggle/input/conll003-englishversion/*.txt
!echo ''
!wc -l ./inputs/remapped/*.txt

  50350 /kaggle/input/conll003-englishversion/test.txt
 219554 /kaggle/input/conll003-englishversion/train.txt
  55044 /kaggle/input/conll003-englishversion/valid.txt
 324948 total

  50350 ./inputs/remapped/test.txt
 219554 ./inputs/remapped/train.txt
  55044 ./inputs/remapped/valid.txt
 324948 total


In [15]:
# Build the training command for the remapped data set.
inPath = './inputs/remapped'
infile = 'train.txt'
# Pass `infile` itself so the variable and the command cannot drift apart.
cmd = ' '.join(template).format(inPath, infile, 30)
print(cmd)

python -W ignore train_wc.py --emb_file /kaggle/input/glove6b100dtxt/glove.6B.100d.txt --train_file ./inputs/remapped/train.txt --dev_file ./inputs/remapped/valid.txt --test_file ./inputs/remapped/test.txt --checkpoint checkpoint/ner_ --epoch 30 --caseless --fine_tune --high_way --co_train --least_iters 100


In [16]:
#remapped = subprocess.check_output(cmd, shell=True)
#writeLogOut(remapped, './outputs/remapped.log')

<a href="outputs/remapped.log">remapped.log</a>

## Own Test File

In [17]:
#!python -W ignore seq_wc.py --load_arg ./checkpoint/ner_cwlm_lstm_crf.json \
#    --load_check_point ./checkpoint/ner_cwlm_lstm_crf.model \
#    --gpu 0 --input_file ./inputs/my_test.txt --output_file ./outputs/my_test.out

Link to output: <a href="outputs/my_test.out">my_test.out</a>

Actual command for easy copy/paste
````
!python -W ignore train_wc.py \
    --emb_file /kaggle/input/glove6b100dtxt/glove.6B.100d.txt \
    --train_file ./inputs/train01.txt \
    --dev_file /kaggle/input/conll003-englishversion/valid.txt \
    --test_file /kaggle/input/conll003-englishversion/test.txt \
    --checkpoint checkpoint/ner_ --epoch 2 --caseless \
    --fine_tune --high_way --co_train --least_iters 100
````