In [None]:
import re
import csv
import codecs
import numpy as np
import pandas as pd
import os
import time
from tqdm import tqdm
import nltk
import random
from collections import Counter

from nltk.util import ngrams
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Seed every random number generator the notebook draws from so that a
# Restart & Run All reproduces the same shuffles and the same initial weights.
# The original seeded only `random`, leaving torch's parameter initialisation
# unseeded.
SEED = 1024
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Legacy CUDA tensor type aliases, kept because code outside this cell may
# reference them.
FloatTensor = torch.cuda.FloatTensor
LongTensor = torch.cuda.LongTensor
ByteTensor = torch.cuda.ByteTensor

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
%%capture
! pip install mlflow
! pip install sklearn-crfsuite

In [None]:
# Mount Google Drive inside the Colab VM; the dataset and the exported model
# both live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Preprocessing

In [None]:
# Read the annotated training documents into `new_text`, one entry per line of
# text: [tokens, labels]. Every whitespace-separated item of a line has the
# form TOKEN[LABEL]; it is split at the first '[' into the token and the
# bracketed label.
new_text = []
path = 'drive/My Drive/AISC/nlp/data/train/'

# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the choice of the first 100 documents reproducible between runs.
for file_name in sorted(os.listdir(path))[:100]:
    # The context manager closes each document once it has been read; the
    # original left every file handle open.
    with open(path + file_name, encoding='UTF-8') as document:
        for line in document:
            tokens = []
            labels = []
            # Drop the newline and surrounding spaces, then discard the empty
            # strings produced by consecutive spaces.
            items = [item for item in line.strip('\n').strip(' ').split(' ') if item]
            for item in items:
                # NOTE(review): an item without '[' makes find() return -1, so
                # its last character would end up as the label -- confirm that
                # the corpus never contains such items.
                cut = item.find('[')
                tokens.append(item[:cut])
                labels.append(item[cut:])
            new_text.append([tokens, labels])

print(len(new_text), 'sentences after combining documents')

32253 sentences after combining documents


In [None]:
# Split the corpus into parallel tuples: x holds the token lists, y the label
# lists.
x, y = list(zip(*new_text))

# Unique non-empty tokens and labels. They are sorted because the iteration
# order of a set of strings changes from one Python process to the next; with
# an unsorted list the ids assigned to words and tags would differ between
# runs and would no longer match previously saved model weights.
vocab = sorted({token for sentence in x for token in sentence if token})
tags = sorted({label for sentence in y for label in sentence if label})

# label counts
# Notice the skew in the dataset. We will need to remove the [0] label at the end
pd.Series([label for sentence in y for label in sentence if label]).value_counts()


[0]      1280910
[LEG]       1385
[CNP]       1157
[GOV]        632
[TIT]        631
[STD]        445
[JUR]        438
[EFD]        114
[VAL]         99
[TED]         95
[PER]         58
dtype: int64

In [None]:
# Word ids: 0 is reserved for unknown words and 1 for the <DUMMY> padding
# token used at the start or end of a sentence; every vocabulary word then
# receives the next free id.
word2index = {'<UNK>': 0, '<DUMMY>': 1}
for vo in vocab:
    # setdefault leaves an id untouched if the word is already registered
    word2index.setdefault(vo, len(word2index))

# Reverse lookup: id -> word
index2word = dict((idx, word) for word, idx in word2index.items())

# Tag ids are handed out in the order in which the tags are listed
tag2index = {}
for tag in tags:
    tag2index.setdefault(tag, len(tag2index))

# Reverse lookup: id -> tag
index2tag = dict((idx, name) for name, idx in tag2index.items())

print(len(word2index), 'unique words in text')


26704 unique words in text


In [None]:
# Turn every sentence into one training example per token: the token together
# with WINDOW_SIZE neighbours on each side, paired with the token's label.
WINDOW_SIZE = 2
windows = []

# Padding added on both ends so that the first and last tokens of a sentence
# also get a full-width window.
padding = ['<DUMMY>'] * WINDOW_SIZE
window_width = WINDOW_SIZE * 2 + 1

for sentence_tokens, sentence_labels in new_text:
    padded = padding + list(sentence_tokens) + padding
    grams = list(nltk.ngrams(padded, window_width))
    # The n-th window is centred on the n-th token, so windows and labels pair
    # up one to one.
    windows.extend(
        [list(gram), label] for gram, label in zip(grams, sentence_labels)
    )

windows[0]

[['<DUMMY>', '<DUMMY>', 'TOKEN_57', 'TOKEN_2368', 'TOKEN_61'], '[0]']

# Model

In [None]:
class WindowClassifier(nn.Module):
    """Feed-forward classifier over a fixed-width window of word indices.

    The embeddings of all words in the window are concatenated, passed through
    two ReLU hidden layers and projected to ``output_size`` scores, which are
    returned as log-probabilities (``LogSoftmax`` over dimension 1).
    """

    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, output_size):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_size: dimension of each word embedding.
            window_size: number of context words on EACH side of the centre
                word; the input width is ``2 * window_size + 1``.
            hidden_size: width of both hidden layers.
            output_size: number of output classes.
        """
        super().__init__()
        words_per_window = window_size * 2 + 1
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.h_layer1 = nn.Linear(embedding_size * words_per_window, hidden_size)
        self.h_layer2 = nn.Linear(hidden_size, hidden_size)
        self.o_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(0.3)

    def forward(self, inputs, is_training=False):
        """Return log-probabilities for a batch of windows.

        Args:
            inputs: integer tensor of word indices with one window per row.
            is_training: when True, dropout is applied after each hidden
                layer; when False the dropout layers are skipped entirely.
        """
        embedded = self.embed(inputs)  # batch x window x embedding
        window_len, emb_dim = embedded.size(1), embedded.size(2)
        hidden = embedded.view(-1, window_len * emb_dim)  # batch x (window * embedding)

        # Both hidden layers follow the same pattern: linear -> ReLU -> optional dropout
        for layer in (self.h_layer1, self.h_layer2):
            hidden = self.relu(layer(hidden))
            if is_training:
                hidden = self.dropout(hidden)

        return self.softmax(self.o_layer(hidden))

def getBatch(batch_size, train_data):
    """Yield shuffled mini-batches of ``train_data``.

    ``train_data`` is shuffled IN PLACE (the caller's list is reordered) and
    then cut into consecutive slices of ``batch_size`` items; the last slice
    holds the remainder and may be shorter. Nothing is yielded for an empty
    list.

    Args:
        batch_size: maximum number of items per batch; must be positive.
        train_data: mutable sequence of training examples.

    Yields:
        list: the next slice of ``train_data``.

    Raises:
        ValueError: if ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive batch size used to make the original loop spin forever
    # while yielding empty batches.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

def prepare_sequence(seq, word2index):
    """Map a sequence of words to a 1-D LongTensor of word ids.

    Words missing from ``word2index`` are replaced by the id stored under
    ``"<UNK>"``.
    """
    idxs = []
    for word in seq:
        word_id = word2index.get(word)
        if word_id is None:
            # Unknown word: fall back to the <UNK> id
            word_id = word2index["<UNK>"]
        idxs.append(word_id)
    return Variable(torch.LongTensor(idxs))

def prepare_word(word, word2index):
    """Return a one-element LongTensor with the id of ``word``.

    A word missing from ``word2index`` maps to the id stored under ``"<UNK>"``.
    """
    word_id = word2index.get(word)
    if word_id is None:
        word_id = word2index["<UNK>"]
    return Variable(torch.LongTensor([word_id]))

def prepare_tag(tag,tag2index):
    """Return a one-element LongTensor with the id of ``tag``.

    Raises:
        KeyError: if ``tag`` is not a key of ``tag2index``.
    """
    tag_id = tag2index[tag]
    return Variable(torch.LongTensor([tag_id]))


In [None]:
# Shuffle the windows (in place) and keep the first 70% for training and the
# remaining 30% for testing.
random.shuffle(windows)
split_at = int(len(windows) * 0.7)
train_data, test_data = windows[:split_at], windows[split_at:]

len(train_data), len(test_data)

(900174, 385790)

In [None]:
# Defining parameters for the model
BATCH_SIZE = 1000      # number of windows per mini-batch handed to getBatch
EMBEDDING_SIZE = 50    # dimension of each word embedding
HIDDEN_SIZE = 300      # width of both hidden layers of WindowClassifier
EPOCH = 3              # number of passes over train_data
LEARNING_RATE = 0.001  # learning rate given to the Adam optimizer

In [None]:
# The model stays on the CPU: the tensors built by prepare_sequence and
# prepare_tag are CPU LongTensors.
model = WindowClassifier(len(word2index), EMBEDDING_SIZE, WINDOW_SIZE, HIDDEN_SIZE, len(tag2index))

# WindowClassifier.forward already ends in LogSoftmax, so the matching
# criterion is NLLLoss. CrossEntropyLoss applies log-softmax itself and would
# normalise the outputs a second time; the loss value is mathematically the
# same, so this states the intent without changing training.
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training

In [None]:
# Number of batches between two intermediate progress reports.
LOG_EVERY = 1000

for epoch in range(EPOCH):
    losses = []  # loss of every batch seen in this epoch
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        window_batch, tag_batch = zip(*batch)
        # One row of word ids per window, one tag id per window
        inputs = torch.cat([prepare_sequence(sent, word2index).view(1, -1) for sent in window_batch])
        targets = torch.cat([prepare_tag(tag, tag2index) for tag in tag_batch])

        model.zero_grad()
        preds = model(inputs, is_training=True)
        loss = loss_function(preds, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

        # Report the mean over the LOG_EVERY most recent batches. The original
        # check `i % 1000 == 0` fired on batch 0 only whenever an epoch had
        # fewer than 1001 batches, so the printed "mean" was the loss of one
        # single batch.
        if (i + 1) % LOG_EVERY == 0:
            print("[%d/%d] batch %d mean_loss : %0.2f"
                  % (epoch + 1, EPOCH, i + 1, np.mean(losses[-LOG_EVERY:])))

    # Always summarise the whole epoch (1-based epoch number)
    if losses:
        print("[%d/%d] mean_loss : %0.2f" % (epoch + 1, EPOCH, np.mean(losses)))

[0/3] mean_loss : 2.46
[1/3] mean_loss : 0.01
[2/3] mean_loss : 0.01


# Test

In [None]:
# Predict a tag for every test window, keeping [predicted, gold] pairs for the
# classification report and counting exact matches for the accuracy.
for_f1_score = []
accuracy = 0  # number of correct predictions; converted to a percentage below

# Gradients are not needed for inference; no_grad() stops autograd from
# recording a graph for each forward pass.
with torch.no_grad():
    for window_tokens, gold_tag in test_data:
        input_ = prepare_sequence(window_tokens, word2index).view(1, -1)

        # is_training defaults to False, so dropout is skipped here
        predicted_id = model(input_).max(1)[1]
        pred = index2tag[predicted_id.item()]
        for_f1_score.append([pred, gold_tag])
        if pred == gold_tag:
            accuracy += 1

print(accuracy/len(test_data) * 100)

# Note: the accuracy looks high because the dominant '[0]' label, which causes the class imbalance, is included in this figure.

In [None]:
# Split the [predicted, gold] pairs into two parallel tuples
y_pred, y_test = zip(*for_f1_score)

# Labels to report on: every gold label except '[0]', which marks the words we
# are not interested in.
labels_of_interest = set(y_test).difference({'[0]'})


def _label_sort_key(name):
    """Order labels by the text after the first character, then by that character."""
    return (name[1:], name[0])


sorted_labels = sorted(labels_of_interest, key=_label_sort_key)

In [None]:
# The sklearn_crfsuite.metrics functions flatten their inputs, so every label
# is wrapped in its own single-element list.
y_pred = list(map(lambda label: [label], y_pred))
y_test = list(map(lambda label: [label], y_test))

# Results

In [None]:
from sklearn_crfsuite import metrics

# Per-label precision / recall / F1, restricted to the labels in sorted_labels
report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=sorted_labels,
    digits=3,
)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       [CNP]      0.585     0.201     0.299       359
       [EFD]      0.000     0.000     0.000        34
       [GOV]      0.588     0.629     0.608       197
       [JUR]      0.795     0.226     0.352       137
       [LEG]      0.734     0.760     0.747       404
       [PER]      0.000     0.000     0.000        17
       [STD]      0.312     0.077     0.123       130
       [TED]      0.000     0.000     0.000        26
       [TIT]      0.869     0.574     0.691       197
       [VAL]      1.000     0.038     0.074        26

   micro avg      0.690     0.431     0.530      1527
   macro avg      0.488     0.251     0.289      1527
weighted avg      0.635     0.431     0.479      1527



# Packaging

In [None]:
import pickle
import mlflow
import mlflow.pyfunc

# Persist the trained weights on Google Drive
state_dict_path = '/content/drive/My Drive/AISC/nlp/state_dict.pt'
torch.save(model.state_dict(), state_dict_path)

# Files to bundle with the MLflow model, keyed by the name under which they
# are looked up in context.artifacts.
artifacts = dict(state_dict=state_dict_path)

In [None]:
class ModelWrapper(mlflow.pyfunc.PythonModel):
  """MLflow pyfunc wrapper around the trained WindowClassifier.

  The only packaged artifact is the state dict, so ``predict`` works on word
  INDICES and returns tag INDICES. Callers must encode words and decode tags
  with the same word2index / index2tag mappings that were used for training.
  """

  # Load in the model and all required artifacts
  # The context object is provided by the MLflow framework
  # It will contain all of the artifacts specified above
  def load_context(self, context):
    import torch
    from model import WindowClassifier

    # map_location lets weights saved from a GPU load on a CPU-only host
    state_dict = torch.load(context.artifacts["state_dict"], map_location="cpu")

    # Recover the constructor arguments from the tensor shapes; the original
    # called WindowClassifier() without its five required arguments.
    # NOTE(review): assumes model.py's WindowClassifier has the same signature
    # and layer names (embed, h_layer1, o_layer) as the notebook class -- confirm.
    vocab_size, embedding_size = state_dict["embed.weight"].shape
    hidden_size, window_features = state_dict["h_layer1.weight"].shape
    output_size = state_dict["o_layer.weight"].shape[0]
    # h_layer1 takes embedding_size * (2 * window_size + 1) input features
    window_size = (window_features // embedding_size - 1) // 2

    # Initialize the model and load in the state dict
    self.model = WindowClassifier(vocab_size, embedding_size, window_size, hidden_size, output_size)
    self.model.load_state_dict(state_dict)
    self.model.eval()

  # Create a predict function for our models
  def predict(self, context, model_input):
    """Return the predicted tag index for each window in ``model_input``.

    Args:
      context: MLflow context (unused here).
      model_input: word indices, one window per row; a tensor or anything
        ``numpy.asarray`` can convert. A single flat window is treated as a
        batch of one.

    Returns:
      list of int: one predicted tag index per input row.
    """
    import numpy as np
    import torch

    if not torch.is_tensor(model_input):
      model_input = torch.as_tensor(np.asarray(model_input), dtype=torch.long)
    if model_input.dim() == 1:
      model_input = model_input.unsqueeze(0)

    # Use the model loaded by load_context (the original referenced a global
    # `model` that does not exist when serving) and skip autograd.
    with torch.no_grad():
      pred = self.model(model_input).max(1)[1]

    # Return every row's prediction, not just the first one
    return pred.tolist()

In [None]:
# Display the conda environment MLflow would use by default, as a reference
# for the custom environment.
mlflow.pyfunc.get_default_conda_env()

{'channels': ['defaults', 'conda-forge'],
 'dependencies': ['python=3.6.9',
  'pip',
  {'pip': ['mlflow', 'cloudpickle==1.3.0']}],
 'name': 'mlflow-env'}

In [None]:
# Let's create our own conda environment
conda_env = {
    'channels': ['defaults', 'conda-forge'],
    'dependencies': [
        'python=3.6.9',
        # The comma after this entry was missing, which silently fused it with
        # the next string into the invalid spec 'pip=19.3.1scikit-learn=0.23.3'.
        'pip=19.3.1',
        # 0.23.2 is the last release of the 0.23 series (there is no 0.23.3)
        'scikit-learn=0.23.2',
        {
            # The deprecated 'sklearn' PyPI stub was dropped: scikit-learn is
            # already installed through conda above.
            # NOTE(review): the '+cu101' wheels are presumably served from the
            # PyTorch wheel index rather than PyPI -- confirm the environment
            # resolves.
            'pip': [
                f'mlflow=={mlflow.__version__}',
                'cloudpickle==1.3.0',
                'torch===1.5.1+cu101',
                'torchvision===0.6.1+cu101',
            ]
        },
    ],
    'name': 'mlflow-env'
}

In [None]:
# Destination directory on Google Drive for the packaged model
mlflow_pyfunc_model_path = "/content/drive/My Drive/AISC/nlp/iuris_model"

# Extra files copied next to the model (passed as code_path); model.py is the
# module from which ModelWrapper.load_context imports WindowClassifier.
bundled_code_files = [
    '/content/drive/My Drive/AISC/nlp/model.py',
    '/content/drive/My Drive/AISC/nlp/metadata.txt',
]

# Package the model
save_options = dict(
    path=mlflow_pyfunc_model_path,
    python_model=ModelWrapper(),
    artifacts=artifacts,
    conda_env=conda_env,
    code_path=bundled_code_files,
)
mlflow.pyfunc.save_model(**save_options)

