In [1]:
# model.py

import transformers
import torch.nn as nn

In [2]:
# --- File locations -------------------------------------------------------
# Passed to `from_pretrained` for both the tokenizer and the BERT encoder.
BERT_PATH = "../input/bert_base_uncased"
# CSV that the training routine reads.
TRAINING_FILE = "../input/imdb.csv"
# Destination of the best checkpoint saved during training.
MODEL_PATH = "model.bin"

# --- Hyperparameters ------------------------------------------------------
# Token budget per review, handed to the tokenizer as `max_length`.
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
# Forwarded to the training loop as `accumulation_steps`.
ACCUMULATION = 2

# --- Shared tokenizer -----------------------------------------------------
# Loaded once here so datasets and inference reuse the same instance.
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)

In [3]:
class BERTBaseUncased(nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head.

    The head emits one raw logit per example (binary classification); apply a
    sigmoid, or use ``BCEWithLogitsLoss``, to turn it into a probability.
    """

    def __init__(self):
        super(BERTBaseUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(
            config.BERT_PATH
        )
        self.bert_drop = nn.Dropout(0.3)
        # One output unit. The input width is taken from the loaded encoder's
        # config (768 for bert-base) instead of being hard-coded.
        self.out = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return the classification logit for each sequence in the batch.

        Args:
            ids: token id tensor passed to the encoder as its first argument.
            mask: attention mask forwarded as ``attention_mask``.
            token_type_ids: segment ids forwarded as ``token_type_ids``.

        Returns:
            The output of the linear head applied to the (dropped-out) pooled
            encoder output.
        """
        encoded = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )
        # Element 0 is the per-token hidden states, element 1 the pooled
        # [CLS] representation. Index rather than tuple-unpack: unpacking a
        # dict-like model output would yield its key names, not tensors,
        # whereas integer indexing works for both tuples and model outputs.
        pooled = encoded[1]
        return self.out(self.bert_drop(pooled))

In [4]:
# engine.py
import torch.nn as nn
import torch
from tqdm import tqdm

In [5]:

def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits.

    ``targets`` is reshaped to a single column so it lines up with
    ``outputs`` before the loss is computed.
    """
    column_targets = targets.view(-1, 1)
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, column_targets)


def train_fn(data_loader, model, optimizer, device, scheduler, accumulation_steps):
    """Run one training pass over ``data_loader``.

    Every batch is expected to be a mapping with "ids", "token_type_ids",
    "mask" and "targets" entries. Each entry is moved to ``device`` before
    the forward pass.

    Note: ``accumulation_steps`` is accepted but not used. Gradients are
    zeroed, and the optimizer and scheduler are stepped, on every batch.
    """
    model.train()

    progress = tqdm(enumerate(data_loader), total=len(data_loader))
    for _, batch in progress:
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()

        # One optimizer update and one learning-rate update per batch.
        optimizer.step()
        scheduler.step()
            
            
    
def evan_fun(data_loader, model, device):
    """Evaluate ``model`` over ``data_loader`` without tracking gradients.

    Every batch is expected to be a mapping with "ids", "token_type_ids",
    "mask" and "targets" entries.

    Returns:
        ``(fin_outputs, fin_targets)``: the sigmoid of the model outputs and
        the corresponding targets, both as plain Python lists accumulated
        across all batches.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids
            )
            # Was `fin_targest`, which raised NameError on the first batch.
            fin_targets.extend(
                targets.cpu().detach().numpy().tolist()
            )
            fin_outputs.extend(
                torch.sigmoid(outputs).cpu().detach().numpy().tolist()
            )

    return fin_outputs, fin_targets


# The training loop calls this function as `eval_fn`; expose that name too so
# the call resolves, while keeping the original name for existing callers.
eval_fn = evan_fun

In [6]:
# dataset.py
import config
import torch

In [7]:


class BERTDataset:
    """Map-style dataset that tokenizes one review per index.

    Args:
        review: indexable collection of review texts.
        target: indexable collection of labels, aligned with ``review``.
    """

    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        """Return the tensors for the review at position ``item``.

        Returns:
            dict with "ids", "mask" and "token_type_ids" (long tensors of
            length ``max_len``) and "targets" (float tensor).
        """
        # Select the single requested review. Stringifying the whole
        # collection (the previous behaviour) fed the same text for every
        # index.
        review = str(self.review[item])
        # Collapse runs of whitespace into single spaces.
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        # Right-pad with zeros up to max_len; a no-op when the tokenizer has
        # already padded the sequence.
        padding_length = self.max_len - len(ids)
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.target[item], dtype=torch.float)
        }

In [8]:
# train.py
import config
import pandas as pd
#import dataset
# import engine
import torch
import torch.nn as nn
from sklearn import metrics
import numpy as np
from sklearn import model_selection
from transformers import AdamW
# from model import BERTBaseUncased

In [9]:
transformers.__version__
from transformers import get_linear_schedule_with_warmup


In [None]:
def run():
    """Fine-tune the BERT sentiment classifier and checkpoint the best epoch.

    Reads ``config.TRAINING_FILE``, encodes the ``sentiment`` column as 1 for
    "positive" and 0 otherwise, and holds out a stratified 10% for validation.
    Trains on the CPU for ``config.EPOCHS`` epochs. The model state dict is
    written to ``config.MODEL_PATH`` each time validation accuracy exceeds the
    best value seen so far.
    """
    reviews_df = pd.read_csv(config.TRAINING_FILE).fillna("none")
    print(reviews_df.columns)
    reviews_df.sentiment = reviews_df.sentiment.apply(
        lambda label: 1 if label == "positive" else 0
    )

    train_df, valid_df = model_selection.train_test_split(
        reviews_df,
        test_size=0.1,
        random_state=42,
        stratify=reviews_df.sentiment.values,
    )
    print(valid_df.columns)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # Datasets and loaders for the two splits.
    train_dataset = BERTDataset(
        review=train_df.review.values, target=train_df.sentiment.values
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
    )
    valid_dataset = BERTDataset(
        review=valid_df.review.values, target=valid_df.sentiment.values
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
    )

    device = torch.device("cpu")
    model = BERTBaseUncased()
    model.to(device)

    # Two parameter groups: biases and LayerNorm parameters are exempt from
    # weight decay, everything else gets a decay of 0.001.
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decayed_params = []
    exempt_params = []
    for name, param in model.named_parameters():
        if any(tag in name for tag in no_decay):
            exempt_params.append(param)
        else:
            decayed_params.append(param)
    optimizer_parameters = [
        {"params": decayed_params, "weight_decay": 0.001},
        {"params": exempt_params, "weight_decay": 0.0},
    ]

    # Linear decay (no warm-up) over the total number of batches.
    num_train_steps = int(len(train_df) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_fn(
            train_data_loader,
            model,
            optimizer,
            device,
            scheduler,
            accumulation_steps=ACCUMULATION,
        )
        outputs, targets = eval_fn(valid_data_loader, model, device)
        # Threshold the predicted probabilities at 0.5.
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")

        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy


# Invoked directly: this cell starts training as soon as it is executed.
run()
    
    
    
    
    
    
    
    
    
    
    

Index(['review', 'sentiment'], dtype='object')
Index(['review', 'sentiment'], dtype='object')


  0%|          | 6/5625 [03:35<68:26:40, 43.85s/it]

In [1]:
67170.45/11

6106.404545454545

In [2]:
!touch app.py

In [5]:
# App.py

from flask import Flask
from flask import Request
from flask import jsonify, request

In [4]:
# Flask application object; the route handler defined below registers itself
# on it and the __main__ block starts it with app.run().
app = Flask(__name__)

In [None]:
def sentence_prediction(sentence, model=None, tokenizer=None, max_len=None):
    """Score a single sentence with the sentiment model.

    Args:
        sentence: text to classify; converted with ``str`` and
            whitespace-normalised before tokenization.
        model: model to run. Defaults to the module-level ``MODEL``, looked
            up at call time because ``MODEL`` is only created in the
            ``__main__`` block, after this function is defined.
        tokenizer: tokenizer exposing ``encode_plus``. Defaults to the
            module-level ``TOKENIZER``.
        max_len: sequence length to truncate/pad to. Defaults to the
            module-level ``MAX_LEN``.

    Returns:
        The sigmoid of the model's first output value for the sentence.
    """
    if model is None:
        model = MODEL
    if tokenizer is None:
        tokenizer = TOKENIZER
    if max_len is None:
        max_len = MAX_LEN

    review = " ".join(str(sentence).split())

    inputs = tokenizer.encode_plus(
        review,
        None,
        add_special_tokens=True,
        max_length=max_len,
        pad_to_max_length=True
    )
    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    # Right-pad with zeros up to max_len; a no-op when the tokenizer has
    # already padded the sequence.
    padding_length = max_len - len(ids)
    ids = ids + ([0] * padding_length)
    mask = mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    # Run on whichever device the model's parameters live on, so no separate
    # device global is needed. Falls back to the CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Build (1, max_len) batches. These must be real assignments: the earlier
    # `ids: torch.tensor(...)` form was an annotation and left `ids` a list.
    ids = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)
    mask = torch.tensor(mask, dtype=torch.long, device=device).unsqueeze(0)
    token_type_ids = torch.tensor(
        token_type_ids, dtype=torch.long, device=device
    ).unsqueeze(0)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
    outputs = torch.sigmoid(outputs).cpu().detach().numpy()
    return outputs[0][0]

# Flask URL rules must begin with a leading slash.
@app.route("/predict")
def predict():
    """Handle ``GET /predict?sentence=...`` and return sentiment scores.

    Responds with JSON of the form
    ``{"response": {"positive": ..., "negative": ..., "sentence": ...}}``
    where the scores are stringified. Returns HTTP 400 with an ``error``
    message when the ``sentence`` query parameter is absent.
    """
    sentence = request.args.get("sentence")
    if sentence is None:
        # Without this guard the literal string "None" would be scored.
        return jsonify(
            {"error": "missing required query parameter 'sentence'"}
        ), 400

    positive_prediction = sentence_prediction(sentence)
    negative_prediction = 1 - positive_prediction
    response = {
        "response": {
            # Key was previously misspelled as 'positiive'.
            'positive': str(positive_prediction),
            'negative': str(negative_prediction),
            'sentence': str(sentence)
        }
    }
    return jsonify(response)


if __name__ == "__main__":
    # Serve on the CPU, the same device run() trains on. DEVICE was
    # previously referenced without ever being defined.
    DEVICE = torch.device("cpu")

    MODEL = BERTBaseUncased()
    # run() saves the state dict of a DataParallel-wrapped model, so wrap
    # here as well before loading to keep the parameter names aligned.
    MODEL = nn.DataParallel(MODEL)
    # map_location makes the checkpoint load onto DEVICE regardless of the
    # device it was saved from.
    MODEL.load_state_dict(torch.load(config.MODEL_PATH, map_location=DEVICE))
    MODEL.to(DEVICE)
    MODEL.eval()
    app.run()