In [38]:
!pip install tez
!pip install torch==1.7.1
!pip install transformers==3.5.1
!pip install autocorrect

Collecting autocorrect
[?25l  Downloading https://files.pythonhosted.org/packages/16/a8/1fc332535fc26db807fa48bdb54070355b83a36c797451c3d563bc190fa8/autocorrect-2.3.0.tar.gz (621kB)
[K     |████████████████████████████████| 624kB 8.6MB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.3.0-cp37-none-any.whl size=621587 sha256=5d94b837738d3c966eeb00d9003925c4f9c1c270c10d4ce6235be3087643da20
  Stored in directory: /root/.cache/pip/wheels/cc/1c/30/6b0199afbd20eef5959f5eaa0ead86aeef84391740482b2279
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.3.0


In [1]:
import pandas as pd
import tez
import torch
import torch.nn as nn
import transformers
from sklearn import metrics, model_selection, preprocessing
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.utils import shuffle
from sklearn import model_selection
from google.colab import drive
from sklearn.model_selection import train_test_split


In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')
import os
# Make the project folder on Drive the working directory: the relative paths
# used further down ('train_data.csv', 'test_data.csv', 'model.bin', '.logs/')
# are resolved against it.
# NOTE(review): this absolute path is specific to one user's Drive layout.
os.chdir('/content/drive/MyDrive/Work/Dan/BERT Model 2')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
class BERTDataset:
    """Map-style dataset that tokenizes text samples for BERT classification.

    Each item is a dict of ``torch.long`` tensors with the keys ``"ids"``,
    ``"mask"``, ``"token_type_ids"`` and ``"targets"``; these names match the
    keyword arguments of ``BERTBaseUncased.forward``.

    Args:
        text: Indexable collection of samples; ``text[i]`` is converted with
            ``str()`` before tokenization.
        target: Indexable collection of integer class labels, aligned with
            ``text``.
        max_len: Fixed sequence length; every sample is truncated or padded to
            exactly this many tokens. Defaults to 64.
        tokenizer: Optional tokenizer exposing ``encode_plus``. When omitted,
            the ``bert-large-uncased`` tokenizer is loaded. Passing one in lets
            several datasets share a single tokenizer instance.
    """

    def __init__(self, text, target, max_len=64, tokenizer=None):
        self.text = text
        self.target = target
        if tokenizer is None:
            tokenizer = transformers.BertTokenizer.from_pretrained(
                "bert-large-uncased", do_lower_case=True
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return its tensors together with the label."""
        # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
        text = " ".join(str(self.text[item]).split())
        # padding="max_length" together with truncation=True makes the tokenizer
        # return exactly `max_len` entries per field, so no manual padding is
        # needed afterwards.
        inputs = self.tokenizer.encode_plus(
            text,
            None,  # no second segment: single-sentence input
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(
                inputs["token_type_ids"], dtype=torch.long
            ),
            "targets": torch.tensor(self.target[item], dtype=torch.long),
        }

class BERTBaseUncased(tez.Model):
    """BERT encoder with a dropout + linear classification head, trained via tez.

    Despite the class name, the ``bert-large-uncased`` checkpoint is loaded.

    Args:
        num_train_steps: Total number of optimizer steps; used as the length of
            the linear learning-rate schedule.
        num_classes: Number of output classes (width of the final linear layer).
    """

    def __init__(self, num_train_steps, num_classes):
        super().__init__()
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-large-uncased", do_lower_case=True
        )
        self.bert = transformers.BertModel.from_pretrained("bert-large-uncased")
        self.bert_drop = nn.Dropout(0.3)
        # Size the head from the loaded checkpoint instead of hard-coding 1024,
        # so the layer always matches the encoder's hidden width.
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.num_train_steps = num_train_steps
        # Read by tez -- presumably steps the scheduler after every batch
        # rather than every epoch; confirm against the tez documentation.
        self.step_scheduler_after = "batch"

    def fetch_optimizer(self):
        """Build AdamW with weight decay disabled for biases and LayerNorm parameters."""
        param_optimizer = list(self.named_parameters())
        # "LayerNorm.weight" must be listed explicitly: "bias" already matches
        # every LayerNorm bias, but the LayerNorm gains would otherwise fall
        # into the weight-decayed group.
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        opt = AdamW(optimizer_parameters, lr=3e-5)
        return opt

    def fetch_scheduler(self):
        """Build a linear decay schedule (no warmup) over ``num_train_steps`` steps."""
        # self.optimizer is not assigned in this class; presumably tez sets it
        # from fetch_optimizer() before calling this method.
        sch = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        )
        return sch

    def loss(self, outputs, targets):
        """Return the cross-entropy loss, or ``None`` when no targets are given."""
        if targets is None:
            return None
        return nn.CrossEntropyLoss()(outputs, targets)

    def monitor_metrics(self, outputs, targets):
        """Return ``{"accuracy": ...}`` for the batch, or ``{}`` without targets."""
        if targets is None:
            return {}
        outputs = torch.argmax(outputs, dim=1).cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": accuracy}

    def forward(self, ids, mask, token_type_ids, targets=None):
        """Run the encoder and head.

        Returns:
            A ``(logits, loss, metrics)`` tuple; ``loss`` is ``None`` and
            ``metrics`` is ``{}`` when ``targets`` is not supplied.
        """
        # return_dict=False pins the tuple return type, so the unpacking below
        # does not depend on the library's default output format.
        _, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        dropped = self.bert_drop(pooled)
        output = self.out(dropped)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc



In [9]:
# df_train=pd.read_excel('Transcript-comb.xlsx',sheet_name='Traindata-Hiri')
# df_test=pd.read_excel('Transcript-comb.xlsx',sheet_name='Testdata-Hiri').rename(columns={'Input Conversation':'utterance'})
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# df_train['intent1']=le.fit_transform(df_train['intent'])
# import string
# import nltk
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# import re

# def remove_punct(text):
#     text=text.lower()
#     text  = "".join([char for char in text if char not in string.punctuation])
#     return text

# stopword = nltk.corpus.stopwords.words('english')

# def remove_stopwords(text):
#     text = [word for word in re.split('\W+', text) if ((word not in stopword)&(word!=''))]
#     return text

# wn = nltk.WordNetLemmatizer()

# def lemmatizer(text):
#     text = [wn.lemmatize(word) for word in text]
#     return text

# from autocorrect import Speller
# spell = Speller(lang='en')

# def spell_check(text):
#   return [spell(i) for i in text]

# def clean_data(x):
#     x=x.encode('ascii','ignore').decode() # remove texts other than english
#     x=remove_punct(x) # remove punctuations
#     x=remove_stopwords(x) # remove stopwords
#     x=spell_check(x)
#     x=lemmatizer(x) # lemmatization
#     excluded_tags = ["ADJ", "ADV"]
#     t=nltk.pos_tag(x)
#     x=[i[0] for i in t if i[1] not in excluded_tags]
#     return ' '.join(x)
# clean_data(df_train.utterance.iloc[0])
# df_train['utterance']=df_train['utterance'].apply(lambda x: clean_data(x))
# df_test['utterance']=df_test['utterance'].astype(str).apply(lambda x: clean_data(x))
# df_train.to_csv('train_data.csv',index=None)
# df_test.to_csv('test_data.csv',index=None)

In [None]:
if __name__ == "__main__":
    # Training hyper-parameters. They are defined once and used both for the
    # scheduler length and for model.fit(), so the two cannot drift apart.
    train_bs = 32
    epochs = 1

    df_train = pd.read_csv('train_data.csv')
    df_test = pd.read_csv('test_data.csv')  # loaded here but not used in this cell

    # Hold out 20% of the labelled data for validation (fixed seed).
    X_train, X_valid, y_train, y_valid = train_test_split(
        df_train['utterance'], df_train['intent1'], test_size=0.2, random_state=42
    )
    train_dataset = BERTDataset(text=X_train.values, target=y_train.values)
    valid_dataset = BERTDataset(text=X_valid.values, target=y_valid.values)

    # Length of the linear LR schedule = number of optimizer steps actually
    # taken: batches per epoch over the *training split* times the epochs run.
    # Ceil division assumes the last partial batch is not dropped -- confirm
    # against tez's data loader settings.
    steps_per_epoch = (len(train_dataset) + train_bs - 1) // train_bs
    n_train_steps = steps_per_epoch * epochs

    # Built once: each construction loads the full BERT-large checkpoint.
    model = BERTBaseUncased(num_train_steps=n_train_steps, num_classes=df_train.intent1.nunique())

    tb_logger = tez.callbacks.TensorBoardLogger(log_dir=".logs/")
    es = tez.callbacks.EarlyStopping(monitor="valid_loss", model_path="model.bin")
    model.fit(
        train_dataset,
        valid_dataset=valid_dataset,
        train_bs=train_bs,
        device='cuda',
        epochs=epochs,
        callbacks=[tb_logger, es],
        fp16=True,
    )

    # NOTE(review): this writes the final weights to the same path that
    # EarlyStopping uses for its checkpoint, replacing whatever it stored there.
    torch.save(model.state_dict(), "model.bin")
    #preds = model.predict(valid_dataset, batch_size=16, n_jobs=-1)

In [19]:
# lmap = dict(zip(le.transform(le.classes_),le.classes_))
# import pickle
# with open('lmap.pkl','wb') as f:
#   pickle.dump(lmap,f)