## **<font color='green'>Data Loading </font>**

In [1]:
import random
import copy
import time
import pandas as pd
import numpy as np
import gc
import re
import torch

#import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter

from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from sklearn.metrics import f1_score
import os 

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from torch.optim.optimizer import Optimizer

from sklearn.preprocessing import StandardScaler
from multiprocessing import  Pool
from functools import partial
import numpy as np
from sklearn.decomposition import PCA
import torch as t
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Read the raw CRS export; `data1` stays untouched as the pristine copy.
data1 = pd.read_csv("data/crs_final_df_kor.csv", header=0)
# Working frame: an independent copy of the rows that have a LongDescription.
data = data1[data1['LongDescription'].notna()].copy()

## **<font color='green'>CLEANING</font>**

### Keep descriptions longer than 25 characters

In [3]:
# Rebuild the working frame from the raw data: keep the label, the text fields
# and the categorical/numeric covariates, then drop every row whose
# LongDescription is 25 characters or shorter (rows without a description
# fail the comparison and are dropped as well).
data = (
    data1.loc[:, [
        "PurposeCode", "LongDescription", "ProjectTitle", "ShortDescription",
        "AgencyCode", "RecipientCode", "RegionCode", "IncomegroupCode",
        "FlowCode", "Bi_Multi", "DonorCode",
        "USD_Disbursement", "USD_Received",
    ]]
    .loc[lambda frame: frame['LongDescription'].str.len() > 25]
    .reset_index(drop=True)
    .copy()
)

### Keep purpose codes with more than 100 samples

In [4]:
# Character length of every description.
data['len'] = data['LongDescription'].apply(len)

# Number of descriptions per purpose code, most frequent first.
count_df = (
    data.groupby('PurposeCode')['LongDescription']
    .count()
    .reset_index()
    .sort_values('LongDescription', ascending=False)
)
# Purpose codes that occur more than 100 times.
target_code = count_df.loc[count_df['LongDescription'] > 100, 'PurposeCode'].values


def condition_parser(x):
    """Return `x` when it is one of the frequent codes in `target_code`, else "OTHER"."""
    return x if x in target_code else "OTHER"


# Relabel rare codes as "OTHER", store the labels as strings, then drop the rare rows.
data['PurposeCode'] = data['PurposeCode'].apply(condition_parser).map(str)
data = data[data['PurposeCode'] != 'OTHER']

### Cleaning
- Replace contractions
- Lowercase
- Lemmatization (listed as a step, but no lemmatization code appears in the cleaning cell below)
- Clean numbers

In [5]:
import re

def clean_text(x):
    """Remove every character that is not an ASCII letter, a digit or whitespace.

    Apostrophes are removed too, so contractions must be expanded before this
    function is applied.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        `x` with all other characters deleted.
    """
    # `A-Z`, not `A-z`: the latter range also covers the characters [ \ ] ^ _ `
    pattern = r'[^a-zA-Z0-9\s]'
    # Return the substituted text (the previous version returned the input unchanged).
    return re.sub(pattern, '', x)

def clean_numbers(x):
    """Mask runs of ASCII digits with '#' characters.

    A run of two, three or four digits becomes the same number of '#';
    a run of five or more digits becomes exactly '#####'. Isolated single
    digits are left as they are.
    """
    def _mask(match):
        # One '#' per digit, capped at five.
        return '#' * min(len(match.group(0)), 5)

    return re.sub('[0-9]{2,}', _mask, x)
# Lookup table mapping English contractions to their expanded forms.
# First-person entries are present both capitalised ("I'd") and lower-cased ("i'd").
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
def _get_contractions(contraction_dict):
    contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
    return contraction_dict, contraction_re
# Module-level lookup table and compiled pattern read by replace_contractions().
contractions, contractions_re = _get_contractions(contraction_dict)
def replace_contractions(text):
    """Expand every contraction found in `text`.

    Uses the module-level `contractions_re` pattern to find matches and the
    module-level `contractions` mapping to look up each replacement.
    """
    return contractions_re.sub(lambda hit: contractions[hit.group(0)], text)
# Sanity check on a known example (the result of the bare call used to be discarded).
assert replace_contractions("this's a text with contraction") == "this is a text with contraction"

# 1. Lowercase the text.
data["LongDescription"] = data["LongDescription"].apply(lambda x: x.lower())

# 2. Expand contractions BEFORE any punctuation stripping: the contraction
#    patterns need the apostrophes to still be present.
data["LongDescription"] = data["LongDescription"].apply(replace_contractions)

# 3. Clean the text.
data["LongDescription"] = data["LongDescription"].apply(clean_text)

# 4. Mask numbers last, so the '#' placeholders are not touched by step 3.
data["LongDescription"] = data["LongDescription"].apply(clean_numbers)
def column_types(df):
    """Print, for every column of `df`, the Python type of its first value.

    The first value is taken by position, so this works for any index
    (for example after rows were filtered out without resetting the index).
    For a frame with no rows, a placeholder is printed instead of a type.
    """
    for i in df.columns:
        if len(df) == 0:
            class_name = "<empty column>"
        else:
            # .iloc[0] is positional; df[i][0] would look up the index *label* 0
            # and raise KeyError when that label is absent.
            class_name = str(type(df[i].iloc[0]))
        print(f"column {i} is {class_name}")

In [6]:
from transformers import BertTokenizer, BertModel
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn, pandas as pd
import torch
import torch
import torch.nn
import torch.nn.functional as F
import os
# Restrict this process to physical GPUs 2 and 3.
# NOTE(review): presumably this has to run before the first CUDA call to take
# effect, and the two visible devices are then addressed as indices 0 and 1 — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

In [7]:
# General hyperparameters.
# NOTE(review): the BERT cells further down define their own batch size (`bsz`),
# epoch count (`n_epoch`) and seeds (`random.seed(0)`, `np.random.seed(0)`), so
# several of these values may be leftovers from an earlier word-embedding
# pipeline — confirm before relying on them.
embed_size = 300 # size of each word vector
max_features = 120000 # number of unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 750 # maximum number of words of a text to use
batch_size = 512 # number of samples to process at once
n_epochs = 5 # number of passes over all samples
n_splits = 5 # number of K-fold splits
SEED = 10
debug = 0

In [8]:
data.head(2)

Unnamed: 0,PurposeCode,LongDescription,ProjectTitle,ShortDescription,AgencyCode,RecipientCode,RegionCode,IncomegroupCode,FlowCode,Bi_Multi,DonorCode,USD_Disbursement,USD_Received,len
0,12230.0,to provide medical equipment and consulting se...,Medical Equipment Sector Development Loan Project,MEDICAL EQUIPMENT SECTOR DEVELOPMENT LOAN PROJECT,2.0,640.0,10009.0,10018.0,13.0,1.0,742.0,,,81
1,31120.0,to educate governmental officials of developin...,Master's degree program in Rural Development(2...,MASTER'S DEGREE PROGRAM IN RURAL DEVELOPMENT(2...,4.0,266.0,10003.0,10016.0,11.0,1.0,742.0,0.00504,,223


### Define BERT

In [9]:

def train_test_split(data,train,test,val,n_cls,random_state=None):
    """Shuffle `data` and split it into disjoint train / test / validation parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'LongDescription' and 'PurposeCode'.
    train, test, val : float
        Fractions of the rows for each part. The test and validation sizes are
        `int(N * test)` and `int(N * val)`; the training part receives all
        remaining rows, so the three parts always cover every row.
    n_cls : sequence
        Known labels; the integer target of a row is the position of its
        'PurposeCode' in this sequence (first occurrence wins).
    random_state : int, optional
        Seed passed to `DataFrame.sample`. With the default `None`, the shuffle
        draws from NumPy's global random state.

    Returns
    -------
    tuple of numpy.ndarray
        `(train_corpus, train_targets, test_corpus, test_targets,
        val_corpus, val_targets)`.

    Raises
    ------
    ValueError
        If the test and validation parts together exceed the number of rows,
        or if a 'PurposeCode' value is missing from `n_cls`.
    """
    N = data.shape[0]
    n_test, n_val = int(N*test), int(N*val)
    n_train = N - n_test - n_val
    if n_train < 0:
        raise ValueError("test and val fractions together exceed the available rows")

    shuffled_data = data.sample(frac=1.0, random_state=random_state).reset_index(drop=True)

    # One dictionary lookup per row instead of a linear list search.
    label_to_id = {}
    for position, label in enumerate(n_cls):
        label_to_id.setdefault(label, position)
    unknown = set(shuffled_data['PurposeCode'].unique()) - set(label_to_id)
    if unknown:
        raise ValueError("PurposeCode values missing from n_cls: {}".format(sorted(unknown, key=str)))
    shuffled_data['targets'] = shuffled_data['PurposeCode'].map(label_to_id)

    # Positional, end-exclusive slices. Label-based .loc slices include both end
    # points, which made the test and validation parts share a row.
    train_data = shuffled_data.iloc[:n_train]
    test_data = shuffled_data.iloc[n_train:n_train+n_test]
    val_data = shuffled_data.iloc[n_train+n_test:]

    train_corpus, train_targets = train_data['LongDescription'].values, train_data['targets'].values
    test_corpus, test_targets = test_data['LongDescription'].values, test_data['targets'].values
    val_corpus, val_targets = val_data['LongDescription'].values, val_data['targets'].values
    return train_corpus, train_targets, test_corpus, test_targets ,val_corpus, val_targets

In [10]:
class Model(nn.Module):
    """BERT encoder followed by two independent MLP heads.

    `mlp_projection` maps the 768-d pooled output to a 100-d vector;
    `mlp_prediction` maps it to `num_cls` logits. `forward` selects the head
    through its `ce` flag.
    """

    def __init__(self,  bert,num_cls):
        """Load the pretrained encoder named `bert` and build both heads."""
        super().__init__()
        self.dim = 768
        self.encoder = BertModel.from_pretrained(bert)
        self.hidden = 100
        projection_layers = [
            nn.Linear(self.dim, self.hidden),
            nn.ReLU(),
            nn.Linear(self.hidden, self.hidden, bias=True),
        ]
        self.mlp_projection = nn.Sequential(*projection_layers)
        prediction_layers = [
            nn.Linear(self.dim, self.hidden),
            nn.ReLU(),
            nn.Linear(self.hidden, num_cls, bias=True),
        ]
        self.mlp_prediction = nn.Sequential(*prediction_layers)

    def forward(self, input_ids, attention_mask,ce=False):
        """Encode the batch and apply one head to the pooled output.

        Returns the class logits when `ce` is true, otherwise the projection.
        """
        pooled = self.encoder(input_ids=input_ids, attention_mask=attention_mask)['pooler_output']
        head = self.mlp_prediction if ce else self.mlp_projection
        return head(pooled)

In [11]:
class Model_wfreezing(nn.Module):
    """BERT encoder with two MLP heads, where no gradient reaches the encoder.

    Same layout as a fine-tuned BERT classifier (768-d pooled output feeding
    a 100-d projection head and a `num_cls`-way prediction head), but the
    pooled output is cut off from autograd, so only the heads are trained.
    """
    def __init__(self,  bert,num_cls):
        """Load the pretrained encoder named `bert` and build both heads."""
        # Zero-argument super(): the previous `super(Model, self)` raised
        # TypeError because this class does not derive from `Model`.
        super().__init__()
        self.dim = 768
        self.encoder = BertModel.from_pretrained(bert)
        self.hidden = 100
        self.mlp_projection =  nn.Sequential(nn.Linear(self.dim,self.hidden),
                                             nn.ReLU(),
                                             nn.Linear(self.hidden,self.hidden,bias=True))
        self.mlp_prediction =  nn.Sequential(nn.Linear(self.dim,self.hidden),
                                             nn.ReLU(),
                                             nn.Linear(self.hidden,num_cls,bias=True))
    def forward(self, input_ids, attention_mask,ce=False):
        """Encode the batch without gradients and apply one head.

        Returns the class logits when `ce` is true, otherwise the projection.
        """
        # The pooled output is detached anyway, so skip building the encoder's
        # autograd graph altogether.
        with torch.no_grad():
            output = self.encoder(input_ids = input_ids, attention_mask = attention_mask)
        embedding = output['pooler_output'].detach()
        if ce:
            return self.mlp_prediction(embedding)
        else:
            return self.mlp_projection(embedding)

In [12]:
class CRSdataset(Dataset):
    """Dataset of pre-tokenized texts paired with their targets.

    Every text is tokenized once in the constructor, padded or truncated to
    `max_len` tokens, and kept in memory.
    """

    def __init__(self, bert, targets, text_list, max_len = 512):
        """Tokenize each entry of `text_list` with the tokenizer named `bert`."""
        self.tokenizer = BertTokenizer.from_pretrained(bert)
        self.max_len = max_len
        self.targets = targets
        self.data = []
        for text in tqdm(text_list):
            encoded = self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,
                return_tensors='pt',
            )
            # Remove the singleton dimensions of the two tensors the model consumes.
            for key in ('input_ids', 'attention_mask'):
                encoded[key] = torch.squeeze(encoded[key])
            self.data.append(encoded)

    def __len__(self):
        """Number of tokenized texts."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(encoded_text, target)` for position `idx`."""
        return self.data[idx], self.targets[idx]

### BERT Training

In [13]:
import random
import os
# Device indices handed to DataParallel further down.
available_gpu = [0, 1]
# The labels are stored as strings of floats such as "12230.0"; turn them into ints.
data['PurposeCode'] = data['PurposeCode'].map(lambda code: int(float(code)))

In [14]:
# Label vocabulary: the position of a purpose code in this list is its class id.
n_cls = list(data['PurposeCode'].unique())

# Fix every RNG in play. torch was previously left unseeded, so the random
# initialisation of the MLP heads and the DataLoader shuffling order changed
# from run to run.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# 75% train / 25% test, no validation part.
train_corpus, train_targets, test_corpus, test_targets ,val_corpus, val_targets = train_test_split (data,0.75,0.25,0.0,n_cls)

In [15]:
# Candidate learning rates and epoch counts.
# NOTE(review): neither list is referenced by the cells visible here.
lr_list = [2e-5, 3e-5, 5e-5]
num_epochs = [1, 2, 3, 4]
# Mini-batch size used by the data loaders and the loss meter.
bsz = 8
# One output logit per distinct purpose code.
n_class = data['PurposeCode'].unique().size
model = Model(num_cls=n_class, bert='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Tokenize both splits up front; the validation split is currently not built.
trainds = CRSdataset(
    text_list=train_corpus,
    targets=train_targets,
    bert='bert-base-uncased',
)
testds = CRSdataset(
    text_list=test_corpus,
    targets=test_targets,
    bert='bert-base-uncased',
)

  0%|          | 0/45405 [00:00<?, ?it/s]

  0%|          | 0/15135 [00:00<?, ?it/s]

In [17]:
# Training batches are shuffled; the incomplete last batch is dropped so every
# optimisation step sees exactly `bsz` samples.
trainloader = DataLoader(trainds, batch_size=bsz, shuffle=True, num_workers=1, drop_last=True)
# Evaluation must cover every test sample, so the final partial batch is kept
# (drop_last=True silently excluded it from all reported metrics).
testloader = DataLoader(testds, batch_size=bsz, shuffle=False, num_workers=1, drop_last=False)

In [18]:
import sklearn.metrics
def get_res(model,dataloader):
    """Run `model` over `dataloader` and compute classification metrics.

    The prediction head is used (`ce=True`) and the loss comes from the
    module-level `ce_criterion`. Gradients are disabled inside this function,
    so it is safe to call outside a `torch.no_grad()` block. The function does
    not switch the model to eval mode; the caller decides that.

    Parameters
    ----------
    model : torch.nn.Module
        Called as `model(input_ids, attention_mask, ce=True)`.
    dataloader : iterable
        Yields `(inputs, targets)` where `inputs` has the keys 'input_ids'
        and 'attention_mask'.

    Returns
    -------
    tuple
        `(precision, recall, f1, accuracy, auc, loss)`. Precision, recall and
        F1 are macro-averaged, AUC is one-vs-rest, and `loss` is a 0-d tensor
        holding the mean loss per sample.

    Raises
    ------
    ValueError
        If `dataloader` yields no batches.
    """
    # Evaluate on whatever device the model lives on, not a hard-coded GPU.
    device = next(model.parameters()).device
    all_outputs = []
    test_y = []
    loss_sum = 0.0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in tqdm(dataloader):
            input_ids = inputs['input_ids'].long().to(device)
            attention_mask = inputs['attention_mask'].long().to(device)
            targets = targets.long()
            output = model(input_ids,attention_mask,ce=True)
            batch_loss = ce_criterion(output,targets.to(device))
            # Weight each batch by its size so a smaller final batch does not
            # skew the average (assumes ce_criterion returns a per-batch mean).
            loss_sum += batch_loss.item() * targets.size(0)
            n_seen += targets.size(0)
            all_outputs.append(output.cpu())
            test_y.append(targets)
    if n_seen == 0:
        raise ValueError("dataloader yielded no batches")
    all_outputs = torch.cat(all_outputs)
    # Kept as a 0-d tensor so callers can still use `.item()` on it.
    all_losses = torch.tensor(loss_sum / n_seen)
    test_y = torch.cat(test_y)
    val_preds = all_outputs.softmax(dim=1)
    pred_y = val_preds.argmax(axis=1)
    print(pred_y.shape,test_y.shape)
    # Hand plain NumPy arrays to scikit-learn.
    y_true, y_pred, y_score = test_y.numpy(), pred_y.numpy(), val_preds.numpy()
    acc = sklearn.metrics.accuracy_score(y_pred=y_pred,y_true=y_true)
    f1 = sklearn.metrics.f1_score(y_true=y_true,y_pred=y_pred,average='macro')
    auc = sklearn.metrics.roc_auc_score(y_true=y_true,y_score=y_score,multi_class='ovr')
    prec = sklearn.metrics.precision_score(y_true=y_true,y_pred=y_pred,average='macro')
    recall = sklearn.metrics.recall_score(y_true=y_true,y_pred=y_pred,average='macro')
    return (prec, recall, f1, acc, auc, all_losses)
from torch.optim import Adam,SGD
# Loss used by both the training loop and the evaluation helper.
ce_criterion = nn.CrossEntropyLoss()
# Replicate the model across the listed GPUs when more than one is visible.
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model, device_ids=available_gpu)
model = model.cuda()

In [19]:
# Adam over all parameters of `model`, with a fixed learning rate of 3e-5.
optimizer = Adam(model.parameters(), lr=3e-5)
import pandas as pd
import glob
class AverageMeter:
    """Tracks the latest value and the running weighted average of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the average, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += n * val
        self.avg = self.sum / self.count
# Per-epoch metric tuples. The training loop appends to `val_epoch_res`;
# `epoch_res` is not appended to by any active code visible here.
epoch_res = []
val_epoch_res = []

In [None]:
# Fine-tune for `n_epoch` epochs with cross-entropy on the prediction head and
# evaluate on the test loader after every epoch.
n_epoch = 10
for epoch in range(n_epoch):
    start_time = time.time()
    model.train()
    losses = AverageMeter()
    tbar= tqdm(trainloader)
    for inputs, targets in tbar:
        input_ids = inputs['input_ids'].long().cuda()
        attention_mask = inputs['attention_mask'].long().cuda()
        targets = targets.long().cuda()
        output = model(input_ids,attention_mask,ce=True)
        celoss = ce_criterion(output,targets)
        del targets, input_ids,output, attention_mask
        loss =celoss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Every training batch holds exactly `bsz` samples (drop_last=True).
        losses.update(loss.item(), bsz)
        tbar.set_description("loss: {}".format(losses.avg), refresh=True)
    model.eval()
    with torch.no_grad():
        # val_res = (prec, recall, f1, acc, auc, loss)
        val_res = get_res(model,testloader)
        print(val_res)
    val_epoch_res.append(val_res)
    elapsed_time = time.time() - start_time 
    # Report against this loop's own epoch count `n_epoch`; the log previously
    # printed the unrelated config constant `n_epochs`.
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f}  \t val_acc={:.4f} \t f1={:.4f} \t auc={:.4f} \t time={:.2f}s'.format(
            epoch + 1, n_epoch, losses.avg, val_res[-1], val_res[3], val_res[2], val_res[4],elapsed_time))
    # Optional per-epoch checkpointing (disabled):
    # torch.save(model.state_dict(),f'emnlp_2022_res/_baselineCE_binary_model_{epoch}_622_re.pth')
    # torch.save(val_epoch_res,f'emnlp_2022_res/_baselineCE_binary_result_622_val_re.pth')

  0%|          | 0/5675 [00:00<?, ?it/s]

In [None]:
# Each entry of val_epoch_res is (prec, recall, f1, acc, auc, loss).
# Create the output directory first, so that a missing folder cannot make the
# save fail after training has finished.
os.makedirs('results', exist_ok=True)
# float() turns the loss (a 0-d tensor or a plain number) into a Python float.
resdf = pd.DataFrame([list(v[:-1])+[float(v[-1])] for v in val_epoch_res],columns=['precision','recall','f1','accuracy','auc','loss'])
resdf.to_csv('results/bert_result.csv')
# NOTE(review): if the model was wrapped in DataParallel, the saved keys carry a
# 'module.' prefix — confirm this matches whatever code loads the checkpoint.
torch.save(model.state_dict(),'results/bert_model.pth')
