In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
#from pytorch_transformers import BertTokenizer
from transformers import BertTokenizer
# Download the NLTK 'punkt' and 'stopwords' resources.
nltk.download('punkt')
nltk.download('stopwords')
# Module-level bert-base-uncased tokenizer (lower-casing enabled); Tokenization()
# reads this global instead of taking a tokenizer argument.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def preprocessing(documents):
    """Clean raw sentences for the word-level (BERT) pipeline.

    Keeps only ASCII letters, digits and spaces, then collapses runs of
    spaces into a single space.

    Args:
        documents: pandas Series of strings.

    Returns:
        list with one cleaned entry per input row.
    """
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which silently turned this step into a no-op.
    cleaned = documents.str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
    # The former second punctuation pass is dropped on purpose: once the line
    # above has run nothing but [a-zA-Z0-9 ] is left, and its character class
    # contained the accidental range "&-=" which also deleted the digits 0-9.
    cleaned = cleaned.str.replace(r" +", " ", regex=True)  # collapse repeated spaces
    return cleaned.tolist()

In [3]:
def pad_sequences(sequences, maxlen):
    """Right-pad with zeros, or truncate, so every sequence has exactly ``maxlen`` items.

    Args:
        sequences: iterable of sequences (lists, tuples, ...).
        maxlen: target length of every returned row.

    Returns:
        list of lists, each of length ``maxlen``.
    """
    pad_seq = []
    for sequence in sequences:
        # Truncate to maxlen (the previous `maxlen - 1` slice produced rows one
        # item shorter than the padded ones, i.e. a ragged result).
        clipped = list(sequence)[:maxlen]
        pad_seq.append(clipped + [0] * (maxlen - len(clipped)))

    return pad_seq

In [4]:
def Tokenization(max_len, train_x, test_x):
    """Encode the train and test sentences with the module-level ``tokenizer``.

    Both splits are concatenated and encoded in a single ``batch_encode_plus``
    call using ``padding='longest'``, then split back at the original boundary.

    Args:
        max_len: accepted for interface compatibility; not used by this function.
        train_x: list of training rows (concatenated with ``test_x`` via ``+``).
        test_x: list of test rows.

    Returns:
        (train input ids, test input ids, train attention mask, test attention mask)
    """
    split_at = len(train_x)  # boundary between the two splits
    dataset = train_x + test_x  # encode train and test together
    print('총 데이터셋 크기 : ', len(dataset))
    texts = np.array(dataset).squeeze().tolist()

    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='longest',
    )
    input_ids = encoded['input_ids']
    attention = encoded['attention_mask']

    return input_ids[:split_at], input_ids[split_at:], attention[:split_at], attention[split_at:]

In [5]:
def load_data():
    """Read the training and evaluation CSV files.

    Returns:
        (train sentences, train categories, evaluation sentences), taken from
        the 'Sentence' and 'Category' columns.
    """
    train_frame = pd.read_csv("train_final.csv")
    eval_frame = pd.read_csv("eval_final_open.csv")
    # Classification is based on the sentence text.
    return train_frame['Sentence'], train_frame['Category'], eval_frame['Sentence']

In [7]:
# Two-stage preparation of the BERT inputs, selected by `step1_flag`:
#   False -> clean the raw sentences and cache them as temp_*.csv
#   True  -> reload the cached sentences, tokenize them and write final_*.csv
train_x, train_y, test_x = load_data()
step1_flag = True
if not step1_flag:
    print(train_x[:10])

    train_x, test_x = preprocessing(train_x), preprocessing(test_x)
    # to_csv is called without index=False, so the index is written as column 0.
    pd.DataFrame(train_x).to_csv('temp_train_x.csv')
    pd.DataFrame(train_y).to_csv('temp_train_y.csv')
    pd.DataFrame(test_x).to_csv('temp_test_x.csv')
else:
    # Temporary reload of the cached, cleaned sentences.
    train_x, test_x = pd.read_csv('temp_train_x.csv'), pd.read_csv('temp_test_x.csv')
    # Skip column 0 (the saved index) and keep the sentence column as nested lists.
    train_x = train_x.iloc[:,1:].values.tolist()
    test_x = test_x.iloc[:,1:].values.tolist()

    # NOTE(review): Tokenization() pads with padding='longest' and does not read
    # max_len, so this value currently has no effect.
    max_len = 70
    x_train, x_test, train_atmask, test_atmask = Tokenization(max_len, train_x, test_x)
    pd.DataFrame(x_train).to_csv('final_train_x.csv')
    pd.DataFrame(x_test).to_csv('final_test_x.csv')
    pd.DataFrame(train_atmask).to_csv('final_train_atmask.csv')
    pd.DataFrame(test_atmask).to_csv('final_test_atmask.csv')

총 데이터셋 크기 :  16964


In [7]:
def preprocessing_char(documents):
    """Clean raw sentences for the character-level CNN.

    Deletes the listed accented lower-case letters and every whitespace
    character, then lower-cases what is left.

    Args:
        documents: pandas Series of strings.

    Returns:
        list with one cleaned entry per input row.
    """
    # Raw string + regex=True: the pattern is a regular expression, but pandas
    # >= 2.0 treats it as a literal by default (making the call a no-op), and
    # "\s" is an invalid escape sequence in a normal string literal.
    # \s already covers spaces, tabs and newlines.
    stripped = documents.str.replace(r"[ãàáâçèéíïñóöôûüæ\s]", "", regex=True)
    return stripped.str.lower().tolist()

In [8]:
def Tokenization_char(max_len, train_x, test_x, alphabet ="""abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'"\\/|_@#$%^&*~`+-=<>()[]{}"""):
    """One-hot encode sentences character by character.

    Every sentence becomes a ``(len(alphabet) + 12, 300)`` matrix in which
    column ``i`` has a single 1 in the row of the i-th character. Columns past
    the end of the sentence stay all-zero.

    Args:
        max_len: accepted for interface compatibility but overridden below.
        train_x: list of training rows (one sentence per row).
        test_x: list of test rows.
        alphabet: characters that can be encoded; the row index of a character
            is its first position in this string.

    Returns:
        (train array, test array), split at ``len(train_x)``.
    """
    n_channels = len(alphabet) + 12  # 12 spare all-zero rows, as in the original layout
    div = len(train_x)  # split point between train and test
    dataset = train_x + test_x  # encode train and test together
    # ravel() instead of squeeze(): squeeze() collapses a single-sample dataset
    # to a bare string, which was then iterated character by character.
    texts = np.asarray(dataset).ravel().tolist()

    # NOTE(review): the sequence length is fixed to 300 regardless of the
    # argument; train() builds CharacterLevelCNN with max_len=300, so honoring
    # the argument would require changing callers as well.
    max_len = 300

    # First-occurrence lookup, equivalent to alphabet.find(c) ('-' is listed twice).
    char_to_row = {}
    for position, symbol in enumerate(alphabet):
        char_to_row.setdefault(symbol, position)

    encode_data_set = np.zeros((len(dataset), n_channels, max_len))
    for idx, text in enumerate(texts):
        for i, c in enumerate(text):
            if i >= max_len:
                break
            row = char_to_row.get(c)
            if row is None:
                # Unknown character: report it and stop encoding this sentence;
                # the remaining columns stay zero (existing behavior, kept).
                print(c)
                print("error")
                break
            encode_data_set[idx, row, i] = 1.0

        if idx == 0:
            print(encode_data_set[idx])

    train_result = encode_data_set[:div]
    test_result = encode_data_set[div:]

    return train_result, test_result

In [9]:
# Two-stage preparation of the character-level inputs, selected by `step1_flag`:
#   False -> clean the raw sentences and cache them as char_temp_*.csv
#   True  -> reload the cached sentences, one-hot encode them and pickle the arrays
train_x, train_y, test_x = load_data()
step1_flag = True
if not step1_flag:

    train_x, test_x = preprocessing_char(train_x), preprocessing_char(test_x)
    # to_csv is called without index=False, so the index is written as column 0.
    pd.DataFrame(train_x).to_csv('char_temp_train_x.csv')
    pd.DataFrame(train_y).to_csv('char_temp_train_y.csv')
    pd.DataFrame(test_x).to_csv('char_temp_test_x.csv')
else:
    # Temporary reload of the cached, cleaned sentences.
    train_x, test_x = pd.read_csv('char_temp_train_x.csv'), pd.read_csv('char_temp_test_x.csv')
    # Skip column 0 (the saved index) and keep the sentence column as nested lists.
    train_x = train_x.iloc[:,1:].values.tolist()
    test_x = test_x.iloc[:,1:].values.tolist()
    print(train_x[0])
    print(train_x[1])
    print(train_x[2])

    # NOTE(review): Tokenization_char() overrides max_len with 300 internally.
    max_len = 70
    x_train, x_test = Tokenization_char(max_len, train_x, test_x)
    print(x_train)
    # Context managers make sure the pickle files are flushed and closed
    # (the previous bare open(...) calls left the handles to the garbage collector).
    with open("./char_final_train_x.pkl", "wb") as train_file:
        pickle.dump(x_train, train_file)
    with open("./char_final_test_x.pkl", "wb") as test_file:
        pickle.dump(x_test, test_file)

["-lrb-thefilm-rrb-tacklesthetopicofrelationshipsinsuchastraightforward,emotionallyhonestmannerthatbytheend,it'simpossibletoascertainwhetherthefilmis,atitscore,deeplypessimisticorquietlyhopeful."]
['lavishly,exhilaratinglytasteless.']
['itisalsobeautifullyacted.']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  .

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
#from mxnet.gluon import nn
import pandas as pd
import numpy as np
import pickle
import copy
#from pytorch_transformers import BertConfig, BertModel
from transformers import BertConfig, BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

In [2]:
# Prefer the GPU but fall back to the CPU so the notebook still runs without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class CharacterLevelCNN(nn.Module):
    """Character-level CNN feature extractor.

    Six 1-D convolution blocks followed by two fully connected layers. The
    forward pass returns the 1024-dimensional output of ``fc2``; ``fc3`` is
    created (and therefore part of the state dict) but not applied in
    ``forward``.

    Args:
        number_of_characters: number of input channels (rows of the one-hot matrix).
        max_len: sequence length, used only to size the first linear layer.
        dropout: probability of the channel-wise input dropout.
        number_of_classes: output size of the (currently unused) ``fc3`` layer.
        batch_size: kept for backward compatibility; it does not influence the model.
    """

    def __init__(self, number_of_characters, max_len, dropout=0.1, number_of_classes=5, batch_size = 32):
        super(CharacterLevelCNN, self).__init__()

        # Channel-wise dropout on a (batch, channels, length) input. Dropout2d
        # only handles 3-D input through a deprecated code path, so use
        # Dropout1d when this torch version provides it. Neither layer has
        # parameters, so saved checkpoints are unaffected.
        channel_dropout = getattr(nn, "Dropout1d", nn.Dropout2d)
        self.dropout_input = channel_dropout(dropout)

        # define conv layers
        self.conv1 = nn.Sequential(
            nn.Conv1d(number_of_characters ,256,kernel_size=7, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(3)
        )

        self.conv2 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=7, padding=0), nn.ReLU(), nn.MaxPool1d(3))
        self.conv3 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU())
        self.conv4 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU())
        self.conv5 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU())
        self.conv6 = nn.Sequential(nn.Conv1d(256, 256, kernel_size=3, padding=0), nn.ReLU(), nn.MaxPool1d(3))

        # Compute the flattened size after the conv stack. The size per sample
        # does not depend on the batch dimension, so a single sample is enough.
        input_shape = (1, max_len, number_of_characters)
        self.output_dimension = self._get_conv_output(input_shape)

        # define linear layers
        self.fc1 = nn.Sequential(nn.Linear(self.output_dimension, 1024), nn.ReLU(), nn.Dropout(0.1))
        self.fc2 = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Dropout(0.1))
        self.fc3 = nn.Linear(1024, number_of_classes)

        # initialize weights
        self._create_weights()

    # utility private functions

    def _create_weights(self, mean=0.0, std=0.05):
        """Draw every Conv1d/Linear weight from N(mean, std); biases are left untouched."""
        for module in self.modules():
            if isinstance(module, (nn.Conv1d, nn.Linear)):
                module.weight.data.normal_(mean, std)

    def _get_conv_output(self, shape):
        """Return the flattened feature size per sample for an input of ``shape``.

        ``shape`` is (batch, length, channels); it is transposed to the
        (batch, channels, length) layout the convolutions expect.
        """
        # Shape probing only: no autograd graph is needed.
        with torch.no_grad():
            x = torch.rand(shape)
            x = x.transpose(1, 2)
            x = self.conv1(x)
            x = self.conv2(x)
            x = self.conv3(x)
            x = self.conv4(x)
            x = self.conv5(x)
            x = self.conv6(x)
            x = x.view(x.size(0), -1)
        output_dimension = x.size(1)

        return output_dimension

    # forward

    def forward(self, x):
        """Map a (batch, channels, length) tensor to (batch, 1024) features."""
        x = self.dropout_input(x)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.fc2(x)
        # fc3 is intentionally not applied: callers consume the 1024-d features.

        return x

In [4]:
class MyBertCNNClassifier(nn.Module):
    """Classifier on top of concatenated BERT and character-CNN features.

    The second element of the BERT output (768 values per sample) is
    concatenated with the 1024-d char-CNN output and passed through an MLP head.

    Args:
        bert: module called as ``bert(x, attention_mask=...)``; element ``[1]``
            of its output must be a (batch, 768) tensor.
        charcnn: module mapping the character input to a (batch, 1024) tensor.
        num_classes: number of output logits.
        num_classifier: stored on the instance; not used by this class.
        dropout: dropout probability of the head; ``None`` disables dropout.
    """

    def __init__(self, bert, charcnn, num_classes=5, num_classifier=1, dropout=None):
        super(MyBertCNNClassifier, self).__init__()
        self.bert = bert
        self.charcnn =charcnn
        self.num_classes = num_classes
        self.num_classifier = num_classifier

        # nn.Dropout(p=None) raises, so the documented default has to be
        # translated into "no dropout".
        drop_p = 0.0 if dropout is None else dropout

        # Layer order is kept as-is so that state-dict keys (classifiers.N.*)
        # still match previously saved checkpoints.
        self.classifiers = nn.Sequential(
            nn.BatchNorm1d(768 + 1024),
            nn.Linear(768 + 1024, 1024),
            nn.LeakyReLU(),
            nn.Dropout(p = drop_p),
            nn.Linear(1024, 128),
            nn.LeakyReLU(),
            nn.Dropout(p = drop_p),
            nn.Linear(128, num_classes),
        )

        # Stored as a list: a bare generator can be consumed only once and
        # makes the module impossible to deepcopy or pickle.
        self.classifiers_parameters = list(self.classifiers.parameters())

    def forward(self, x, atmask, _x):
        """Return (batch, num_classes) logits.

        Args:
            x: token ids passed to ``self.bert``.
            atmask: attention mask passed to ``self.bert``.
            _x: character-level input passed to ``self.charcnn``.
        """
        bert_out = self.bert(x, attention_mask = atmask)
        charcnn_out = self.charcnn(_x)
        cat_in = torch.hstack([bert_out[1], charcnn_out])
        pred = self.classifiers(cat_in)
        return pred

In [5]:
class MyBertCNNClassifier2(nn.Module):
    """BERT classifier with an additional, separately applied char-CNN head.

    ``forward`` classifies from the BERT features only; ``predict`` lets the
    BERT head and the char-CNN head vote.

    Args:
        bert: module called as ``bert(x, attention_mask=...)``; element ``[1]``
            of its output must be a (batch, 768) tensor.
        charcnn: module mapping the character input to a (batch, 1024) tensor.
        num_classes: number of output logits per head.
        num_classifier: stored on the instance; not used by this class.
        dropout: dropout probability of both heads; ``None`` disables dropout.
    """

    def __init__(self, bert, charcnn, num_classes=5, num_classifier=1, dropout=None):
        super(MyBertCNNClassifier2, self).__init__()
        self.bert = bert
        self.charcnn =charcnn
        self.num_classes = num_classes
        self.num_classifier = num_classifier

        # nn.Dropout(p=None) raises, so the documented default has to be
        # translated into "no dropout".
        drop_p = 0.0 if dropout is None else dropout

        # Head for the 768-d BERT features.
        self.classifier = nn.Sequential(
            nn.Dropout(p = drop_p),
            nn.Linear(768, num_classes))

        # Head for the 1024-d char-CNN features.
        self.classifier_char = nn.Sequential(
            nn.Dropout(p = drop_p),
            nn.Linear(1024, 1024),
            nn.LeakyReLU(),
            nn.Dropout(p = drop_p),
            nn.Linear(1024, 128),
            nn.LeakyReLU(),
            nn.Dropout(p = drop_p),
            nn.Linear(128, num_classes))

        # Attribute names are kept so state-dict keys stay compatible.
        self.classifiers = torch.nn.ModuleList([self.classifier, self.classifier_char])
        # Stored as a list: a bare generator can be consumed only once and
        # makes the module impossible to deepcopy or pickle.
        self.classifiers_parameters = list(self.classifiers.parameters())

    def forward(self, x, atmask, _x):
        """Return (batch, num_classes) logits from the BERT head only.

        ``_x`` is accepted for signature parity with the other classifier but
        is not used here.
        """
        bert_out = self.bert(x, attention_mask = atmask)
        pred = self.classifier(bert_out[1])
        return pred

    def predict(self, x, atmask, _x):
        """Return one predicted class index per sample by voting over both heads.

        Each head receives the features it was sized for (BERT features for
        ``classifier``, char-CNN features for ``classifier_char``); feeding the
        concatenated 1792-d vector to both heads, as before, fails with a
        shape mismatch. With two voters ``torch.mode`` returns the smaller
        index when they disagree.
        """
        with torch.no_grad():
            bert_out = self.bert(x, attention_mask = atmask)
            charcnn_out = self.charcnn(_x)
            head_inputs = [bert_out[1], charcnn_out]
            votes = torch.hstack([
                torch.argmax(head(features), dim = 1).view(-1, 1)
                for head, features in zip(self.classifiers, head_inputs)
            ])
            pred, _ = torch.mode(votes)

        return pred

In [6]:
def train_one_epoch(model, N, lossfn, optimizer,scheduler, X, atmask, _X, Y, batch_size=16):
    """Run one optimisation pass over the first ``N`` samples.

    Args:
        model: module called as ``model(token_ids, attention_mask, char_input)``.
        N: number of samples to train on.
        lossfn: loss called as ``lossfn(logits, labels)``.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        X: token ids, sliceable along the first axis.
        atmask: attention masks aligned with ``X``.
        _X: character-level inputs aligned with ``X``.
        Y: integer labels aligned with ``X``.
        batch_size: samples per batch.

    Returns:
        (train_loss, train_acc). ``train_acc`` is the fraction of correct
        predictions. NOTE(review): ``train_loss`` is the sum of the per-batch
        loss values divided by ``N``; assuming ``lossfn`` returns a batch mean,
        that is roughly mean loss / batch_size. Kept unchanged so the numbers
        stay comparable with evaluate_one_epoch and earlier logs.
    """
    if N <= 0:
        return 0.0, 0.0

    model.train()
    # Put the batches on whatever device the model lives on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_loss, train_acc = 0.0, 0.0
    # Stepping by batch_size never yields an empty trailing batch (the old
    # range(nb + 1) loop did whenever N was a multiple of batch_size).
    for start in range(0, N, batch_size):
        stop = min(start + batch_size, N)
        batch_train_x = torch.as_tensor(X[start:stop]).long().to(run_device)
        batch_atmask = torch.as_tensor(atmask[start:stop]).long().to(run_device)
        batch_train_char_x = torch.as_tensor(_X[start:stop]).float().to(run_device)
        batch_train_y = torch.as_tensor(Y[start:stop]).long().to(run_device)

        optimizer.zero_grad()
        logits = model(batch_train_x, batch_atmask, batch_train_char_x)
        loss = lossfn(logits, batch_train_y)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        # No model.eval() here: switching modes inside the loop left every
        # batch after the first one training with dropout disabled and
        # BatchNorm using running statistics. argmax needs no mode switch.
        pred_labels = torch.argmax(logits, dim = 1)
        train_acc += (pred_labels == batch_train_y).sum().item()

        # Release the batch tensors before the next allocation.
        del batch_train_x
        del batch_atmask
        del batch_train_char_x
        del batch_train_y
        torch.cuda.empty_cache()

    train_loss /= N
    train_acc /= N

    return train_loss, train_acc

In [7]:
def evaluate_one_epoch(model, N, lossfn, X, atmask, _X,  Y, batch_size=8):
    """Compute loss and accuracy over the first ``N`` samples without training.

    Args:
        model: module called as ``model(token_ids, attention_mask, char_input)``.
        N: number of samples to evaluate.
        lossfn: loss called as ``lossfn(logits, labels)``.
        X: token ids, sliceable along the first axis.
        atmask: attention masks aligned with ``X``.
        _X: character-level inputs aligned with ``X``.
        Y: integer labels aligned with ``X``.
        batch_size: samples per batch.

    Returns:
        (loss, acc). ``acc`` is the fraction of correct predictions.
        NOTE(review): ``loss`` is the sum of the per-batch loss values divided
        by ``N`` (kept unchanged for comparability with train_one_epoch).
    """
    if N <= 0:
        return 0.0, 0.0

    model.eval()
    # Put the batches on whatever device the model lives on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    loss, acc = 0.0, 0.0
    # no_grad: evaluation does not need autograd graphs. Stepping by
    # batch_size avoids the empty trailing batch (and its NaN mean loss) that
    # occurred whenever N was a multiple of batch_size.
    with torch.no_grad():
        for start in range(0, N, batch_size):
            stop = min(start + batch_size, N)
            batch_valid_x = torch.as_tensor(X[start:stop]).long().to(run_device)
            batch_atmask = torch.as_tensor(atmask[start:stop]).long().to(run_device)
            batch_valid_char_x = torch.as_tensor(_X[start:stop]).float().to(run_device)
            batch_valid_y = torch.as_tensor(Y[start:stop]).long().to(run_device)

            logits = model(batch_valid_x, batch_atmask, batch_valid_char_x)

            loss += lossfn(logits, batch_valid_y).item()

            pred_labels = torch.argmax(logits, dim = 1)
            acc += (pred_labels == batch_valid_y).sum().item()

            # Release the batch tensors before the next allocation.
            del batch_valid_x
            del batch_atmask
            del batch_valid_char_x
            del batch_valid_y
            torch.cuda.empty_cache()

    loss /= N
    acc /= N

    return loss, acc

In [8]:
def test_one_epoch(model, N, lossfn, X, atmask, _X, batch_size=8):
    #generator = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model.eval()
    test_y = None
    nb = N // batch_size
    for i in range(nb + 1):
        if i == nb:
            batch_test_x, batch_atmask, batch_test_char_x = torch.tensor(X[batch_size * nb:]).long().to(device), torch.tensor(atmask[batch_size * nb:]).long().to(device), torch.tensor(_X[batch_size * nb:]).float().to(device)
        else:
            batch_test_x, batch_atmask, batch_test_char_x = torch.tensor(X[batch_size * i:batch_size * (i+1)]).long().to(device), torch.tensor(atmask[batch_size * i:batch_size * (i+1)]).long().to(device), torch.tensor(_X[batch_size * i:batch_size * (i+1)]).float().to(device)

        logits = model(batch_test_x, batch_atmask, batch_test_char_x)
        batch_test_y = torch.argmax(logits, dim = 1).detach().cpu().numpy()
        if i == 0:
            test_y = batch_test_y
        else:
            test_y = np.concatenate([test_y, batch_test_y])
            
        del batch_test_x
        del batch_atmask
        del batch_test_char_x
        torch.cuda.empty_cache()

    return test_y.reshape(-1)

In [9]:
def train(binary=False, bert="bert-base-uncased", epochs=100, batch_size=8, train_prop = 0.98, alphabet = """abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'"\\/|_@#$%^&*~`+-=<>()[]{}"""):
    """Fine-tune the BERT + char-CNN classifier and write one submission file per epoch.

    Reads the cached inputs (final_*.csv, temp_train_y.csv, char_final_*.pkl),
    shuffles the labelled data, holds out the last ``1 - train_prop`` fraction
    for validation, restores ./pretrained_BERTCNN15.pth and trains for
    ``epochs`` epochs. After every epoch the test predictions are saved as
    submission<epoch>.csv.

    Args:
        binary: when False, ``num_labels`` of the BERT config is set to 5.
        bert: name or path given to ``from_pretrained``.
        epochs: number of training epochs.
        batch_size: batch size for training, validation and test prediction.
        train_prop: fraction of the labelled data used for training.
        alphabet: character set; only its length is used here, to size the
            char-CNN input channels (``len(alphabet) + 12``).
    """
    def _read_cached(path):
        # Column 0 is the index column saved with the file; drop it.
        return np.array(pd.read_csv(path).iloc[:, 1:])

    data_x, test_x, data_y = _read_cached('final_train_x.csv'), _read_cached('final_test_x.csv'), _read_cached('temp_train_y.csv')
    data_atmask, test_atmask = _read_cached('final_train_atmask.csv'), _read_cached('final_test_atmask.csv')
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # load pickles produced by this project's own preprocessing step.
    with open('char_final_train_x.pkl', 'rb') as train_file:
        data_char_x = np.asarray(pickle.load(train_file))
    with open('char_final_test_x.pkl', 'rb') as test_file:
        test_char_x = pickle.load(test_file)

    data_len = data_x.shape[0]
    test_len = test_x.shape[0]
    if not (len(data_char_x) == len(data_atmask) == len(data_y) == data_len):
        raise ValueError("cached training inputs do not have the same number of rows")
    train_div = int(data_len * train_prop)

    # Shuffle all aligned arrays with one shared permutation (replaces the
    # list-of-tuples round trip, which built a Python object per sample).
    order = np.random.permutation(data_len)
    data_x, data_char_x, data_atmask, data_y = data_x[order], data_char_x[order], data_atmask[order], data_y[order]
    train_x, train_atmask, train_char_x, train_y = data_x[:train_div], data_atmask[:train_div], data_char_x[:train_div], data_y[:train_div].reshape(-1)
    valid_x, valid_atmask, valid_char_x, valid_y = data_x[train_div:], data_atmask[train_div:], data_char_x[train_div:], data_y[train_div:].reshape(-1)

    config = BertConfig.from_pretrained(bert)
    if not binary:
        config.num_labels = 5
    bert_model = BertModel.from_pretrained(bert, config=config)
    charcnn_model = CharacterLevelCNN(number_of_characters = len(alphabet) + 12, max_len = 300)

    model = MyBertCNNClassifier(bert_model, charcnn_model, num_classes=5, dropout=0.1)
    # map_location lets a checkpoint saved on a GPU be restored on the
    # currently selected device as well.
    model.load_state_dict(torch.load("./pretrained_BERTCNN15.pth", map_location=device))

    lossfn = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
    # Upper bound on the number of optimizer steps: at most
    # train_div // batch_size + 1 batches per epoch.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps= (train_div // batch_size + 1)  * epochs)
    model.to(device)

    # range(1, epochs + 1): the previous range(1, epochs) ran one epoch fewer
    # than requested and than the scheduler was sized for.
    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_one_epoch(model, train_div, lossfn, optimizer, scheduler, train_x , train_atmask, train_char_x, train_y, batch_size=batch_size)
        val_loss, val_acc = evaluate_one_epoch(model, data_len - train_div, lossfn,  valid_x, valid_atmask, valid_char_x, valid_y, batch_size=batch_size)
        test_out = test_one_epoch(model, test_len, lossfn, test_x,test_atmask, test_char_x, batch_size=batch_size)
        sub_data = pd.DataFrame(test_out.reshape((-1,1)), columns=['Category'])

        print(f"epoch={epoch}")
        print(f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")
        print(f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}")
        sub_data.to_csv('submission'+str(epoch)+'.csv', index_label = ['Id'])

In [None]:
# Start fine-tuning with the default arguments of train().
train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch=1
train_loss=0.0187, val_loss=0.0129
train_acc=0.973, val_acc=0.976
epoch=2
train_loss=0.0179, val_loss=0.0077
train_acc=0.972, val_acc=0.976
epoch=3
train_loss=0.0129, val_loss=0.0054
train_acc=0.979, val_acc=0.988
