In [1]:
import torch
import pandas as pd
import numpy as np
from torch import nn
from transformers import BertModel
from transformers import BertTokenizer

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear head and a ReLU.

    Args:
        dropout: Drop probability applied to the encoder's pooled output.
        num_classes: Number of outputs of the linear head. Defaults to 5, the
            value that used to be hard-coded. The ``labels`` table in this
            notebook has 16 entries, so a model built for that table needs
            ``num_classes=16``.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, dropout=0.5, num_classes=5, pretrained_name='bert-large-uncased'):

        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the loaded encoder's config rather than hard-coding 1024,
        # so a checkpoint with a different hidden size also works.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Run the encoder and return the ReLU-activated head output.

        Args:
            input_id: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Output of ``linear`` after ReLU, i.e. non-negative scores with
            ``num_classes`` entries along the last dimension.
        """
        # return_dict=False makes the encoder return a tuple; the second item is
        # taken as the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to 0 before any argmax. Left in place
        # because a model pickled from this class presumably was trained with it — confirm
        # before removing.
        final_layer = self.relu(linear_output)

        return final_layer

In [2]:
import torch

# SECURITY: this file is unpickled as a whole model object, and unpickling can execute
# arbitrary code. Only load checkpoints that come from a trusted source.
# weights_only=False is spelled out because newer PyTorch releases default torch.load to
# weights_only=True, which refuses to unpickle a full nn.Module.
# The BertClassifier class must already be defined in the kernel so the pickle can resolve it.
model = torch.load(
    './data/model/largemodel_sample1000_epoch12.pt',
    map_location=torch.device('cpu'),
    weights_only=False,
)
# Switch to inference mode (turns dropout off). As the cell's last expression this also
# displays the module tree.
model.eval()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, e

In [3]:
# Tokenizer for 'bert-large-uncased', the same checkpoint name BertClassifier loads its encoder from.
# Used as a module-level global by the Dataset class below.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# MBTI type name -> integer class id, numbered in the order listed (0..15).
labels = {
    mbti: class_id
    for class_id, mbti in enumerate((
        "INFJ", "INTJ", "INFP", "INTP",
        "ENFJ", "ENTJ", "ENFP", "ENTP",
        "ISFJ", "ISTJ", "ISFP", "ISTP",
        "ESFJ", "ESTJ", "ESFP", "ESTP",
    ))
}

# Inverse table (class id -> MBTI name), used to print predictions as type names.
resultLabels = dict(zip(labels.values(), labels.keys()))

class Dataset(torch.utils.data.Dataset):
    """Pairs each tokenized post with its integer class id.

    Reads the module-level ``labels`` table and ``tokenizer``; ``df`` must
    provide a ``'type'`` column (label names) and a ``'posts'`` column (texts).
    """

    def __init__(self, df):
        # Label names -> class ids; a name missing from ``labels`` raises KeyError.
        self.labels = list(map(labels.__getitem__, df['type']))

        # Each post is padded/truncated to 512 tokens and returned as PyTorch tensors.
        encoded_posts = []
        for post in df['posts']:
            encoded_posts.append(
                tokenizer(post,
                          padding='max_length', max_length=512, truncation=True,
                          return_tensors="pt")
            )
        self.texts = encoded_posts

    def classes(self):
        """Return the list of class ids, one per row."""
        return self.labels

    def __len__(self):
        """Number of rows."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by ``idx``."""
        selected = self.texts[idx]
        return selected

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label_array)`` for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [4]:
import re

def prediction(model, text):
    """Predict the class id for a single piece of text.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns per-class
            scores with classes along dim 1. It is moved to the GPU when one is
            available and put into eval mode.
        text: Raw input string.

    Returns:
        Tensor with the argmax class index of the single input row; callers
        read it with ``.item()``.
    """
    # Replace special characters with spaces. Raw string: the backslashes are regex
    # escapes, not Python string escapes (a plain literal triggers invalid-escape warnings).
    text_rmv = re.sub(r'[-=+,#/\?:^.@*\"※~ㆍ!』‘|\(\)\[\]`\'…》\”\“\’·]', ' ', text)
    # Collapse every whitespace run into a single space.
    new_str = ' '.join(text_rmv.split())
    # Wrap the text in a one-row DataFrame. Dataset needs a 'type' column, so a
    # placeholder label is supplied; it is never used for the prediction.
    test_data = pd.DataFrame({'posts': [new_str], 'type': ['INTP']})

    # Convert to the Dataset format the model's inputs are built from.
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Make sure dropout is disabled even if the caller passed a model in training mode.
    model.eval()

    with torch.no_grad():
        # One row in the dataset means exactly one batch; its label is ignored.
        test_input, _ = next(iter(test_dataloader))

        mask = test_input['attention_mask'].to(device)
        # Drop the extra axis left by stacking the per-sample tokenizer outputs.
        input_id = test_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)
        predicted = output.argmax(dim=1)

    return predicted

In [5]:

from googletrans import Translator

# Decide whether the input contains Korean: a single Hangul syllable is enough
# (the caller then translates the text to English).
def isKorean(input_s):
    """Return True if ``input_s`` contains at least one Hangul syllable ('가'–'힣').

    Standalone jamo such as 'ㅋ' fall outside that range and are not counted.
    An empty string yields False.
    """
    # Only the Hangul test decides the result, so no Latin-letter check is made:
    # ord(c.lower()) raises TypeError for characters whose lowercase form is more
    # than one code point (e.g. 'İ'). any() also stops at the first hit.
    first, last = ord('가'), ord('힣')
    return any(first <= ord(c) <= last for c in input_s)

# Translate text into English (the call names only the destination language).
def languageTrans(text):
    """Translate ``text`` to English using a newly created ``Translator``.

    Returns the ``.text`` attribute of the translation result.
    """
    result = Translator().translate(text, dest='en')
    return result.text


# MBTI type name -> class id (0..15). Restates the identical table defined in the
# tokenizer/Dataset cell earlier in this notebook.
labels = dict(zip(
    ("INFJ", "INTJ", "INFP", "INTP",
     "ENFJ", "ENTJ", "ENFP", "ENTP",
     "ISFJ", "ISTJ", "ISFP", "ISTP",
     "ESFJ", "ESTJ", "ESFP", "ESTP"),
    range(16),
))

INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181

In [9]:
# Ask the user for free-form text (the prompt string is Korean).
input_text = input('뭐든 입력해봐')

# Show how many characters were entered.
print(len(input_text))

# Disabled: loop that would keep re-prompting until at least 500 characters are entered.
# while True:
#     if(len(input_text) < 500):
#         input_text = input('에이~ 좀 더 길게 써봐')
#     else:
#         break

# Translate to English first when the text contains any Hangul syllable.
if isKorean(input_text):
    input_text = languageTrans(input_text)

# prediction() returns a tensor holding the predicted class index.
predicted = prediction(model, input_text)

# Map the class index back to its MBTI name; .get() yields None for an index not in resultLabels.
print(resultLabels.get(predicted.item()))

1101
ESTJ


In [25]:
# Load the texts to classify; the 'content' column is what later cells feed to prediction().
testdata_df = pd.read_csv('testdata.csv')

# Preview the first rows.
testdata_df.head()

Unnamed: 0.1,Unnamed: 0,content
0,1,State -based and instructor full -time employe...
1,2,1. In listen (current situation) \ n due to th...
2,4,"I like to write this article to the president,..."
3,15,Teachers' promotion system scores a variety of...
4,16,8.2 I will petition for the damage caused by t...


In [26]:
# Predicted MBTI name for every row of testdata_df['content'], in row order.
temp_lst = []

for content in testdata_df['content']:
    # One prediction() call (one model forward pass) per row.
    predicted = prediction(model, content)
    # Convert the class index to its MBTI name (None if the index is not in resultLabels).
    temp_lst.append(resultLabels.get(predicted.item()))

In [28]:
# Print every prediction (one list entry per row, so the output is long).
print(temp_lst)

# Attach the predictions as a new 'type' column. This mutates testdata_df in place,
# so the head() preview shown when the file was loaded no longer lists all columns.
testdata_df['type'] = temp_lst

['ISFP', 'ESTJ', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'ESTJ', 'ENTJ', 'INFP', 'INFP', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'ISTJ', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'ESTJ', 'INFP', 'INFP', 'ISFP', 'INFP', 'ENTJ', 'INFP', 'INFP', 'ESTJ', 'ESTJ', 'ESTJ', 'INFP', 'INFP', 'ENTJ', 'ESTJ', 'INFP', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'INFP', 'ESTJ', 'INFP', 'INFP', 'ISTJ', 'ISTJ', 'ENTJ', 'ISTJ', 'INFP', 'INFP', 'INFP', 'ISTJ', 'ISTJ', 'INFP', 'ESTJ', 'ISTJ', 'INFP', 'INFP', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'ISTJ', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'INFP', 'ISTJ', 'INTJ', 'INFP', 'INFP', 'INFP', 'ESTJ', 'ISTJ', 'INFP', 'ISTJ', 'INFP', 'INFP', 'INFP', 'INFP', 'ISTJ', 'ISTJ', 'ESTJ', 'ESTJ', 'INFP', 'INFP', 'ISTJ', 'ISTJ', 'INFP', 'ESTJ', 'INFP', 'INFP', 'INFP', 'INFP', 'ISTJ', 'ISTJ', 'ESTJ', 'INFP', 'ISTJ', 'ISTJ',

In [29]:
# Per predicted type, count the non-null values in each remaining column.
# Types that were never predicted do not appear in the result.
testdata_df.groupby(['type']).count()

Unnamed: 0_level_0,Unnamed: 0,content
type,Unnamed: 1_level_1,Unnamed: 2_level_1
ENFJ,11,11
ENFP,2,2
ENTJ,158,158
ESFP,8,8
ESTJ,184,184
ESTP,5,5
INFP,934,934
INTJ,7,7
ISFP,48,48
ISTJ,370,370


In [30]:
# Save the frame, including the new 'type' column, to CSV.
# NOTE(review): to_csv writes the index by default, which is presumably how the
# 'Unnamed: 0' column seen when loading testdata.csv came about — consider index=False
# after confirming what downstream readers expect.
testdata_df.to_csv('./data/csv/national petition_MBTI.csv')