In [6]:
import pandas as pd

# Alternative source on the mounted Drive, kept for reference:
# df = pd.read_csv('/content/drive/MyDrive/PlayData_DataEngine/DL_MiniProject/MBTI 500.csv')
# Load the posts dataset from the repository-relative data folder.
df = pd.read_csv('../data/csv/MBTI_min_1000.csv')

# Number of rows per MBTI type (the 'type' column holds the class label).
df['type'].value_counts()

INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: type, dtype: int64

In [42]:
df['type'].value_counts()

# One frame per MBTI type of interest.
# The ESFP frame used to be filtered on 'ESFJ', so its contents did not match
# its name; the filter now selects the type the variable is named after.
ESFP_df = df[df['type'] == 'ESFP']

ISTJ_df = df[df['type'] == 'ISTJ']


In [43]:
def refunc(g):
    """Sort group ``g`` by 'type' (descending) and keep positions 1300-1999."""
    ordered = g.sort_values(by='type', ascending=False)
    return ordered[1300:2000]


# Apply the positional slice to every 'type' group of the ISTJ frame; the group
# key becomes the outer level of the resulting index.
ISTJ_df = ISTJ_df.groupby(['type']).apply(refunc)

# Summarise what is left after slicing.
ISTJ_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 243 entries, ('ISTJ', 49409) to ('ISTJ', 50155)
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   posts   243 non-null    object
 1   type    243 non-null    object
dtypes: object(2)
memory usage: 14.7+ KB


In [7]:
# Cap every MBTI type at its first MAX_POSTS_PER_TYPE rows to limit class imbalance.
#
# groupby(...).head(n) keeps the original flat index and leaves 'type' as a
# plain column, so this cell can be re-run safely. The earlier
# groupby().apply(...) version added 'type' as an index level on top of the
# column, which makes a second df.groupby('type') ambiguous, and its per-group
# sort on 'type' only ever compared equal values.
MAX_POSTS_PER_TYPE = 1000
df = df.groupby('type').head(MAX_POSTS_PER_TYPE)

df['type'].value_counts()

ENFP    700
ENTJ    700
ENTP    700
INFJ    700
INFP    700
INTJ    700
INTP    700
ISTP    700
ESTP    686
ENFJ    234
Name: type, dtype: int64

In [3]:
import torch
import numpy as np
from transformers import BertTokenizer

# Module-level tokenizer used by Dataset.__init__ below; it is loaded from the
# same 'bert-large-cased' checkpoint name that BertClassifier loads its encoder from.
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

# MBTI types in class-id order: the integer label of a type is its position here.
_MBTI_TYPES = (
    "INFJ", "INTJ", "INFP", "INTP", "ENFJ", "ENTJ", "ENFP", "ENTP",
    "ISFJ", "ISTJ", "ISFP", "ISTP", "ESFJ", "ESTJ", "ESFP", "ESTP",
)

# Mapping from MBTI type string to integer class id (0-15).
labels = {mbti_type: class_id for class_id, mbti_type in enumerate(_MBTI_TYPES)}

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized posts with integer MBTI class ids.

    Every post is tokenized once, up front, in ``__init__`` with the
    module-level ``tokenizer``; class ids come from the module-level
    ``labels`` mapping.
    """

    def __init__(self, df):
        """Encode the 'type' and 'posts' columns of ``df``.

        Args:
            df: frame-like object with a 'type' column (keys of ``labels``)
                and a 'posts' column (text handed to ``tokenizer``).
        """
        class_ids = []
        for mbti_type in df['type']:
            class_ids.append(labels[mbti_type])
        self.labels = class_ids

        # Each post is padded/truncated to 512 tokens and returned as PyTorch tensors.
        encodings = []
        for post in df['posts']:
            encoded = tokenizer(
                post,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encodings.append(encoded)
        self.texts = encodings

    def classes(self):
        """Return the list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the class id(s) stored at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenized_text, label_array)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [4]:
# Shuffle once with a fixed seed so the 80/10/10 train/validation/test split is
# reproducible across kernel restarts, then cut the shuffled frame by position.
# Plain iloc slicing replaces np.split, which on a DataFrame goes through
# DataFrame.swapaxes (deprecated in recent pandas releases).
SPLIT_SEED = 112
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
train_end, val_end = int(.8 * len(df)), int(.9 * len(df))

df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

256 32 32


In [5]:
# Smoke test: tokenize the held-out split and look at its first sample.
test = Dataset(df_test)

# Prints the (tokenizer output, label array) tuple returned by Dataset.__getitem__.
print(test[0])

({'input_ids': tensor([[  101,  1267,  3748,  2482,  2480,  1176, 11484,  3342,  2373,  2030,
           188,  2430, 27989,  6602,  7328,  2816,  1266,  6486,  8110,  1508,
          2256,  1518,  2621, 22233, 23055,  6486,  1297,  1285,  1263,  5363,
         17088,  6486,  1655,  1815,  1402,  1256,  2398,  2052, 10243,  1253,
          4161,  6486,  1700,  1567,  1920,  1660,   190,  1159,  2367,  1159,
          1736, 23423, 23423,  4161, 18029,  5113,  1928,  1541,  5113,  1587,
          2222,  1243, 10598,  1435,  1511,  2367,  2304,  2367,  2820,  1541,
          1631,  2488,  1385,  1536,  1294,  6223,  2541,  3857,  2828,  1800,
          1631,   175,  3984, 19687,  1221,  2488,  1662,  1838,  2121,  1579,
          1631,  1176,  2255, 19687,  3771,  1309,  5403,  2191,  1474,  1800,
          2330,  2255,   176,  4867,  1849,   194,  2149, 22593,  8341, 15276,
          7210,  1601,  2552,  1463,  3253,  2303,  6153,  1309,  2486,  2303,
          6153,  1712,  8264,  2486, 

In [8]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: dropout probability applied to BERT's pooled output.
        model_name: name or path of the pretrained checkpoint passed to
            ``BertModel.from_pretrained``.
        num_classes: size of the output layer (number of target classes).
    """

    def __init__(self, dropout=0.5, model_name='bert-large-cased', num_classes=16):

        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding
        # 1024, so other BERT checkpoints work without editing this class.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return one score per class for every sequence in the batch.

        Args:
            input_id: token ids, forwarded to BERT as ``input_ids``.
            mask: attention mask, forwarded to BERT as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; the second
        # element is the pooled output used for classification.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to 0 before they reach
        # CrossEntropyLoss. It is kept so previously saved models behave the
        # same; consider dropping it when retraining from scratch.
        final_layer = self.relu(linear_output)

        return final_layer

In [None]:
# Tokenize the training split up front (Dataset encodes every post in __init__).
# NOTE(review): the name `train` is rebound further down by `def train(...)`,
# so cells that use this dataset depend on execution order.
train = Dataset(df_train)

In [17]:
# Serve the training dataset in shuffled batches of two samples.
train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)

# Prints the DataLoader object itself, not its batches; iterate over it to see data.
print(train_dataloader)

TypeError: 'DataLoader' object is not subscriptable

In [None]:
# Show the first (tokenizer output, label) pair of the training dataset.
print(train[0])

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs, checkpoint_path=None):
    """Fine-tune ``model`` and print train/validation metrics after every epoch.

    Args:
        model: classifier called as ``model(input_id, mask)`` that returns one
            score per class for each sequence.
        train_data: frame with 'posts' and 'type' columns used for training.
        val_data: frame with the same columns used for validation.
        learning_rate: learning rate handed to Adam.
        epochs: number of passes over ``train_data``.
        checkpoint_path: optional destination file. When given, a checkpoint
            dict (epoch, model state, optimizer state, summed training loss)
            is written there after every epoch, overwriting the previous one.
            The earlier version called ``torch.save`` without any destination,
            which raised ``TypeError`` at the end of the first epoch.
    """
    # Local names differ from the function name on purpose: they no longer
    # shadow `train` inside its own body.
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        # Enable dropout for the optimisation pass; validation below turns it off.
        model.train()

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            # Each sample was tokenized with return_tensors="pt", so its tensors
            # carry a leading axis of size 1; drop it from both inputs.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            # CrossEntropyLoss averages over the batch; weight by the batch size
            # so dividing by the sample count below gives a per-sample mean.
            total_loss_train += batch_loss.item() * train_label.size(0)

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        # Disable dropout so validation metrics are deterministic.
        model.eval()

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item() * val_label.size(0)

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        # The leading whitespace of the continuation lines is part of the
        # printed string, so it is kept exactly as it was.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

        if checkpoint_path is not None:
            torch.save({'epoch' : epoch_num, 'model_state_dict' : model.state_dict(),
                        'optimizer_state_dict' : optimizer.state_dict(), 'loss' : total_loss_train},
                       checkpoint_path)

# Hyper-parameters for this run.
LR = 1e-6
EPOCHS = 1

# Fresh classifier, fine-tuned on the train split and validated on the val split.
model = BertClassifier()
train(model, train_data=df_train, val_data=df_val, learning_rate=LR, epochs=EPOCHS)

# Persist the whole model object (not just its state_dict).
# NOTE(review): the file name says "epoch15" while EPOCHS is 1 — confirm the name.
torch.save(model, '/content/drive/MyDrive/PlayData_DataEngine/DL_MiniProject/model_sample1000_epoch15.pt')

In [10]:
# Load a previously saved model object onto the CPU and switch it to inference
# mode in one expression (nn.Module.eval() returns the module itself).
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model = torch.load(
    './data/model/largemodel_sample1000_epoch12.pt',
    map_location=torch.device('cpu'),
).eval()

def evaluate(model, test_data):
    """Measure the classification accuracy of ``model`` on ``test_data``.

    The accuracy is printed and also returned, so callers can store or
    compare it.

    Args:
        model: classifier called as ``model(input_id, mask)`` that returns one
            score per class for each sequence.
        test_data: frame with 'posts' and 'type' columns.

    Returns:
        Fraction of rows in ``test_data`` whose predicted class matches the label.
    """
    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    # Evaluate with dropout disabled regardless of the mode the caller left the
    # model in, and hand the model back in that same mode afterwards.
    was_training = model.training
    model.eval()

    total_acc_test = 0
    try:
        with torch.no_grad():

            for test_input, test_label in test_dataloader:

                test_label = test_label.to(device)
                # Each sample's tensors carry a leading axis of size 1 from
                # tokenization; drop it from both inputs.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                acc = (output.argmax(dim=1) == test_label).sum().item()
                total_acc_test += acc
    finally:
        model.train(was_training)

    accuracy = total_acc_test / len(test_data)
    print(f'Test Accuracy: {accuracy: .3f}')
    return accuracy

# Evaluation on the held-out split only, currently disabled:
# evaluate(model, df_test)

# NOTE(review): `df` is the frame the train/val/test splits are drawn from, so
# this score presumably includes rows the model was trained on — confirm that
# evaluating on the full frame is intended.
evaluate(model, df)


KeyboardInterrupt: 

In [None]:
# Dataset location on the mounted Drive, kept for reference:
# /content/drive/MyDrive/PlayData_DataEngine/DL_MiniProject/MBTI 500.csv

# Save the whole model object (not just its state_dict) to the project folder.
torch.save(model, '/content/drive/MyDrive/PlayData_DataEngine/DL_MiniProject/model_sample360_epoch12.pt')