In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

# Êï∞ÊçÆÈ¢ÑÂ§ÑÁêÜÁ±ª
class MBTIDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # ÊñáÊú¨Â∫èÂàóÂåñ
        sequence = self.tokenizer.texts_to_sequences([text])
        padded_sequence = pad_sequences(sequence, maxlen=self.max_len, padding='post')[0]

        return {
            'text': torch.tensor(padded_sequence, dtype=torch.long),
            'label': torch.tensor(label, dtype=torch.long)
        }

# LSTMÊ®°ÂûãÊû∂ÊûÑ
class MBTILSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)

        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim

        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(lstm_output_dim, lstm_output_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(lstm_output_dim//2, output_dim)
        )

    def forward(self, text):
        embedded = self.embedding(text)

        # LSTMÂ±Ç
        output, (hidden, cell) = self.lstm(embedded)

        # Â§ÑÁêÜÂèåÂêëËæìÂá∫
        if self.lstm.bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden = hidden[-1]

        return self.fc(hidden)



In [41]:
# Ë∂ÖÂèÇÊï∞ÈÖçÁΩÆ
MAX_WORDS = 10000
MAX_LEN = 500
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 16  # 16ÁßçMBTIÁ±ªÂûã
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
BATCH_SIZE = 64
EPOCHS = 15
LEARNING_RATE = 0.001



In [32]:
#ÂèØ‰ª•Êç¢‰∏Ä‰∏™Tokenizer
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Êï∞ÊçÆÂáÜÂ§á

df = pd.read_csv('MBTI 500.csv')
label_encoder = LabelEncoder()
label_encoder.classes_ = [
    'INTP', 'ENTP', 'INFJ', 'ENFJ', 'INTJ', 'ENTJ', 'INFP', 'ENFP',
    'ISTJ', 'ESTJ', 'ISFJ', 'ESFJ', 'ISTP', 'ESTP', 'ISFP', 'ESFP'
]  # MBTI Á±ªÂûã
texts = df['posts'].values
labels = label_encoder.fit_transform(df['type'].values)

# ÂàõÂª∫Tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts)


# ‚úÖ ‰øùÂ≠ò LabelEncoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# ‚úÖ ÊûÑÂª∫Âπ∂ËÆ≠ÁªÉ Tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# ‚úÖ ‰øùÂ≠ò Tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# ÂàíÂàÜÊï∞ÊçÆÈõÜ
X_train, X_val, y_train, y_val = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# ÂàõÂª∫DataLoader
train_dataset = MBTIDataset(X_train, y_train, tokenizer, MAX_LEN)
val_dataset = MBTIDataset(X_val, y_val, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

#whole dataset
X_test,y_test =  texts, labels
test_dataset = MBTIDataset(X_test, y_test,  tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)


In [34]:
# # ‚úÖ ‰øùÂ≠ò Tokenizer
# with open("tokenizer.pkl", "wb") as f:
#     pickle.dump(tokenizer, f)

In [24]:
print(set(y_train))


{np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15)}


In [42]:
# ÂàùÂßãÂåñÊ®°Âûã
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MBTILSTM(MAX_WORDS, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                N_LAYERS, BIDIRECTIONAL, DROPOUT).to(device)

# ÂÆö‰πâ‰ºòÂåñÂô®ÂíåÊçüÂ§±ÂáΩÊï∞
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [43]:
EPOCHS=5

In [44]:
# ËÆ≠ÁªÉÂæ™ÁéØ
best_val_acc = 0
for epoch in range(EPOCHS):
    # ËÆ≠ÁªÉÈò∂ÊÆµ
    model.train()
    train_loss, train_acc = 0, 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        texts = batch['text'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(texts)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels).sum().item()

    # È™åËØÅÈò∂ÊÆµ
    model.eval()
    val_loss, val_acc = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            texts = batch['text'].to(device)
            labels = batch['label'].to(device)

            outputs = model(texts)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            val_acc += (outputs.argmax(1) == labels).sum().item()

    # ËÆ°ÁÆóÊåáÊ†á
    train_loss /= len(train_loader)
    train_acc /= len(train_dataset)
    val_loss /= len(val_loader)
    val_acc /= len(val_dataset)

    # ‰øùÂ≠òÊúÄ‰Ω≥Ê®°Âûã
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pth')

    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%")
    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc*100:.2f}%")

# ÊµãËØïÊ®°Âûã
model.load_state_dict(torch.load('best_model.pth'))
model.eval()
test_acc = 0
with torch.no_grad():
    for batch in val_loader:
        texts = batch['text'].to(device)
        labels = batch['label'].to(device)

        outputs = model(texts)
        test_acc += (outputs.argmax(1) == labels).sum().item()

test_acc /= len(val_dataset)
print(f'Final Test Accuracy: {test_acc*100:.2f}%')

Epoch 1/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1326/1326 [04:30<00:00,  4.91it/s]


Train Loss: 1.9620 | Train Acc: 26.85%
Val Loss: 1.6701 | Val Acc: 35.83%


Epoch 2/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1326/1326 [04:30<00:00,  4.91it/s]


Train Loss: 1.0929 | Train Acc: 64.04%
Val Loss: 0.7595 | Val Acc: 77.59%


Epoch 3/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1326/1326 [04:30<00:00,  4.90it/s]


Train Loss: 0.6616 | Train Acc: 80.42%
Val Loss: 0.6446 | Val Acc: 80.61%


Epoch 4/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1326/1326 [04:30<00:00,  4.90it/s]


Train Loss: 0.5179 | Train Acc: 84.49%
Val Loss: 0.5744 | Val Acc: 82.62%


Epoch 5/5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1326/1326 [04:30<00:00,  4.90it/s]


Train Loss: 0.3927 | Train Acc: 88.15%
Val Loss: 0.5848 | Val Acc: 82.97%
Final Test Accuracy: 82.97%


In [45]:
# ÊµãËØïÊï¥‰∏™Êï∞ÊçÆÈõÜ
model.load_state_dict(torch.load('best_model.pth'))
model.eval()
test_acc = 0
with torch.no_grad():
    for batch in test_loader:
        texts = batch['text'].to(device)
        labels = batch['label'].to(device)

        outputs = model(texts)
        test_acc += (outputs.argmax(1) == labels).sum().item()

test_acc /= len(test_dataset)
print(f' Final Test Accuracy: {test_acc*100:.2f}%')

 Final Test Accuracy: 91.06%


In [47]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import numpy as np

# Êî∂ÈõÜÈ¢ÑÊµãÂíåÁúüÂÆûÊ†áÁ≠æ
all_preds = []
all_labels = []

# Âä†ËΩΩÊ®°Âûã
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

with torch.no_grad():
    for batch in val_loader:
        texts = batch['text'].to(device)
        labels = batch['label'].to(device)
        outputs = model(texts)
        preds = outputs.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ËΩ¨Êàê numpy array
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# ‰ªÖËé∑ÂèñÂÆûÈôÖÂá∫Áé∞ÁöÑÁ±ªÂà´
unique_labels = np.unique(np.concatenate((all_preds, all_labels)))
target_names = label_encoder.inverse_transform(unique_labels)

# ÊâìÂç∞ÂàÜÁ±ªÊä•Âëä
print("Classification Report:")
print(classification_report(all_labels, all_preds, labels=unique_labels, target_names=target_names))

# ÊâìÂç∞Ê∑∑Ê∑ÜÁü©Èòµ
print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds, labels=unique_labels))

# ÂÆè F1 ÂàÜÊï∞
macro_f1 = f1_score(all_labels, all_preds, average='macro')
print(f"Macro F1 Score: {macro_f1:.4f}")

# Âä†ÊùÉ F1 ÂàÜÊï∞
weighted_f1 = f1_score(all_labels, all_preds, average='weighted')
print(f"Weighted F1 Score: {weighted_f1:.4f}")


Classification Report:
              precision    recall  f1-score   support

        ENFJ       0.73      0.68      0.70       319
        ENFP       0.88      0.77      0.82      1249
        ENTJ       0.87      0.80      0.83       577
        ENTP       0.85      0.80      0.82      2324
        ESFJ       0.00      0.00      0.00        33
        ESFP       0.75      0.53      0.62        75
        ESTJ       0.92      0.75      0.83       105
        ESTP       0.90      0.88      0.89       398
        INFJ       0.77      0.88      0.82      2954
        INFP       0.81      0.83      0.82      2391
        INTJ       0.85      0.84      0.85      4531
        INTP       0.84      0.87      0.85      5033
        ISFJ       0.65      0.55      0.59       132
        ISFP       0.79      0.55      0.64       161
        ISTJ       0.79      0.71      0.75       253
        ISTP       0.82      0.75      0.78       679

    accuracy                           0.83     21214
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
import torch
import numpy as np
import ipywidgets as widgets
from IPython.display import display
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

# ‚úÖ ËÆæÂÆöËÆæÂ§á
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ‚úÖ Âä†ËΩΩÊ®°Âûã
try:
    model.load_state_dict(torch.load('best_model.pth', map_location=device))
    model.to(device)
    model.eval()
except Exception as e:
    print(f"‚ùå Ê®°ÂûãÂä†ËΩΩÂ§±Ë¥•: {e}")

# ‚úÖ Âä†ËΩΩ tokenizer Âíå label encoder
try:
    with open('tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)

    with open('label_encoder.pkl', 'rb') as f:
        label_encoder = pickle.load(f)
except Exception as e:
    print(f"‚ùå Tokenizer Êàñ LabelEncoder Âä†ËΩΩÂ§±Ë¥•: {e}")

# ‚úÖ ÂíåËÆ≠ÁªÉÊó∂‰øùÊåÅ‰∏ÄËá¥ÁöÑÊúÄÂ§ßÈïøÂ∫¶
MAX_LEN = 100

# üîπ ÂàõÂª∫ËæìÂÖ•Ê°Ü
text_input = widgets.Text(
    value='',
    placeholder='ËØ∑ËæìÂÖ•ÊñáÊú¨...',
    description='ËæìÂÖ•:',
    layout=widgets.Layout(width='400px')
)

# üîπ ÂàõÂª∫ÊåâÈíÆÂíåËæìÂá∫Âå∫
predict_button = widgets.Button(description="È¢ÑÊµã")
exit_button = widgets.Button(description="ÈÄÄÂá∫", button_style='danger')
output = widgets.Output()

# üîπ È¢ÑÊµãÂáΩÊï∞
def predict_mbti(b):
    with output:
        output.clear_output()
        user_input = text_input.value.strip()

        if not user_input:
            print("‚ö†Ô∏è ËØ∑ËæìÂÖ•ÊñáÊú¨ÂÜÖÂÆπÔºÅ")
            return

        try:
            # ÊñáÊú¨È¢ÑÂ§ÑÁêÜ
            sequence = tokenizer.texts_to_sequences([user_input])
            padded_sequence = pad_sequences(sequence, maxlen=MAX_LEN, padding='post')
            texts = torch.tensor(padded_sequence, dtype=torch.long).to(device)

            # Ê®°ÂûãÈ¢ÑÊµã
            with torch.no_grad():
                outputs = model(texts)
                predicted_label = outputs.argmax(1).item()
                predicted_mbti = label_encoder.inverse_transform([predicted_label])[0]

            # ÊòæÁ§∫ÁªìÊûú
            print("üß† Ê®°ÂûãÈ¢ÑÊµãÁªìÊûúÔºö")
            print(f"üëâ ‰Ω†ÁöÑ MBTI Á±ªÂûãÊòØÔºö**{predicted_mbti}**")
        except Exception as e:
            print(f"‚ùå È¢ÑÊµãÊó∂Âá∫Èîô: {e}")

# üîπ Ê®°ÊãüÈÄÄÂá∫ÊåâÈíÆÔºàÈÄÇÈÖç notebookÔºå‰∏ç‰ΩøÁî® sys.exitÔºâ
def exit_notebook(b):
    with output:
        output.clear_output()
        print("üìå Á®ãÂ∫èÂ∑≤ÈÄÄÂá∫ÔºåËØ∑ÊâãÂä®ÂÖ≥Èó≠ÊàñÂÅúÊ≠¢ notebook„ÄÇ")

# üîπ ÁªëÂÆöÊåâÈíÆ‰∫ã‰ª∂
predict_button.on_click(predict_mbti)
exit_button.on_click(exit_notebook)

# üîπ ÊòæÁ§∫ÁïåÈù¢
display(text_input, predict_button, exit_button, output)


Text(value='', description='ËæìÂÖ•:', layout=Layout(width='400px'), placeholder='ËØ∑ËæìÂÖ•ÊñáÊú¨...')

Button(description='È¢ÑÊµã', style=ButtonStyle())

Button(button_style='danger', description='ÈÄÄÂá∫', style=ButtonStyle())

Output()