In [1]:
import torch as torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import pandas as pd
import re
import random

import matplotlib.pyplot

from ipywidgets import FloatProgress
from IPython.display import display

In [2]:
data = pd.read_csv('mbti_preprocessed.csv')

all_characters = "abcdefghijklmnopqrstuvwxyz0123456789 ,.!?:()"
all_personality = ['INTJ', 'INTP', 'INFJ', 'INFP',
                   'ISTJ', 'ISTP', 'ISFJ', 'ISFP',
                   'ENTJ', 'ENTP', 'ENFJ', 'ENFP',
                   'ESTJ', 'ESTP', 'ESFJ', 'ESFP']

def clean_text(text):
    return re.sub("[^a-z0-9 ,\\.!?:()]", "", text.lower()) 
data['post'] = data['post'].apply(clean_text)

def filter_row(row):
    if row['post'].strip() == '':
        return False
    if all(x.isdigit() for x in row['post'].strip().split()):
        return False
    return True

data = data[data.apply(filter_row, axis=1)]
df = pd.DataFrame(data)
df

Unnamed: 0.1,Unnamed: 0,post,type
0,0,enfp and intj moments sportscenter not top ...,INFJ
1,1,what has been the most lifechanging experience...,INFJ
2,2,on repeat for most of today,INFJ
3,3,may the perc experience immerse you,INFJ
4,4,the last thing my infj friend posted on his fa...,INFJ
5,5,hello enfj7 sorry to hear of your distress its...,INFJ
7,7,welcome and stuff,INFJ
8,8,game set match,INFJ
9,9,prozac wellbrutin at least thirty minutes of m...,INFJ
10,10,basically come up with three items youve deter...,INFJ


In [3]:

def character_to_index(letter):
    ret = all_characters.find(letter)
    if ret == -1:
        raise Exception("Character not found {}".format(letter))
    return ret

def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, len(all_characters))
    for idx, character in enumerate(line):
        tensor[idx][0][character_to_index(character)] = 1
    return tensor.cuda()

def personality_to_index(personality):
    return all_personality.index(personality)

def personality_to_tensor(personality):
    return torch.LongTensor([personality_to_index(personality)]).cuda()


In [4]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
#         self.hidden_size = hidden_size
        
#         self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
#         self.i2o = nn.Linear(input_size + hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)
    
        self.l1 = nn.RNN(len(all_characters), 256, 1, nonlinearity='tanh', batch_first=True)
        self.l2 = nn.Linear(256, len(all_personality))
        self.softmax = nn.LogSoftmax(dim=0)
        
    def forward(self, input, hidden):
#         combined = torch.cat([input, hidden], 1)
#         hidden = self.i2h(combined)
#         output = self.i2o(combined)
#         output = self.softmax(output)
#         return output, hidden
        packed_output, hidden = self.l1(input, hidden)
        output = packed_output.data[packed_output.data.size()[0]-1]
        output = self.l2(output)
        output = self.softmax(output)
        return output.view(1,-1), hidden
    
    def init_hidden(self):
        #return Variable(torch.zeros(1, self.hidden_size).cuda())
        return Variable(torch.zeros(1, 1, 256).cuda()) #layers, batch, hidden_size
    


In [5]:
print("input vector size: {}, output vector size: {}".format(len(all_characters), len(all_personality)))

input vector size: 44, output vector size: 16


In [10]:
rnn = RNN(len(all_characters), 512, len(all_personality)).cuda()
criterion = nn.NLLLoss()
learning_rate = 0.0001

fp = FloatProgress(min=0, max=1)
display(fp)

def train(X, Y):
    hidden = rnn.init_hidden()
    rnn.zero_grad()
    
    seqlen = X.size()[0]
    packed = torch.nn.utils.rnn.pack_padded_sequence(X.view(1,seqlen,len(all_characters)), [seqlen], batch_first=True)
    y_hat, hidden = rnn(packed, hidden)
    
    #for i in range(X.size()[0]):
    #    y_hat, hidden = rnn(X[i].unsqueeze(0).cuda(), hidden)
    
    loss = criterion(y_hat, Y)
    loss.backward()
    
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)
    
    return y_hat, loss.data[0]

def feedforward(X):
    hidden = rnn.init_hidden()
    
    seqlen = X.size()[0]
    packed = torch.nn.utils.rnn.pack_padded_sequence(X.view(1,seqlen,len(all_characters)), [seqlen], batch_first=True)
    y_hat, hidden = rnn(packed, hidden)
    
    return y_hat

iter = 0
n_iters = 200000
all_loss = 0
print_every = 1000
correct_count = 0

train_data=data.sample(frac=0.8,random_state=200)
test_data=data.drop(train_data.index)

losses = []

for iter in range(1,n_iters+1):
    sample = train_data.sample(n=1)
    X = Variable(line_to_tensor(sample['post'].iloc[0])).cuda()
    Y = Variable(personality_to_tensor(sample['type'].iloc[0])).cuda()
    
    predicted, loss = train(X, Y)
    all_loss += loss
    maxval, argmax = predicted.max(1)
    
    is_correct = all_personality.index(sample['type'].iloc[0]) == argmax.data[0]
    if is_correct:
        correct_count += 1
        
    fp.value += 1.0 / print_every
    
    if iter % print_every == 0 and iter != 0:
        avg_loss = all_loss / print_every
        print("{}, {} avg: {} ({} / {}) ({} of {} correct)".format(iter, loss, avg_loss, all_personality[argmax.data[0]], sample['type'].iloc[0], correct_count, print_every))
        all_loss = 0
        correct_count = 0
        
        fp.value = 0.0
        

    
    

1000, 2.7004587650299072 avg: 2.765467352151871 (INFJ / INTJ) (110 of 1000 correct)
2000, 2.838440179824829 avg: 2.7446005721092224 (INFJ / INTP) (124 of 1000 correct)
3000, 2.7510931491851807 avg: 2.725553841352463 (INFJ / INTP) (127 of 1000 correct)
4000, 2.586085796356201 avg: 2.701226351737976 (INFP / INFJ) (147 of 1000 correct)
5000, 2.822556972503662 avg: 2.6854931354522704 (INFP / ENFP) (183 of 1000 correct)
6000, 2.7977817058563232 avg: 2.657362715244293 (INFP / ISTP) (197 of 1000 correct)
7000, 2.5024895668029785 avg: 2.634905413866043 (INFP / INFJ) (207 of 1000 correct)
8000, 3.0018069744110107 avg: 2.606390592098236 (INFP / ENTJ) (213 of 1000 correct)
9000, 2.2325925827026367 avg: 2.5704754209518432 (INFP / INFP) (230 of 1000 correct)
10000, 2.5511715412139893 avg: 2.551169369935989 (INFP / INTP) (211 of 1000 correct)
11000, 2.00080943107605 avg: 2.4940773149728774 (INFP / INFP) (221 of 1000 correct)
12000, 2.419696569442749 avg: 2.467173962831497 (INFP / INTP) (196 of 1000 

98000, 2.57570219039917 avg: 2.3130830299854277 (INFP / ENFP) (215 of 1000 correct)
99000, 5.194348335266113 avg: 2.257314260840416 (INFP / ESFP) (203 of 1000 correct)
100000, 1.579717993736267 avg: 2.3200736449956896 (INFP / INFP) (197 of 1000 correct)
101000, 1.7623885869979858 avg: 2.2661465101242064 (INFP / INFJ) (218 of 1000 correct)
102000, 2.637099504470825 avg: 2.2293430011272433 (INFP / ENFP) (228 of 1000 correct)
103000, 3.711519956588745 avg: 2.2962303885221482 (INFP / ISTJ) (203 of 1000 correct)
104000, 1.9166173934936523 avg: 2.280589068055153 (INFP / INTP) (212 of 1000 correct)
105000, 1.5670418739318848 avg: 2.2735647233724596 (INFP / INFP) (200 of 1000 correct)
106000, 1.8483115434646606 avg: 2.261469846367836 (INFP / INTP) (235 of 1000 correct)
107000, 1.4983508586883545 avg: 2.263433571577072 (INFP / INFP) (224 of 1000 correct)
108000, 1.9343881607055664 avg: 2.249053509712219 (INFP / INTP) (212 of 1000 correct)
109000, 1.5886085033416748 avg: 2.2555238811969756 (INFP

194000, 1.96638023853302 avg: 2.2614780131578445 (INFP / INTP) (217 of 1000 correct)
195000, 2.6370065212249756 avg: 2.2806281961202624 (INFP / ENFP) (205 of 1000 correct)
196000, 1.7006548643112183 avg: 2.2612854211330413 (INFP / INFJ) (211 of 1000 correct)
197000, 3.670851230621338 avg: 2.278805662512779 (INFP / ENTJ) (220 of 1000 correct)
198000, 1.4514758586883545 avg: 2.2496563099622726 (INFP / INFP) (244 of 1000 correct)
199000, 3.828968048095703 avg: 2.2956109367609026 (INFP / ENFJ) (202 of 1000 correct)
200000, 1.5018142461776733 avg: 2.3061864525079727 (INFP / INFP) (222 of 1000 correct)


In [8]:
display(fp)
def run_test():
    print("Beginning test - {}".format(len(test_data)))
    correct = 0
    total = 0
    fp.value = 0.0
    
    for i in range(len(test_data)):
        X = Variable(line_to_tensor(test_data['post'].iloc[i])).cuda()
        y_hat = feedforward(X)
        
        maxval, argmax = y_hat.max(1)
        is_correct = all_personality.index(test_data['type'].iloc[i]) == argmax.data[0]
        
        if is_correct:
            correct += 1
        
        total += 1
        fp.value += 1.0 / len(test_data)
    
    print("Accuracy {}/{} {}".format(correct, total, correct/total))

run_test()

Beginning test - 40893
Accuracy 8588/40893 0.21001149340963002
