In [51]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.feature_extraction.text import TfidfVectorizer
from torchtext.vocab import GloVe,vocab
from torchtext import data

AttributeError: module 'torch' has no attribute '_utils_internal'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/My Drive/Colab Notebooks/NLP

In [45]:
def convert_personality_type_to_binary(mbti_type):
    mapper = {
        'I':0,
        'E':1,
        'N':0,
        'S':1,
        'T':0,
        'F':1,
        'J':0,
        'P':1,
    }

    return [mapper[i] for i in mbti_type]

def convert_personality_type_to_int(mbti_type):
    types = [
                'INTJ', 'INTP', 'ISFJ', 'ISFP',
                'ISTJ', 'ISTP', 'ENFJ', 'ENFP',
                'ENTJ', 'ENTP','ESFJ', 'ESFP',
                'ESTJ', 'ESTP', 'INFJ', 'INFP'
            ]
    ints = [i for i in range(len(types))]
    mapper = dict(zip(types, ints))

    return mapper[mbti_type]

In [46]:
class MBTIDataset(Dataset):
    def __init__(self, data_path, vectorizing_method = None, binary_outputs = False, max_seq_len=500):
        """
        Vectorizing methods:
        None - returns raw text
        basic - basic builtin pytorch vectorizer
        TfIdf - tf-idf vectorizer
        GloVe - Global Vectors pretrained embedding
        """
        self.df = pd.read_csv(data_path)
        self.vectorizing_method = vectorizing_method
        self.max_seq_len = max_seq_len
        self.split_dataframe(self.max_seq_len)

        if vectorizing_method:
            if vectorizing_method.lower == 'basic':
                self.tokenizer = get_tokenizer('basic_english')
                self.vocab = build_vocab_from_iterator(self.yield_tokens_from_dataframe(), specials=['<unk>'])
                self.vocab.set_default_index(self.vocab["<unk>"])

            if vectorizing_method.lower == 'tfidf' or vectorizing_method.lower == 'tf-idf':
                self.tokenizer = TfidfVectorizer(stop_words= 'english')
                self.vocab = self.tokenizer.fit_transform(self.df['posts']) # Sparse matrix representation - could use different field names
            if vectorizing_method.lower == 'glove':
                unk_index = 0
                self.global_vectors = GloVe(name='6B', dim=50)
                self.vocab=vocab(self.global_vectors.stoi)
                self.vocab.insert_token("<unk>",unk_index)
                self.vocab.set_default_index(unk_index)

                self.pretrained_embeddings = self.global_vectors.vectors
                self.pretrained_embeddings = torch.cat((torch.zeros(1,self.pretrained_embeddings.shape[1]),self.pretrained_embeddings))


        self.binary_outputs = binary_outputs
        if binary_outputs:
            self.df['type'] = self.df['type'].apply(convert_personality_type_to_binary)
        else:
            self.df['type'] = self.df['type'].apply(convert_personality_type_to_int)

    def yield_tokens_from_dataframe(self):
        for post in self.df['posts']:
            yield self.tokenizer(post)

    def split_dataframe(self, new_seq_len):
        new_df = pd.DataFrame(columns=self.df.columns)
        new_posts = []
        new_types = []
        for idx, row in self.df.iterrows():
            split_posts = row['posts'].split(' ')
            i = 0
            while i < len(split_posts):
                new_posts.append((' ').join(split_posts[i:i+new_seq_len]))
                new_types.append(row['type'])
                i += new_seq_len

        new_df['posts'] = new_posts
        new_df['type'] = new_types
        self.df = new_df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if idx >= len(self): raise IndexError

        if not self.vectorizing_method :
            return self.df['posts'][idx], self.df['type'][idx]  #Return raw text

        input_text = self.vocab(self.df['posts'][idx].split(' '))
        if len(input_text) < self.max_seq_len:
            input_text.extend([0] * (self.max_seq_len-len(input_text)))
        label = self.df['type'][idx]

        return input_text[0:self.max_seq_len], label

In [50]:
PATH =  '/content/drive/MyDrive/Colab Notebooks/NLP/MBTI 500.csv'##'data/MBTI 500.csv'
ds = MBTIDataset(PATH, 'GloVe', binary_outputs=False, max_seq_len=50)
train_set_size = int(len(ds)*0.7)
val_set_size = int(len(ds)*0.2)
test_set_size = len(ds) - train_set_size - val_set_size
train_ds, val_ds, test_ds = random_split(ds, [train_set_size, val_set_size, test_set_size])

1073461


In [None]:
train_dataloader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=True)
i, l = iter(train_dataloader).__next__()
print(i)
print(l)

In [None]:
print(ds.vocab["lol"])
print(ds.pretrained_embeddings[73048,:])

In [71]:
print(ds.vocab["lol"])
print(ds.pretrained_embeddings[73048,:])

In [None]:
train_dataloader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=True)
i, l = iter(train_dataloader).__next__()
print(i)
print(l)

[tensor([ 354,  242,   50,  563,   24,  476, 3171,    1,    9, 3949,  547,  308,
        1711,   38,  235,  134]), tensor([ 730,  259,   10,   20,  360,  282,  238,   48, 1162, 1807,   20,   93,
        5085, 5920, 1169,  193]), tensor([ 342,  961, 1916,   12, 4217,    2, 2809,    1,   88,  143,  369,    9,
         257,  844, 1152,   72]), tensor([ 1097,  7246,  2165,  2582, 41133,   690,   631,    90,   357,   254,
         1267,   269,   196,    18,     8,   237]), tensor([    2,   233,    48,   432,    21,    21,   531,     1, 14221, 11919,
         2274,     4,   224,  2123,    19,   358]), tensor([   656,      3,    877,    513,   1776,   5996,     10,     89,    106,
        172910,     75,    162,   1929,  13442,   1036,      1]), tensor([ 741, 5529, 3950, 1220, 1795,   87, 1438,  220,  121, 4320,   52,    4,
         144,  103,  284,   18]), tensor([5395,    3,  232,  432,  416, 2817, 1655, 4439, 2074,   14,  209,  108,
         140,  460,    4, 1393]), tensor([   69,    73,  

In [82]:
print(ds.vocab["lol"])
print(ds.pretrained_embeddings[73048,:])

73048
tensor([ 0.2824,  0.0689, -0.1381,  0.2578, -0.5909,  0.1503,  0.5107, -0.3674,
        -0.0866,  0.1403,  0.8691,  0.4609,  0.3474,  0.3024, -0.6127,  0.1721,
        -0.7481,  0.8336, -0.3853,  0.1543,  0.2961, -0.3778, -0.3465,  0.0034,
        -0.3395,  0.3723,  0.1311, -0.2882, -0.9532, -0.0277, -0.3494,  0.4386,
         0.2196,  0.6655,  0.0415, -0.1712,  0.2894, -0.1587, -0.0215,  0.3617,
         0.6830, -0.1733, -0.0644,  0.5270, -0.5397, -0.2778,  0.0824,  0.5076,
        -0.0374, -0.2469])


In [38]:
df = pd.read_csv('data/MBTI 500.csv')

In [40]:
df = splitter(df, 200)

In [44]:
len(df['posts'][2].split(' '))

101

In [23]:
df

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ
...,...,...
106062,stay frustrate world life want take long nap w...,INFP
106063,fizzle around time mention sure mistake thing ...,INFP
106064,schedule modify hey w intp strong wing underst...,INFP
106065,enfj since january busy schedule able spend li...,INFP


In [42]:
df


Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,little aggressive always walk away flatter lik...,INTJ
2,individual farmer individual farmer corporatio...,INTJ
3,rap music ehh opp yeah know valid well know fa...,INTJ
4,back claim get back liam nesson realization co...,INTJ
...,...,...
318196,hear know really complicate wish seem way mind...,INFP
318197,seem change one bite must say good back somewh...,INFP
318198,feel like men good problem tell parent want te...,INFP
318199,fine ad make website know sometimes need recco...,INFP
