In [None]:
import os
import re
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from nltk.tokenize import word_tokenize

### Function to save and load files

In [None]:
def save_file(name, obj):
    with open(name, 'wb') as f:
        pickle.dump(obj, f)

def load_file(name):
    return pickle.load(open(name, 'rb'))

## Data Processing

In [None]:
tokens_path = 'Output/tokens.pkl'
file_path = 'Input/complaints.csv'
col_name = 'Consumer complaint narrative'

In [None]:
data = pd.read_csv('../2. MultiClass classification/Input/complaints.csv')

In [None]:
data.shape

### Drop missing values

In [None]:
data.dropna(subset=[col_name], inplace=True)

In [None]:
data.shape

In [None]:
input_text = data[col_name]

In [None]:
input_text[1]

### Convert text to lower case

In [None]:
input_text = [i.lower() for i in tqdm(input_text)]

In [None]:
input_text[0]

### Remove punctuation except apostrophe
apostrophe has some meaning so we don't remove it

In [None]:
input_text = [re.sub(r"[^\w\d'\s]+", " ", i) for i in tqdm(input_text)]

In [None]:
input_text[0]

### Remove digits

In [None]:
input_text = [re.sub('\d+', '', i) for i in tqdm(input_text)]

In [None]:
input_text[0]

#### Remove 'xxxx' in text

In [None]:
input_text = [re.sub(r'[x]{2,}', '', i) for i in tqdm(input_text)]

In [None]:
input_text[1]

### Remove additional spaces

In [None]:
input_text = [re.sub(' +', ' ', i) for i in tqdm(input_text)]

In [None]:
input_text[1]

### Tokenize the text

In [None]:
tokens = [word_tokenize(t) for t in tqdm(input_text[:100])]

### Save tokens

In [None]:
save_file(tokens_path, tokens)

## Data loader

In [None]:
k = 10 # negative samples
t = 1e-5 # tensors
context_window = 5

In [None]:
sample = [None] * context_window + tokens[0] + [None] * context_window

In [None]:
len(sample)

In [None]:
for i in range(context_window, len(sample) - context_window):
    print(sample[i], sample[i-context_window:i] + 
          sample[i+1:i+context_window+1])

In [None]:
counter = Counter([t for d in tqdm(tokens) for t in d])
len(counter)

In [None]:
class SkipGramDataset(torch.utils.data.Dataset):
    def __init__(self, input_data, context_window=5, out_path='Output', t=1e-5, k=10):
        self.k = k
        self.context_window = context_window
        counter = Counter([t for d in tqdm(tokens) for t in d])
        self.vocab_count = len(counter)
        print(f'Unique words in the corpus: {self.vocab_count}')
        print('Creating data samples...')
        self.samples = self.positive_samples(input_data)
        word2idx = dict()
        idx2word = dict()
        sampling_prob = []
        print("Generating vocabulary...")
        for i, c in enumerate(counter.most_common(len(counter))):
            word2idx[c[0]] = i
            idx2word[i] = c[0]
            sampling_prob.append(c[1])
        self.word2idx = word2idx
        self.idx2word = idx2word
        print('Calculating sampling probabilities...')
        sampling_prob = np.sqrt(t/np.array(sampling_prob))
        sampling_prob = sampling_prob / np.sum(sampling_prob)
        self.sampling_prob = sampling_prob
        print("Saving files...")
        self.save_files(out_path)
        
    def __len__(self):
        return self.samples.shape[0]
    
    def __getitem__(self, idx):
        neg_words = self.negative_samples()
        center_word = self.word2idx[self.samples.loc[idx, "center_word"]]
        context_word = self.word2idx[self.samples.loc[idx, "context_word"]]
        return torch.tensor(center_word), torch.tensor([context_word]+neg_words)
    
    def positive_samples(self, input_data):
        samples = []
        cw = self.context_window
        for data in tqdm(input_data):
            text = [None] * cw + data + [None] * cw
            for i in range(cw, len(text) - cw):
                samples.append((text[i], text[i-cw:i]+ text[i+1:i+cw+1]))
        samples = pd.DataFrame(samples, columns=['center_word', 'context_word'])
        samples = samples.explode('context_word')
        samples.dropna(inplace=True)
        samples.reset_index(drop=True, inplace=True)
        return samples
    
    def negative_samples(self):
        neg_words = list(np.random.choice(np.arange(self.vocab_count), self.k, p=self.sampling_prob))

        return neg_words
    
    def save_files(self, out_path="Output"):
        save_file(os.path.join(out_path, 'word2idx.pkl'), self.word2idx)
        save_file(os.path.join(out_path, 'idx2word.pkl'), self.idx2word)

## Skip-Gram Model

In [None]:
embedding_size = 64

In [126]:
class SkipGram(nn.Module):
    def __init__(self, vocab_len, embedding_size=64):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_len, embedding_size)
        self.weights = torch.empty(embedding_size, vocab_len, requires_grad=True).type(
            torch.FloatTensor
        )
        _ = torch.nn.init.normal_(self.weights)
        self.out = nn.LogSigmoid()
    
    def forward(self, center_word, context_words):
        embeddings_ = self.embeddings(center_word)
        weights_ = self.weights[:, context_words]
        output = torch.einsum('bi,ibo->bo', embeddings_, weights_)
        true_y = torch.zeros(output.shape[0], dtype=torch.int64)
        return self.out(output), true_y

    
    def save_files(self, out_path='Output'):
        save_file(os.path.join(out_path, 'emb.pkl'), self.embeddings)
        save_file(os.path.join(out_path, 'weights.pkl'), self.weights)
      

### Training

In [127]:
k = 10
lr = 0.01
num_epochs =  2
batch_size = 128
context_window = 5
out_path = 'Output'

In [128]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [136]:
def train_sg(dataloader, model, criterion, optimizer, device, num_epochs):
    model.train()
    best_loss = 1e8
    patience = 0
    for i in range(num_epochs):
        epoch_loss = []
        print(f"Epoch {i+1} of {num_epochs}")
        for center_word, context_words in tqdm(dataloader):
            center_word = center_word.to(device)
            context_words = context_words.to(device)
            output, true_y = model(center_word, context_words)
            loss = criterion(output, true_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss.append(loss.item())
        epoch_loss = np.mean(epoch_loss)
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience = 0
        else:
            patience += 1
        print(f"Loss: {epoch_loss}")
        if patience == 5:
            print("Early stopping...")
    model.save_files()
            

In [137]:
dataset = SkipGramDataset(input_data=tokens, 
                          context_window=context_window, 
                          out_path=out_path, 
                          t=t, 
                          k=k)

100%|██████████| 100/100 [00:00<00:00, 214982.27it/s]


Unique words in the corpus: 2513
Creating data samples...


100%|██████████| 100/100 [00:00<00:00, 964.61it/s]

Generating vocabulary...
Calculating sampling probabilities...
Saving files...





In [139]:
dataloader = torch.utils.data.DataLoader(dataset, 
                                         batch_size=batch_size,
                                         shuffle=True, 
                                         drop_last=True)

In [140]:
model = SkipGram(dataset.vocab_count, embedding_size=embedding_size)

In [141]:
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [142]:
train_sg(dataloader, model, criterion, optimizer, device, num_epochs)

Epoch 1 of 2


100%|██████████| 1780/1780 [00:20<00:00, 85.72it/s]


Loss: 1.7787468166833513
Epoch 2 of 2


100%|██████████| 1780/1780 [00:20<00:00, 86.02it/s]

Loss: 0.7083093739292595





## Using embeddings to get word vectors

In [145]:
word2idx = load_file('Output/word2idx.pkl')
word2idx


{'i': 0,
 'the': 1,
 'to': 2,
 'and': 3,
 'my': 4,
 'a': 5,
 'of': 6,
 'that': 7,
 'was': 8,
 'in': 9,
 'on': 10,
 'they': 11,
 'this': 12,
 'not': 13,
 'have': 14,
 'is': 15,
 'credit': 16,
 'for': 17,
 'account': 18,
 'with': 19,
 'me': 20,
 'it': 21,
 'from': 22,
 'as': 23,
 'be': 24,
 'report': 25,
 'or': 26,
 'an': 27,
 'had': 28,
 'are': 29,
 'information': 30,
 'would': 31,
 'payment': 32,
 'any': 33,
 'been': 34,
 'bank': 35,
 'but': 36,
 'by': 37,
 'am': 38,
 'has': 39,
 'which': 40,
 'told': 41,
 'all': 42,
 'them': 43,
 'no': 44,
 'when': 45,
 'their': 46,
 'because': 47,
 'received': 48,
 'card': 49,
 'you': 50,
 'never': 51,
 'accounts': 52,
 'did': 53,
 'at': 54,
 'loan': 55,
 'we': 56,
 'so': 57,
 'company': 58,
 'were': 59,
 'debt': 60,
 'do': 61,
 'these': 62,
 'number': 63,
 'back': 64,
 'there': 65,
 'time': 66,
 'letter': 67,
 'call': 68,
 'she': 69,
 'if': 70,
 'about': 71,
 'out': 72,
 'made': 73,
 'called': 74,
 'after': 75,
 'reporting': 76,
 'amount': 77,
 'als

In [146]:
word2idx['payments']

83

In [147]:
embeddings = load_file('Output/emb.pkl')

In [149]:
embeddings(torch.tensor(83))

tensor([ 5.8895e-01,  2.1157e-01,  3.1601e-01, -1.4533e+00, -1.2353e+00,
        -5.3033e-02,  7.5406e-01, -1.4900e-01,  4.0169e-01,  1.5359e-01,
        -1.3740e-02, -6.2258e-01, -1.6137e-01,  2.4906e-01, -1.0707e+00,
         2.6062e-01,  5.3572e-01,  6.9810e-01,  6.5606e-01,  3.5893e-01,
         8.5732e-02,  1.1915e-01,  6.6459e-01, -1.0984e-01,  7.4903e-02,
        -4.1574e-01,  4.7407e-02, -8.7902e-04,  2.2150e-01,  4.8981e-01,
         4.5117e-01,  1.8424e-01,  3.5303e-02, -9.5748e-01, -6.8034e-01,
        -1.5042e+00, -2.0639e-01, -1.5200e-01,  1.9036e-02, -2.8010e-01,
         1.2788e+00, -2.0353e-01, -5.1104e-01, -3.9146e-02, -7.4931e-01,
        -2.7249e-01,  1.2151e+00, -4.5485e-01,  1.2863e-01, -1.7437e-01,
        -4.8675e-01, -1.0944e+00, -5.7456e-01,  7.7892e-01, -7.9623e-01,
        -3.0727e-01, -2.1240e-01,  2.1824e-01,  3.8599e-01, -1.7000e+00,
        -6.7727e-01, -2.6601e-01,  1.3301e-01, -4.2749e-01],
       grad_fn=<EmbeddingBackward0>)