In [4]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
stop_words = set(stopwords.words('english'))
import torch

In [5]:
# Load in the data
with open("job_data/out.txt", "r") as f:
    data_true_X = np.array([line.strip().lower() for line in f if len(line) > 0])
    data_true_y = np.ones(data_true_X.shape)
    
with open("not_job_data/output.txt", "r") as f:
    data_false_X = np.array([line.strip().lower() for line in f if len(line) > 0])
    data_false_y = np.zeros(data_false_X.shape)

In [6]:
# Combine the data and shuffle
data_X = np.concatenate((data_true_X, data_false_X), axis=0)
data_y = np.concatenate((data_true_y, data_false_y), axis=0)
b = np.linspace(0, len(data_X)-1, len(data_X)).astype(int)
np.random.shuffle(b)
data_X = data_X[b]
data_y = data_y[b]

In [7]:
print(data_X.shape)
print(data_y.shape)

(43849,)
(43849,)


In [8]:
# Vocabulary
vocab = {
    "0":0,
    "1":1,
    "2":2,
    "3":3,
    "4":4,
    "5":5,
    "6":6,
    "7":7,
    "8":8,
    "9":9,
    "a":10,
    "b":11,
    "c":12,
    "d":13,
    "e":14,
    "f":15,
    "g":16,
    "h":17,
    "i":18,
    'j':19,
    'k':20,
    'l':21,
    'm':22,
    'n':23,
    'o':24,
    'p':25,
    'q':26,
    'r':27,
    's':28,
    't':29,
    'u':30,
    'v':31,
    'w':32,
    'x':33,
    'y':34,
    'z':35,
    "A":36,
    'B':37,
    'C':38,
    'D':39,
    'E':40,
    'F':41,
    'G':42,
    'H':43,
    'I':44,
    'J':45,
    'K':46,
    'L':47,
    'M':48,
    'N':49,
    'O':50,
    'P':51,
    'Q':52,
    'R':53,
    'S':54,
    'T':55,
    'U':56,
    'V':57,
    'W':58,
    'X':59,
    'Y':60,
    'Z':61,
    " ":62,
    "-":63,
    "<PAD>":64
}

In [131]:
vocab_inv = {v: k for k, v in vocab.items()}

In [9]:
# Encode the sentences
data_X_enc = []
data_X_enc_one_hot = []
max_len = 0
for s in data_X:
    sent = []
    hot = []
    for l in s:
        sent.append(vocab[l])
        hot.append(torch.nn.functional.one_hot(torch.tensor(vocab[l], dtype=torch.long), num_classes=len(vocab)))
    if len(sent) > max_len:
        max_len = len(sent)
    
    if len(sent) == 0:
        sent.append(vocab["<PAD>"])
        hot.append(torch.nn.functional.one_hot(torch.tensor(vocab["<PAD>"], dtype=torch.long), num_classes=len(vocab)))
        
    data_X_enc.append(torch.tensor(sent, dtype=torch.float))
    data_X_enc_one_hot.append(torch.stack(hot).to(torch.float))

In [10]:
# PAD tensor
pad1 = torch.tensor(vocab["<PAD>"], dtype=torch.long)
pad2 = torch.nn.functional.one_hot(pad1, num_classes=len(vocab)).unsqueeze(0)

# Add padding to the sequences
for i in range(0, len(data_X_enc)):
    p = data_X_enc[i]
    p2 = data_X_enc_one_hot[i]
    
    # What is the padding size?
    padding = max_len-len(p)
    
    # Padding tensor
    pad1_ = pad1.repeat(padding)
    pad2_ = pad2.repeat(padding, 1)
    
    # Add the padding
    pad1_ = torch.cat((p, pad1_), dim=0)
    pad2_ = torch.cat((p2, pad2_), dim=0)
    
    # Save the tensors
    data_X_enc[i] = pad1_
    data_X_enc_one_hot[i] = pad2_

In [11]:
data_X_enc = torch.stack(data_X_enc)
data_X_enc_one_hot = torch.stack(data_X_enc_one_hot)

In [12]:
data_y = torch.tensor(data_y).to(torch.float32)

In [32]:
class model(torch.nn.Module):
    def __init__(self, in_, out):
        super(model, self).__init__()
        
        self.zero = torch.nn.Embedding(in_, in_)
        self.first = torch.nn.LSTM(in_, out, num_layers=2, bidirectional=True)
        self.second = torch.nn.Sequential(
            torch.nn.Linear(out*2, 1),
            torch.nn.Sigmoid()
        )
        
        self.bce = torch.nn.BCELoss(reduction="none")
        
        self.optim = torch.optim.Adam(self.parameters(), lr=0.0005)
    
    
    def forward(self, X):
        return self.second(self.first(self.zero(X))[0])[:, 0].squeeze()

In [33]:
# Model to train
M = model(len(vocab), 16)

In [37]:
epochs = 1000
batchSize = 256

In [38]:
torch.argmax(data_X_enc, dim=-1)

tensor([12, 11, 20,  ..., 20, 22, 12])

In [39]:
for epoch in range(0, epochs):
    for batch in range(0, len(data_X_enc), batchSize):
        samp = np.random.randint(0, len(data_X_enc), (batchSize))
        
        batch_X = data_X_enc[samp].int()
        batch_y = data_y[samp]
        
        out = M(batch_X)
        
        loss = M.bce(out, batch_y).mean()
        
        loss.backward()
        
        M.optim.step()
        M.optim.zero_grad()
        
        print(loss.item())

0.6059583425521851
0.6767179369926453
0.6022487878799438
0.6230435967445374
0.584203839302063
0.6088851690292358
0.6078330278396606
0.6202196478843689
0.5945796370506287
0.6312519311904907
0.6096928119659424
0.6396051645278931
0.6145243644714355
0.5918936729431152
0.6156343221664429
0.6358606219291687
0.612484335899353
0.608007550239563
0.6033207178115845
0.6481695175170898
0.6221637725830078
0.6274004578590393
0.6134487390518188
0.580052375793457
0.6372479796409607
0.6204797029495239
0.6122516393661499
0.600078821182251
0.6142035722732544
0.597164511680603
0.6229166388511658
0.6314535737037659
0.6365209221839905
0.6106297373771667
0.5737149715423584
0.6111589074134827
0.6137350797653198
0.5687386989593506
0.5814423561096191
0.6455882787704468
0.6375218033790588
0.6761428117752075
0.5763296484947205
0.5968968868255615
0.6058505177497864
0.6287354826927185
0.5907635688781738
0.5783219337463379
0.6007172465324402
0.5986545085906982
0.6133789420127869
0.5731505155563354
0.6228166222572327

KeyboardInterrupt: 

In [147]:
# Save the model
torch.save(M.state_dict(), "./models/model1.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '/models/model1.pkl'

In [125]:
sents = ["ubuntu", "gabriel", "arch"]

In [126]:
sent_one_hot = []
for s in sents:
    sent = []
    hot = []
    for l in s:
        sent.append(vocab[l])
        hot.append(torch.nn.functional.one_hot(torch.tensor(vocab[l], dtype=torch.long), num_classes=len(vocab)))
    if len(sent) > max_len:
        max_len = len(sent)
    
    if len(sent) == 0:
        sent.append(vocab["<PAD>"])
        hot.append(torch.nn.functional.one_hot(torch.tensor(vocab["<PAD>"], dtype=torch.long), num_classes=len(vocab)))
        
    sent_one_hot.append(torch.stack(hot).to(torch.float))

In [127]:
# PAD tensor
pad2 = torch.nn.functional.one_hot(pad1, num_classes=len(vocab)).unsqueeze(0)

# Add padding to the sequences
for i in range(0, len(sent_one_hot)):
    p2 = sent_one_hot[i]
    
    # What is the padding size?
    padding = data_X_enc.shape[-1]-len(p2)
    
    # Padding tensor
    pad2_ = pad2.repeat(padding, 1)
    
    # Add the padding
    pad2_ = torch.cat((p2, pad2_), dim=0)
    
    # Save the tensors
    sent_one_hot[i] = pad2_

In [128]:
sent_one_hot = torch.stack(sent_one_hot)
sent_enc = torch.argmax(sent_one_hot, dim=-1)

In [129]:
sent_one_hot.shape

torch.Size([3, 88, 65])

In [130]:
M(sent_enc.detach())

tensor([0.3910, 0.3978, 0.3461], grad_fn=<SqueezeBackward0>)