In [1]:
!pip install pdfplumber
!pip install tiktoken



In [2]:
import os
import pdfplumber
from tqdm import tqdm
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer

In [3]:
# WordPiece tokenizer for the "bert-base-uncased" checkpoint; later cells use it
# both to split resume text into tokens and to map tokens to vocabulary ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF as a single newline-joined string.

    Every page is passed through ``page.extract_text()``; pages that yield a
    falsy result (``None`` or an empty string) are left out. A PDF with no
    extractable text therefore produces an empty string.

    Args:
        pdf_path: Location of the PDF, handed unchanged to ``pdfplumber.open``.

    Returns:
        The per-page texts, in page order, joined with ``"\\n"``.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extract lazily but consume inside the ``with`` so the file is still open.
        extracted = (page.extract_text() for page in document.pages)
        non_empty_pages = [chunk for chunk in extracted if chunk]

    return "\n".join(non_empty_pages)

In [5]:
def load_all_resumes(single_dir_abs_path):
    """Collect the text of every PDF found beneath a directory.

    The tree rooted at ``single_dir_abs_path`` is walked with ``os.walk``.
    Files whose name ends in ``.pdf`` (compared case-insensitively) are run
    through ``extract_text_from_pdf``; results that are empty or contain only
    whitespace are discarded.

    Args:
        single_dir_abs_path: Directory to search recursively.

    Returns:
        A list of extracted texts, in the order ``os.walk`` visits the files.
    """
    pdf_paths = (
        os.path.join(folder, name)
        for folder, _, names in os.walk(single_dir_abs_path)
        for name in names
        if name.lower().endswith(".pdf")
    )
    extracted_texts = map(extract_text_from_pdf, pdf_paths)

    return [content for content in extracted_texts if content.strip()]

In [6]:
# Dataset root; process_resumes_per_category joins one sub-directory name onto
# this path per call.
# NOTE(review): hard-coded absolute path - adjust when running outside the
# environment that mounts it.
root_dir = "/kaggle/input/resume-dataset/data/data"

In [7]:
def process_resumes_per_category(single_dir):
    """Load every resume stored under one sub-directory of ``root_dir``.

    Args:
        single_dir: Directory name relative to ``root_dir``.

    Returns:
        A ``(single_dir, texts)`` tuple, where ``texts`` is whatever
        ``load_all_resumes`` returns for that directory. The name is echoed
        back so a caller can tell which directory a result belongs to.
    """
    category_path = os.path.join(root_dir, single_dir)
    category_resumes = load_all_resumes(category_path)
    return single_dir, category_resumes

In [8]:
# Disabled cell: the whole body is a bare string literal, so none of it runs;
# the notebook only echoes the string back as the cell output.
# If re-enabled it would submit one process_resumes_per_category call per entry
# of os.listdir(root_dir) to a thread pool and collect the results into
# data_dict as {directory name: list of resume texts}.
# NOTE(review): the except branch prints `single_dir`, which is only bound by a
# successful `single_pool.result()` unpacking. On failure it is either unbound
# (first future) or names a previously completed directory.
"""
data_dict = dict()
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:

    parallel_pools = [pool.submit(process_resumes_per_category, single_dir) for single_dir in os.listdir(root_dir)]
    for single_pool in tqdm(as_completed(parallel_pools), total=len(parallel_pools)):
        try:
            single_dir, resumes_raw_text_list = single_pool.result()
            data_dict[single_dir] = resumes_raw_text_list
        except Exception as e:
            print(f"Error processing {single_dir}: {e}")
"""

'\ndata_dict = dict()\nwith ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:\n    \n    parallel_pools = [pool.submit(process_resumes_per_category, single_dir) for single_dir in os.listdir(root_dir)]\n    for single_pool in tqdm(as_completed(parallel_pools), total=len(parallel_pools)):\n        try:\n            single_dir, resumes_raw_text_list = single_pool.result()\n            data_dict[single_dir] = resumes_raw_text_list\n        except Exception as e:\n            print(f"Error processing {single_dir}: {e}")\n'

In [9]:
# Disabled cell (bare string literal, nothing runs). It would create a tiktoken
# "o200k_base" encoding; `gpt_tokenizer_encodings` is not referenced by any
# other cell visible here.
"""
gpt_tokenizer_encodings = tiktoken.get_encoding("o200k_base")
"""

'\ngpt_tokenizer_encodings = tiktoken.get_encoding("o200k_base")\n'

In [10]:
# Disabled cell (bare string literal, nothing runs). It would pickle data_dict
# to "data_dict.pkl", the same file name that the following cell loads.
"""
with open("data_dict.pkl","wb") as file_handle:
    pickle.dump(data_dict,file_handle)
"""

'\nwith open("data_dict.pkl","wb") as file_handle:\n    pickle.dump(data_dict,file_handle)\n'

In [11]:
# Load the cached data_dict from "data_dict.pkl" in the working directory.
# NOTE(review): the cells that would create this file are disabled above, so a
# fresh Restart & Run All fails here unless the file already exists.
# NOTE(review): pickle.load can execute arbitrary code - only load a file you
# produced yourself.
# `pickle` is already imported in the import cell; this repeat is redundant.
import pickle
with open("data_dict.pkl", "rb") as file_handle:
    data_dict = pickle.load(file_handle)

In [12]:
# Tokenise every resume with the BERT tokenizer, prefix it with "[CLS]" and
# record it next to its job label (the data_dict key it came from).
encoded_data_dict = {"Resume Encoded Text":[], "Suitable Job":[]}
row_idx = 0  # not referenced by any other visible cell
max_len = 0

for job_label, resume_texts in data_dict.items():
    for resume_text in resume_texts:

        encoded_resume_text = ["[CLS]"] + tokenizer.tokenize(resume_text)
        encoded_data_dict["Resume Encoded Text"].append(encoded_resume_text)
        encoded_data_dict["Suitable Job"].append(job_label)

        # Track the longest sequence seen so far, counting one extra slot
        # beyond the tokens collected here.
        max_len = max(max_len, len(encoded_resume_text)+1)

In [13]:
# Round max_len DOWN to a whole number of 512-token context windows.
# NOTE(review): because this rounds down, any sequence longer than the result
# has to be truncated before the data is treated as fixed-length; if max_len
# was below 512 the result is 0.
bert_base_context_len = 512
max_len = (max_len // bert_base_context_len) * bert_base_context_len
print(max_len)

6144


In [14]:
# Bring every token list to exactly max_len entries laid out as
#   [CLS] text tokens ... [SEP] [PAD] [PAD] ...
# Fixes over the previous version:
#   * filler is the "[PAD]" token string; the integer 0 is not a vocabulary
#     token, so convert_tokens_to_ids would not map it to the padding id
#     (assumes the tokenizer vocabulary has "[PAD]", as bert-base-uncased does);
#   * sequences longer than max_len - 1 are truncated, so no row ends up longer
#     than max_len (max_len was rounded down to a multiple of 512);
#   * "[SEP]" closes the text instead of trailing after the padding.
# NOTE: like the original, this cell is meant to run once per tokenisation pass.
for idx, encoded_resume_text in enumerate(encoded_data_dict["Resume Encoded Text"]):
    # Keep one slot free for the closing "[SEP]".
    encoded_resume_text = encoded_resume_text[:max(max_len-1, 0)]
    num_padding = max(max_len-1-len(encoded_resume_text), 0)
    encoded_data_dict["Resume Encoded Text"][idx] = encoded_resume_text + ["[SEP]"] + ["[PAD]"]*num_padding

In [15]:
# One row per resume, with columns "Resume Encoded Text" (token list) and
# "Suitable Job" (label), taken from the keys of encoded_data_dict.
data = pd.DataFrame(data=encoded_data_dict)

In [16]:
# Preview the first rows while the column still holds token strings.
data.head()

Unnamed: 0,Resume Encoded Text,Suitable Job
0,"[[CLS], pre, -, press, graphic, designer, summ...",DESIGNER
1,"[[CLS], principle, designer, /, owner, profess...",DESIGNER
2,"[[CLS], project, designer, summary, team, -, o...",DESIGNER
3,"[[CLS], interior, designer, summary, a, result...",DESIGNER
4,"[[CLS], presentation, designer, summary, custo...",DESIGNER


In [17]:
# Replace each token list with the matching list of vocabulary ids.
# NOTE(review): this overwrites the column in place, so running the cell a
# second time would feed integer ids back into convert_tokens_to_ids.
data["Resume Encoded Text"] = data["Resume Encoded Text"].apply(tokenizer.convert_tokens_to_ids)

In [18]:
# Display the frame after the token -> id conversion.
data

Unnamed: 0,Resume Encoded Text,Suitable Job
0,"[101, 3653, 1011, 2811, 8425, 5859, 12654, 554...",DESIGNER
1,"[101, 6958, 5859, 1013, 3954, 2658, 12654, 459...",DESIGNER
2,"[101, 2622, 5859, 12654, 2136, 1011, 8048, 199...",DESIGNER
3,"[101, 4592, 5859, 12654, 1037, 3463, 8048, 585...",DESIGNER
4,"[101, 8312, 5859, 12654, 8013, 2326, 1998, 702...",DESIGNER
...,...,...
2478,"[101, 4007, 3992, 6337, 5281, 4007, 3992, 2236...",ARTS
2479,"[101, 5957, 16661, 2658, 6337, 29454, 29206, 2...",ARTS
2480,"[101, 5964, 1010, 6619, 1998, 7891, 2394, 3836...",ARTS
2481,"[101, 3772, 3694, 1018, 8013, 1013, 4435, 6337...",ARTS


In [19]:
# Shuffle the rows: draw every row position exactly once (replace=False) in
# random order and index the frame with that permutation.
# NOTE(review): no random seed is set in the visible cells, so the order
# differs between runs.
shuffled_data = data.iloc[np.random.choice(np.arange(0,data.shape[0]),size=(data.shape[0],),replace=False)]

In [20]:
# Renumber the shuffled rows 0..n-1 in place, discarding the pre-shuffle index.
shuffled_data.reset_index(inplace=True,drop=True)

In [21]:
# Display the shuffled frame to check that the job labels are now mixed.
shuffled_data

Unnamed: 0,Resume Encoded Text,Suitable Job
0,"[101, 2449, 2458, 3237, 6337, 2058, 2459, 2086...",BUSINESS-DEVELOPMENT
1,"[101, 1056, 2860, 7681, 3836, 3325, 2194, 2171...",TEACHER
2,"[101, 3026, 12233, 4341, 4387, 12654, 13079, 2...",HEALTHCARE
3,"[101, 2599, 5660, 1006, 5660, 3523, 1007, 1265...",CHEF
4,"[101, 2449, 2458, 2472, 12654, 2449, 2458, 247...",BUSINESS-DEVELOPMENT
...,...,...
2478,"[101, 3573, 13106, 2462, 2658, 12654, 1996, 38...",AVIATION
2479,"[101, 5414, 3353, 1013, 3293, 1004, 2810, 2613...",CONSTRUCTION
2480,"[101, 5734, 16661, 12654, 1045, 2031, 4961, 20...",AVIATION
2481,"[101, 3330, 2578, 3208, 3237, 6337, 1037, 3463...",ENGINEERING


In [22]:
# Map each job label (a data_dict key) to a consecutive integer id, numbered in
# data_dict's key order starting from 0.
labels2idx = {job_label: label_idx for label_idx, job_label in enumerate(data_dict.keys())}

In [23]:
# 70 / 30 train-test split taken from the SHUFFLED frame.
# The previous version sliced `data`, whose rows are still grouped by job
# category, so the test set consisted of categories the training set never
# contained. `shuffled_data` has the same rows in random order.
train_size = int(0.7*shuffled_data.shape[0])
training_data = shuffled_data.iloc[:train_size]
testing_data = shuffled_data.iloc[train_size:]

In [24]:
# Sanity check: number of ids in the first training row.
len(training_data.iloc[0,0])

6144

In [25]:
def training_data_generator(mb_size=79):
    """Yield consecutive mini-batches from the global ``training_data`` frame.

    Args:
        mb_size: Number of rows per mini-batch.

    Yields:
        ``(X_mb, y_mb)`` tuples of NumPy arrays built from column 0 and
        column 1 of ``training_data`` for the current slice of rows.

    NOTE(review): only full batches are produced - the floor division drops the
    trailing ``len(training_data) % mb_size`` rows.
    NOTE(review): if column 0 holds Python lists, ``np.array`` on that column
    yields a 1-D object array of lists rather than a 2-D integer array -
    confirm this is what the consumer expects.
    """
    num_full_batches = training_data.shape[0] // mb_size

    for batch_no in range(num_full_batches):
        batch_rows = training_data.iloc[batch_no*mb_size:(batch_no+1)*mb_size]

        yield np.array(batch_rows.iloc[:, 0]), np.array(batch_rows.iloc[:, 1])

In [26]:
class SingleAttentionHead(torch.nn.Module):
    """A single scaled dot-product attention head.

    Queries, keys and values are each projected (without bias) to ``sha_dim``
    features; the output is ``softmax(Q K^T / sqrt(sha_dim)) V``.

    Inputs may be unbatched ``(seq_len, dim)`` or carry leading batch
    dimensions ``(..., seq_len, dim)``: the key transpose and the softmax act
    on the last two / last axis respectively.

    Args:
        query_key_embedding_dim: Feature size of the query and key inputs.
        value_embedding_dim: Feature size of the value input.
        sha_dim: Output feature size of this head.
        masked: When true, apply a causal mask so that query position ``i``
            only attends to key positions ``j <= i``.
    """

    def __init__(self,query_key_embedding_dim,value_embedding_dim,sha_dim,masked):
        super().__init__()

        self.sha_dim = sha_dim
        self.masked = masked

        self.query_projection_layer = torch.nn.Linear(in_features=query_key_embedding_dim,
                                                     out_features=sha_dim,bias=False)
        self.key_projection_layer = torch.nn.Linear(in_features=query_key_embedding_dim,
                                                   out_features=sha_dim,bias=False)
        self.value_projection_layer = torch.nn.Linear(in_features=value_embedding_dim,
                                                     out_features=sha_dim,bias=False)
        # Normalise over the key axis. For 2-D scores this is the same axis as
        # the previous dim=1, and it stays correct when batch axes are present.
        self.softmax_activation = torch.nn.Softmax(dim=-1)

    def forward(self,query_embedding,key_embedding,value_embedding):
        """Return the attention-weighted values, shaped ``(..., query_len, sha_dim)``."""

        projected_query = self.query_projection_layer(query_embedding)
        projected_key = self.key_projection_layer(key_embedding)
        projected_value = self.value_projection_layer(value_embedding)

        # Scale by a plain Python float: no per-call tensor allocation and no
        # CPU/GPU device mismatch when the module lives on an accelerator.
        query_key_similarity_search = torch.matmul(
            projected_query, projected_key.transpose(-2, -1)
        ) / (self.sha_dim ** 0.5)

        if self.masked:
            # Future positions must receive -inf, not 0: a zero score still
            # gets exp(0) weight after the softmax, which leaks information.
            num_queries, num_keys = query_key_similarity_search.shape[-2:]
            visible = torch.ones(num_queries, num_keys, dtype=torch.bool,
                                 device=query_key_similarity_search.device).tril()
            query_key_similarity_search = query_key_similarity_search.masked_fill(
                ~visible, float("-inf"))

        query_key_soft_search = self.softmax_activation(query_key_similarity_search)
        weighted_attn_embedding = torch.matmul(query_key_soft_search,projected_value)

        return weighted_attn_embedding

In [27]:
class MultiHeadAttentionLayer(torch.nn.Module):
    """Multi-head attention: parallel heads, concatenation, output projection.

    ``value_embedding_dim`` is split evenly across ``num_attn_heads`` heads;
    their outputs are concatenated along the feature axis and passed through a
    bias-free linear projection of size ``value_embedding_dim``.

    Args:
        query_key_embedding_dim: Feature size of the query and key inputs.
        value_embedding_dim: Feature size of the value input and of the output.
        num_attn_heads: Number of attention heads.
        masked: Forwarded to every head (causal masking on/off).

    Raises:
        ValueError: If ``value_embedding_dim`` is not divisible by
            ``num_attn_heads``; the concatenated head outputs would not match
            the output projection's input size.
    """

    def __init__(self,query_key_embedding_dim,value_embedding_dim,num_attn_heads,masked):
        super().__init__()

        if value_embedding_dim % num_attn_heads != 0:
            raise ValueError(
                f"value_embedding_dim ({value_embedding_dim}) must be divisible by "
                f"num_attn_heads ({num_attn_heads})")

        sha_dim = value_embedding_dim//num_attn_heads

        # ModuleList (not a plain list) so the heads' weights are registered:
        # otherwise they are missing from .parameters() / state_dict(), are not
        # moved by .to(device) and are never updated by an optimizer.
        self.attn_heads = torch.nn.ModuleList([
            SingleAttentionHead(query_key_embedding_dim,value_embedding_dim,sha_dim,masked)
            for _ in range(num_attn_heads)
        ])

        self.mha_projection_layer = torch.nn.Linear(in_features=value_embedding_dim,
                                                   out_features=value_embedding_dim,bias=False)

    def forward(self,query_embedding,key_embedding,value_embedding):
        """Run every head on the same inputs and project the concatenated result."""

        attn_heads_weighted_embeddings = [
            single_attn_head(query_embedding,key_embedding,value_embedding)
            for single_attn_head in self.attn_heads
        ]

        # Concatenate along the feature (last) axis so both unbatched and
        # batched inputs work; for 2-D inputs this equals the previous dim=1.
        mha_concatenated_embeddings = torch.cat(attn_heads_weighted_embeddings,dim=-1)
        mha_output = self.mha_projection_layer(mha_concatenated_embeddings)

        return mha_output

In [28]:
class EncoderLayer(torch.nn.Module):
    """Transformer encoder layer (post-layer-norm variant).

    Two sub-layers, each wrapped as ``LayerNorm(x + sublayer(x))``:

    1. unmasked multi-head self-attention;
    2. a position-wise feed-forward network that expands to
       ``4 * input_embedding_dim``, applies GELU and projects back.

    Args:
        input_embedding_dim: Feature size of the input and of the output.
        num_attn_heads: Number of heads in the self-attention sub-layer.
    """

    def __init__(self,input_embedding_dim,num_attn_heads):
        super().__init__()

        self.mha_layer = MultiHeadAttentionLayer(input_embedding_dim,input_embedding_dim,
                                                num_attn_heads,False)
        self.layer_norm_1 = torch.nn.LayerNorm(input_embedding_dim)
        self.ffn_inner_layer = torch.nn.Linear(in_features=input_embedding_dim,
                                              out_features=4*input_embedding_dim)
        self.ffn_output_layer = torch.nn.Linear(in_features=4*input_embedding_dim,
                                               out_features=input_embedding_dim)
        self.layer_norm_2 = torch.nn.LayerNorm(input_embedding_dim)


    def forward(self,input_embedding):
        """Apply self-attention and the feed-forward block; the shape is preserved."""

        mha_layer_out = self.mha_layer(input_embedding,input_embedding,input_embedding)
        layer_norm1_out = self.layer_norm_1(input_embedding+mha_layer_out)

        # Without a non-linearity the two linear layers collapse into a single
        # linear map, so the 4x expansion would add no modelling capacity.
        higher_dim_projection = torch.nn.functional.gelu(self.ffn_inner_layer(layer_norm1_out))
        ffn_out = self.ffn_output_layer(higher_dim_projection)

        # Residual connection around the feed-forward block, mirroring the one
        # around the attention block above.
        encoder_layer_out = self.layer_norm_2(layer_norm1_out+ffn_out)

        return encoder_layer_out

In [29]:
class BERT(torch.nn.Module):

    def __init__(self,vocab_size,max_context_len,model_dim,num_encoder_layers,
                 num_attn_heads,num_classes,model_context_len):
        super().__init__()


        self.model_context_len = model_context_len
        self.embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size,
                                                  embedding_dim=model_dim)
        self.pos_encoding_layer = torch.nn.Embedding(num_embeddings=max_context_len,
                                                    embedding_dim=model_dim)
        self.encoder_layer_stack = list()

        for _ in range(num_encoder_layers):
            self.encoder_layer_stack.append(EncoderLayer(model_dim,num_attn_heads))

        self.classification_head = torch.nn.Linear(in_features=model_dim,out_features=num_classes)
        self.classification_head_activation = torch.nn.Softmax(dim=1)


    def forward(self,X):

        token_embedding = self.embedding_layer(X)
        position_ids = torch.arange(start=0,end=max_context_len)
        position_ids = position_ids.view(max_context_len//self.model_context_len,
                                         self.model_context_len)
        pos_encoding = self.pos_encoding_layer(position_ids)
        input_embedding = token_embedding + pos_encoding

        for single_encoding_layer in self.encoder_layer_stack:
            output_embedding = single_encoding_layer(input_embedding)
            input_embedding = output_embedding

        encoder_out = self.classification_head_activation(self.classification_head(output_embedding[:,0]))