Please run the notebook cells in order, up to and including the cell that defines `testing_model`. To test the model, set the following variables before running the cell that calls `testing_model(...)`:
- `output_pdf_path`: The file path where the highlighted output PDF will be saved.
- `pdf_path`: The path to the PDF file of the resume you wish to test.
- `resume_category`: A short text describing the desired job category; passages of the resume are scored by their similarity to this text.


In [None]:
!pip install PyMuPDF

In [None]:
pip install transformers -U

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.utils import simple_preprocess
import re
import fitz  # Import the PyMuPDF library
import string

In [None]:
# Collect the lowercase names of every Indian and US city listed in worldcities.csv.
df1 = pd.read_csv('worldcities.csv')
df1 = df1[df1['country'].isin(['India', 'United States'])]
cities = [city_name.lower() for city_name in df1['city'].tolist()]

# The tokenizer uses the stock 'bert-base-uncased' vocabulary; the encoder weights
# are loaded from the local 'BERT_FINETUNED' directory (change the path if the
# fine-tuned model was saved elsewhere).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('BERT_FINETUNED')

def get_average_embedding(text, tokenizer, model):
    """Return the attention-masked mean of the model's last hidden states.

    Args:
        text: Input passed to ``tokenizer`` (padded, truncated to 512 tokens).
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        The hidden states averaged over axis 1, counting only positions whose
        attention-mask value is non-zero.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    attention_mask = encoded['attention_mask']

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        model_output = model(
            input_ids=encoded['input_ids'],
            attention_mask=attention_mask,
        )
    hidden_states = model_output.last_hidden_state

    # A trailing axis lets the mask broadcast against the hidden states, so
    # padded positions contribute zero to the sum.
    token_weights = attention_mask.unsqueeze(-1).float()
    weighted_sum = (hidden_states * token_weights).sum(dim=1)
    token_counts = token_weights.sum(dim=1)

    return weighted_sum / token_counts


# Fetch the NLTK corpora used by this notebook (no-op when already present).
for nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Month names in long format, all lowercase
months_long = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]

# Month names in short format, all lowercase
months_short = [
    "jan", "feb", "mar", "apr", "may", "jun",
    "jul", "aug", "sep", "oct", "nov", "dec",
]

# Resume boilerplate words that should never count as content
# (duplicate entries from the earlier list have been removed).
del_words = [
    'name', 'city', 'state', 'country', 'fullname', 'company', 'resume', 'intro',
    'curriculum', 'vitae', 'address', 'phone', 'email', 'linkedin', 'profile',
    'summary', 'objective', 'experience', 'education', 'skill', 'skills', 'bachelor',
    'reference', 'references', 'contact', 'detail', 'details', 'mail', 'gmail',
    'yahoo', 'hotmail', 'mailing', 'twitter', 'facebook', 'instagram', 'using',
    'website', 'web', 'url', 'www', 'year', 'month', 'months', 'requirement',
    'first', 'last', 'xxxx', 'rstlast', 'github', 'university', 'expected',
    'science', 'project', 'description', 'responsibility', 'role', 'time',
    'nagpur', 'secondary', 'exprience',
]

# A set rather than a list: preprocess() tests every token for membership, and
# the city names alone make this collection large, so O(1) lookups matter.
stop_words = {
    *stopwords.words('english'),
    *del_words,
    *cities,
    *months_long,
    *months_short,
}
# ct=0
def preprocess(text):
    """Clean raw text and return its informative tokens joined by spaces.

    The text is stripped of URLs, standalone ``RT``/``cc`` markers, hashtags,
    mentions, punctuation, non-ASCII characters and digits, then lowercased and
    tokenised with ``simple_preprocess``. Tokens that appear in the module-level
    ``stop_words`` collection or that have three or fewer characters are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        A single string of the surviving tokens separated by one space
        (empty when nothing survives).
    """
    # Raw-string patterns: '\S', '\s' and '\d' in plain literals are invalid
    # escape sequences and trigger warnings on current Python versions.
    text = re.sub(r'http\S+\s*', ' ', text)  # remove URLs
    # Word boundaries keep this from deleting "cc" inside ordinary words such
    # as "accounting" or "success"; only standalone RT / cc markers are removed.
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    text = re.sub(r'#\S+', '', text)  # remove hashtags
    text = re.sub(r'@\S+', '  ', text)  # remove mentions
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)  # remove punctuation
    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.lower()
    return " ".join(
        token
        for token in simple_preprocess(text)
        if token not in stop_words and len(token) > 3
    )

def cosine_similarity_torch(vec_a, vec_b, eps=1e-8):
    """Return the cosine similarity of two 1-D tensors as a 0-dim tensor.

    Args:
        vec_a: First 1-D floating-point tensor.
        vec_b: Second 1-D floating-point tensor of the same length.
        eps: Lower bound applied to the product of the norms, so a zero vector
            yields a similarity of 0 instead of NaN.

    Returns:
        ``dot(vec_a, vec_b) / max(||vec_a|| * ||vec_b||, eps)``.
    """
    dot_product = torch.dot(vec_a, vec_b)
    norm_product = torch.norm(vec_a) * torch.norm(vec_b)

    # Clamping only changes the result when a norm is (almost) zero, where the
    # unclamped division would produce NaN and poison any later sort.
    return dot_product / torch.clamp(norm_product, min=eps)


In [None]:
def testing_model(output_pdf_path,pdf_path,resume_category):
    """Highlight the resume text that is most similar to a target job category.

    Every line of the PDF is split on periods into chunks; each chunk is cleaned
    with ``preprocess``, embedded with ``get_average_embedding`` (using the
    module-level ``tokenizer`` and ``model``) and compared to the embedded
    ``resume_category`` by cosine similarity. The words of the best-scoring
    chunks (10 per page of the document) are then highlighted on every page and
    the annotated document is saved.

    Args:
        output_pdf_path: Path where the highlighted PDF is written.
        pdf_path: Path of the resume PDF to analyse.
        resume_category: Free-text description of the target job category.

    Returns:
        None. Prints the page count and writes the highlighted PDF.
    """
    category_embedding = get_average_embedding(preprocess(resume_category), tokenizer, model)[0]

    def score_pdf_chunks(path):
        """Return ({cleaned chunk: similarity as float}, page count) for the PDF."""
        # Strip all punctuation except '.', which is used below to split sentences.
        punctuation_except_period = string.punctuation.replace('.', '')
        strip_punctuation = str.maketrans('', '', punctuation_except_period)

        # The score table is created once, outside the page loop, so chunks
        # from every page are kept (not only those of the last page).
        scores = {}
        with fitz.open(path) as doc:
            page_count = len(doc)
            for page in doc:
                for line in page.get_text().split('\n'):
                    clean_line = line.translate(strip_punctuation)
                    # Drop the '.js' suffix so names like "node.js" are not split on the period
                    clean_line = ' '.join(word.replace('.js', '') for word in clean_line.split())
                    # Filter out non-ASCII characters
                    clean_line = ''.join(char for char in clean_line if ord(char) < 128)
                    for sentence in clean_line.split('.'):
                        chunk = preprocess(sentence)
                        if not chunk:
                            continue
                        chunk_embedding = get_average_embedding(chunk, tokenizer, model)[0]
                        # Store a plain float so sorting and colouring do not depend on tensors
                        scores[chunk] = float(cosine_similarity_torch(chunk_embedding, category_embedding))
        return scores, page_count

    def highlight_words_in_pdf(input_pdf_path, output_pdf_path, words_dict):
        """Highlight every word of each scored sentence on every page and save the result."""
        with fitz.open(input_pdf_path) as doc:
            for page in doc:
                page.clean_contents()
                for sentence, score in words_dict.items():
                    # Colour components must lie in [0, 1]; cosine similarity can be negative.
                    green = min(max(score, 0.0), 1.0)
                    for word in sentence.split():
                        for inst in page.search_for(word):
                            highlight = page.add_highlight_annot(inst)
                            # Red for low similarity, shifting to yellow as similarity approaches 1
                            highlight.set_colors(stroke=(1, green, 0))
                            highlight.update()
            doc.save(output_pdf_path)

    chunk_scores, num_pages = score_pdf_chunks(pdf_path)

    print(f"The number of pages in the PDF is: {num_pages}")

    # Keep the 10 * num_pages highest-scoring chunks
    top_chunks = dict(
        sorted(chunk_scores.items(), key=lambda item: item[1], reverse=True)[:10 * num_pages]
    )
    highlight_words_in_pdf(pdf_path, output_pdf_path, top_chunks)

In [None]:
# Path where the highlighted copy of the resume will be written
output_pdf_path='highlighted2.pdf'
# Path to the resume PDF to analyse
pdf_path = 'Labelled/SWE_Resume_Highlighted.pdf'
# Free-text description of the target job category; resume text is scored against it
resume_category='Senior Technical Artist and VFX Specialist with extensive experience in game development across multiple platforms and engines.'

In [None]:
# Score the resume against the category and write the highlighted PDF to output_pdf_path
testing_model(output_pdf_path,pdf_path,resume_category)

In [None]:
pip install --upgrade tokenizers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the tokenizer and model with the Mixtral instruct model ID.
# NOTE: this rebinds the global `tokenizer` and `model` names that
# testing_model() reads; re-run the BERT loading cell before calling
# testing_model() again.
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Define the task for the model. The content is plain text: the chat template
# wraps user turns in [INST] ... [/INST] itself, so writing the tags here
# would duplicate them in the prompt.
messages = [
    {"role": "user", "content": "Generate a list of keywords relevant for a Data Analysis role, separated by commas."},
]

In [None]:

# Apply the chat template to format the input correctly. return_dict=True yields
# both input_ids and attention_mask, and .to(model.device) places them on the
# device the model was loaded onto by device_map="auto".
inputs = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

# Generate the model's response
outputs = model.generate(**inputs, max_new_tokens=100)

# Decode and print the full sequence (prompt followed by the generated response)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
