# Install Necessary Libraries

In [None]:
# Uncomment to install the dependencies into the running kernel's environment:
# %pip install -q transformers torch scikit-learn nltk

# Import Libraries and Load BERT NER Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
import numpy as np

# Fetch the NLTK stopword corpus first; the import below reads from it
nltk.download('stopwords')
from nltk.corpus import stopwords

# Hugging Face checkpoint id shared by the tokenizer and the model
NER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Load the tokenizer and the token-classification weights from that checkpoint
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Bundle model + tokenizer into a callable NER pipeline
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Define Functions for Preprocessing and Keyword Extraction

In [None]:
def preprocess_text(text):
    """Normalize raw text for bag-of-words keyword extraction.

    Lowercases the text, replaces every non-word character with a space,
    deletes digits, and only then collapses whitespace runs to one space.

    Whitespace is collapsed last on purpose: turning punctuation into spaces
    and deleting digits both create new runs of spaces (", " becomes two
    spaces), so collapsing earlier would leave them in the result.

    Args:
        text (str): Raw input text.

    Returns:
        str: Lowercased text with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)   # Non-word characters -> space ('_' is a word character and is kept)
    text = re.sub(r'\d', '', text)    # Remove all digits
    text = re.sub(r'\s+', ' ', text)  # Collapse the whitespace runs created above
    return text.strip()

def extract_ner_keywords(text):
    """Return the entity strings the NER pipeline finds in ``text``.

    The module-level ``nlp`` pipeline is built without an aggregation
    strategy, so it yields one prediction per tagged token. BERT's WordPiece
    tokenizer marks word continuations with a leading ``##``; those pieces
    are glued back onto the preceding token here instead of being returned
    as separate "keywords" such as ``##soft``.

    Args:
        text (str): Raw (case-preserved) text to tag.

    Returns:
        list[str]: Entity words in order of appearance. Duplicates are kept.
        Empty or whitespace-only input yields an empty list.
    """
    if not text or not text.strip():
        return []

    keywords = []
    prev_index = None  # token position of the previous prediction, when the pipeline reports one
    for entity in nlp(text):
        word = entity['word']
        index = entity.get('index')  # NOTE(review): present in un-aggregated output; may be absent otherwise
        if word.startswith('##') and len(word) > 2:
            piece = word[2:]
            follows_previous = (
                index is not None
                and prev_index is not None
                and index == prev_index + 1
            )
            if keywords and follows_previous:
                # Continuation of the token we just emitted: rebuild the word.
                keywords[-1] += piece
            else:
                # The head of the word was not tagged; keep the piece without the marker.
                keywords.append(piece)
        else:
            keywords.append(word)
        prev_index = index
    return keywords

def extract_tfidf_keywords(text, top_n=10):
    """Return the ``top_n`` highest-scoring 1- to 3-gram terms of ``text``.

    The vectorizer is fitted on this single document, so every term gets the
    same IDF weight and ``vectorizer.idf_`` cannot rank anything. Terms are
    therefore ranked by their TF-IDF score in the document itself, which for
    one document orders them by in-document frequency. Ties are broken by the
    vectorizer's feature order via a stable sort, so the result is
    deterministic.

    Args:
        text (str): Text to analyse (typically already preprocessed).
        top_n (int): Maximum number of terms to return.

    Returns:
        list: Up to ``top_n`` terms, highest score first. Empty when the text
        is blank or contains nothing but stop words.
    """
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 3))
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary ends up empty
        # (e.g. the text consists solely of stop words).
        return []

    scores = X.toarray().ravel()  # one row (the single document) -> per-term scores
    features = vectorizer.get_feature_names_out()
    indices = np.argsort(-scores, kind='stable')  # descending score, stable on ties
    top_features = [features[i] for i in indices[:top_n]]
    return top_features


# User Input for Job Descriptions and Keyword Extraction

In [None]:
def get_job_descriptions():
    """Read job descriptions from standard input, one per line.

    Reading stops when the user enters ``done`` (any casing, surrounding
    whitespace ignored) or when input ends (EOF). Blank lines are skipped so
    that downstream keyword extraction never receives an empty description.

    Returns:
        list[str]: The non-blank lines entered before the sentinel, in the
        order they were typed and exactly as typed.
    """
    print("Enter job descriptions (type 'done' to finish):")
    job_descriptions = []
    while True:
        try:
            line = input()
        except EOFError:
            # Input stream closed without the sentinel: treat it as "done".
            break
        entry = line.strip()
        if entry.lower() == 'done':
            break
        if not entry:
            continue  # ignore empty / whitespace-only lines
        job_descriptions.append(line)
    return job_descriptions

# Function to get and process job descriptions, then extract keywords
def analyze_job_descriptions():
    """Prompt for job descriptions and print the keywords extracted from them.

    For each description, NER runs on the raw text (the model checkpoint is
    cased, so the original casing is passed through) and TF-IDF runs on the
    preprocessed text. The two keyword lists are merged and de-duplicated
    case-insensitively, keeping the first spelling seen, so "Python" from NER
    and "python" from TF-IDF count as one keyword.

    Keywords are printed in first-seen order. An insertion-ordered dict is
    used instead of ``set`` because set iteration order for strings can
    differ between interpreter runs, which made the printed list
    irreproducible.

    Returns:
        None. Results are written to standard output.
    """
    job_descriptions = get_job_descriptions()
    unique_keywords = {}  # casefolded keyword -> first spelling encountered
    for description in job_descriptions:
        preprocessed_text = preprocess_text(description)
        ner_keywords = extract_ner_keywords(description)
        tfidf_keywords = extract_tfidf_keywords(preprocessed_text)
        # NER keywords go first so their original casing wins on a clash.
        for keyword in ner_keywords + tfidf_keywords:
            unique_keywords.setdefault(keyword.casefold(), keyword)
    print("\nTop Keywords:")
    for keyword in unique_keywords.values():
        print(keyword)

# Run the analysis: prompts for job descriptions on stdin, then prints the extracted keywords
analyze_job_descriptions()


Enter job descriptions (type 'done' to finish):
We are looking for a data scientist with experience in Python, machine learning, and data analysis. The ideal candidate will have expertise in Java, deep learning, and cloud computing
done

Top Keywords:
experience python machine
experience
deep learning cloud
Python
experience python
deep learning
data scientist experience
expertise
data analysis ideal
scientist experience python
Java
deep
