In [None]:
# Agglomerative clustering with an SVM classifier
import re
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # Import Support Vector Classifier
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader

# Download the NLTK resources this script needs: tokenizer models for
# nltk.word_tokenize and the English stop-word list.
nltk.download('punkt')
# NOTE(review): recent NLTK releases load 'punkt_tab' (not 'punkt') inside
# word_tokenize; fetching both keeps the script working on old and new
# installs — confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF file as one string.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, concatenated
        with nothing inserted between pages.
    """
    with open(pdf_file, 'rb') as file:
        reader = PdfReader(file)
        # ``or ''`` keeps a page that yields no text (a None or empty
        # result from extract_text) from breaking the concatenation.
        return ''.join(page.extract_text() or '' for page in reader.pages)

def preprocess_text(text):
    """Lower-case ``text`` and delete everything but letters and whitespace.

    Args:
        text: The raw text. ``None`` and float NaN (the value pandas uses
            for a missing cell) are treated as empty text.

    Returns:
        The lower-cased text with every character other than ASCII letters
        and whitespace removed. Removed characters are deleted, not
        replaced by a space.
    """
    # A missing value would otherwise fail on .lower() with AttributeError.
    # ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

def extract_skills(text, skills_list):
    """Return the tokens of ``text`` that are known skills.

    The text is tokenized with ``nltk.word_tokenize``. A token is kept
    when its lower-cased form is in ``skills_list`` and is not an English
    stop word. Tokens are returned in order of appearance, in their
    original casing, and repeated tokens are kept.

    Args:
        text: The text to scan for skills.
        skills_list: Collection of lower-case skill words to match against.

    Returns:
        A list of the matching tokens.
    """
    tokens = nltk.word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    matched = []
    for token in tokens:
        lowered = token.lower()
        if lowered not in skills_list:
            continue
        if lowered in english_stop_words:
            continue
        matched.append(token)
    return matched

# Load the skills vocabulary from CSV: every whitespace-separated word of the
# 'Skills' column, lower-cased and collected into a set.
skills_data = pd.read_csv('skills.csv')
# dropna() after explode() removes the NaN entries produced by missing,
# non-string or empty cells, so the set holds only real words.
skills_list = set(
    skills_data['Skills'].str.lower().str.split().explode().dropna().tolist()
)

# Load job postings from CSV
job_data = pd.read_csv('indeed_data.csv')  # Update with your file name and path
# A posting without a description cannot be preprocessed or vectorized (a NaN
# cell is a float, not a string), so drop such rows and renumber the index.
job_data = job_data.dropna(subset=['description']).reset_index(drop=True)
job_descriptions = job_data['description'].tolist()

# Clean every job description with preprocess_text
preprocessed_job_descriptions = list(map(preprocess_text, job_descriptions))

# Build a bag-of-words count matrix from the cleaned descriptions,
# ignoring English stop words
vectorizer = CountVectorizer(stop_words='english')
X_job = vectorizer.fit_transform(preprocessed_job_descriptions)

# Group the job descriptions with Agglomerative Clustering
num_clusters = 5  # You can adjust the number of clusters as needed
agglomerative = AgglomerativeClustering(n_clusters=num_clusters)
# The count matrix is converted with toarray() before being clustered;
# the per-description cluster labels are read back from the fitted estimator.
agglomerative.fit(X_job.toarray())
job_clusters = agglomerative.labels_

# Hold out 20% of the descriptions (with their cluster labels) for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_job_descriptions,
    job_clusters,
    test_size=0.2,
    random_state=42,
)

# Fit a fresh vocabulary on the training split only and reuse it for the
# test split (this replaces the vectorizer fitted on the full corpus)
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a Support Vector Classifier to predict the cluster of a description
classifier = SVC().fit(X_train_vec, y_train)
predicted_clusters = classifier.predict(X_test_vec)

# Report accuracy on the held-out split
accuracy = accuracy_score(y_test, predicted_clusters)
print("Accuracy:", accuracy)

# Report weighted-average precision, recall and F1-score
precision, recall, f1 = (
    metric(y_test, predicted_clusters, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Read the resume PDF, clean its text and pull out the known skills
resume_file = 'resume_yash.pdf'  # Replace with your PDF file path
resume_text = extract_text_from_pdf(resume_file)
preprocessed_resume = preprocess_text(resume_text)
resume_skills = extract_skills(preprocessed_resume, skills_list)

# Vectorize the resume with the classifier's vocabulary and let the
# classifier choose the cluster it belongs to
resume_vec = vectorizer.transform([preprocessed_resume])
predicted_cluster = classifier.predict(resume_vec)[0]

# Keep only the job postings that were assigned to the predicted cluster
predicted_jobs = job_data[job_clusters == predicted_cluster]

print("Predicted Cluster:", predicted_cluster)
print("Resume Skills:", resume_skills)
print("Predicted Jobs:")

# (title, link, cosine similarity to the resume) for each job in the cluster
job_similarities = []
if not predicted_jobs.empty:
    # Vectorize all descriptions of the cluster in one call and compare them
    # to the resume in one cosine_similarity call, instead of transforming
    # and comparing row by row.
    cluster_vecs = vectorizer.transform(
        [preprocess_text(desc) for desc in predicted_jobs['description']]
    )
    # Row 0 holds the resume's similarity to each job, in row order.
    similarities = cosine_similarity(resume_vec, cluster_vecs)[0]
    job_similarities = list(
        zip(predicted_jobs['title'], predicted_jobs['link'], similarities)
    )

# Sort jobs by cosine similarity (highest first) and print the top 5
top_5_jobs = sorted(job_similarities, key=lambda x: x[2], reverse=True)[:5]
print("Top 5 Jobs:")
for title, url, similarity in top_5_jobs:
    print("Job Title:", title)
    print("Job URL:", url)
    print("Cosine Similarity:", similarity)
    print()
