# AI-Assisted Recruitment System

This notebook implements an AI-powered recruitment system that matches job postings with resumes using natural language processing and machine learning techniques.

## Features:
- Job posting analysis and cleaning
- Resume parsing and skill extraction
- Intelligent job-resume matching
- Scoring and ranking system
- Interactive matching interface


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import re
import warnings
from collections import Counter
from typing import List, Dict, Tuple, Optional, Union

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Natural Language Processing
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder

# BERT and Transformers
try:
    from transformers import AutoTokenizer, AutoModel
    from sentence_transformers import SentenceTransformer
    import torch
    BERT_AVAILABLE = True
except ImportError:
    BERT_AVAILABLE = False

# Web Framework
try:
    from flask import Flask, request, jsonify
    from flask_cors import CORS
    FLASK_AVAILABLE = True
except ImportError:
    FLASK_AVAILABLE = False

# Additional utilities
import json
import pickle
from datetime import datetime
import os
from pathlib import Path

# Configuration
# NOTE: this silences every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

# Download NLTK data
# Best-effort: each resource is attempted independently so one failure does not
# skip the others, and failures are reported instead of being silently dropped.
# Catching Exception (not a bare except) lets KeyboardInterrupt still stop the cell.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    try:
        nltk.download(_nltk_resource, quiet=True)
    except Exception as e:
        print(f"NLTK resource '{_nltk_resource}' could not be downloaded: {e}")

print(f"Libraries loaded. BERT: {BERT_AVAILABLE}, Flask: {FLASK_AVAILABLE}")


Libraries loaded. BERT: True, Flask: True


In [2]:
# Ensure optional flags exist
# Keep whatever value an earlier cell assigned; otherwise fall back to False.
DATABASE_AVAILABLE = globals().get('DATABASE_AVAILABLE', False)


In [3]:
# Load datasets
# The three CSV files are read from the relative Dataset/ folder.
print("Loading datasets...")
df_jobs = pd.read_csv('Dataset/data job posts.csv')
df_resumes = pd.read_csv('Dataset/Resume.csv')
df_cleaned = pd.read_csv('Dataset/updated_data_final_cleaned.csv')

# Report (rows, columns) for each frame.
for _dataset_label, _dataset_frame in (
    ("Job Posts", df_jobs),
    ("Resume", df_resumes),
    ("Cleaned", df_cleaned),
):
    print(f"{_dataset_label} Dataset: {_dataset_frame.shape}")


Loading datasets...
Job Posts Dataset: (19001, 24)
Resume Dataset: (2484, 4)
Cleaned Dataset: (32481, 3)


In [4]:
# Explore job posts dataset
# Prints shape, column names, total count of missing cells, and a three-row sample.
if df_jobs is None:
    print("No job data loaded")
else:
    print(f"Job Posts: {df_jobs.shape}")
    print(f"Columns: {list(df_jobs.columns)}")
    print(f"Missing values: {df_jobs.isnull().sum().sum()}")
    print("\nSample data:")
    print(df_jobs[['Title', 'Company', 'Location']].head(3))


Job Posts: (19001, 24)
Columns: ['jobpost', 'date', 'Title', 'Company', 'AnnouncementCode', 'Term', 'Eligibility', 'Audience', 'StartDate', 'Duration', 'Location', 'JobDescription', 'JobRequirment', 'RequiredQual', 'Salary', 'ApplicationP', 'OpeningDate', 'Deadline', 'Notes', 'AboutC', 'Attach', 'Year', 'Month', 'IT']
Missing values: 137017

Sample data:
                                                      Title  \
0                                   Chief Financial Officer   
1  Full-time Community Connections Intern (paid internship)   
2                                       Country Coordinator   

                                           Company  \
0             AMERIA Investment Consulting Company   
1  International Research & Exchanges Board (IREX)   
2        Caucasus Environmental NGO Network (CENN)   

                                                                                              Location  
0                                                                   

In [5]:
# Explore resume dataset
# Shape, columns, per-column null counts and the category distribution,
# followed by a rich display of the first three rows.
print("=== RESUME DATASET ===")
print(f"Shape: {df_resumes.shape}")
print(f"\nColumns: {list(df_resumes.columns)}")
print("\nMissing values:")
print(df_resumes.isnull().sum())
print("\nResume categories:")
print(df_resumes['Category'].value_counts())
print("\nFirst few rows:")
df_resumes.head(3)


=== RESUME DATASET ===
Shape: (2484, 4)

Columns: ['ID', 'Resume_str', 'Resume_html', 'Category']

Missing values:
ID             0
Resume_str     0
Resume_html    0
Category       0
dtype: int64

Resume categories:
Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

First few rows:


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR Summary Dedicated Cu...,"<div class=""fontsize fontface vmargins hmargins linespacing pagesize"" id=""document""> <div class=...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS Summary Versatile media professional with ba...","<div class=""fontsize fontface vmargins hmargins linespacing pagesize"" id=""document""> <div class=...",HR
2,33176873,"HR DIRECTOR Summary Over 20 years experience in recruiting, 15 plus years ...","<div class=""fontsize fontface vmargins hmargins linespacing pagesize"" id=""document""> <div class=...",HR


In [6]:
# Load spaCy model
# When the model package is missing, `nlp` is set to None so later cells can
# test for it and skip the spaCy-based steps.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = None
    print("spaCy model not found. Please install it using: python -m spacy download en_core_web_sm")
else:
    print("spaCy model loaded successfully")


spaCy model loaded successfully


In [7]:
# Initialize BERT models
# All three handles end up as None when the libraries are missing or loading fails.
if not BERT_AVAILABLE:
    bert_model = bert_tokenizer = bert_model_direct = None
else:
    try:
        # Sentence-level encoder used by the matching functions in this notebook.
        bert_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Raw BERT tokenizer/model pair, moved to the GPU when one is available.
        bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        bert_model_direct = AutoModel.from_pretrained('bert-base-uncased')
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        bert_model_direct.to(device)
        print(f"‚úÖ BERT models loaded on {device}")
    except Exception as e:
        print(f"‚ùå BERT loading failed: {e}")
        bert_model = bert_tokenizer = bert_model_direct = None

# BERT Functions
def get_bert_embeddings(texts, model_type='sentence_transformer'):
    """Encode texts with the notebook's sentence-transformer model.

    Args:
        texts: A string or a collection of strings. Sized collections such as
            lists and NumPy arrays are checked for emptiness by length.
        model_type: Only 'sentence_transformer' is handled; any other value
            yields an empty array.

    Returns:
        The result of ``bert_model.encode(texts, convert_to_tensor=False)``,
        or an empty ``np.array([])`` when BERT is unavailable, the model is
        not loaded, the input is empty, or encoding fails.
    """
    if not BERT_AVAILABLE or texts is None:
        return np.array([])

    # Check emptiness by length: ``not texts`` raises ValueError for
    # multi-element NumPy arrays and pandas Series.
    try:
        is_empty = len(texts) == 0
    except TypeError:
        # Unsized input (e.g. a generator): let the encoder deal with it.
        is_empty = False
    if is_empty:
        return np.array([])

    if model_type != 'sentence_transformer' or bert_model is None:
        return np.array([])

    try:
        return bert_model.encode(texts, convert_to_tensor=False)
    except Exception as e:
        print(f"BERT embedding error: {e}")
        return np.array([])

def calculate_bert_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity as a plain Python float, or 0.0 when BERT is
        unavailable, the model is not loaded, or encoding fails.
    """
    if not BERT_AVAILABLE or bert_model is None:
        return 0.0

    try:
        embeddings = bert_model.encode([text1, text2])
        score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
        return float(score)
    except Exception as e:
        # Report the failure; a silent 0.0 is indistinguishable from
        # "the texts are unrelated".
        print(f"BERT similarity error: {e}")
        return 0.0

def extract_skills_bert(text, threshold=0.7, skill_keywords=None):
    """Return the skill keywords whose embedding is close to the text's embedding.

    The text is encoded once and all keywords are encoded in a single batch,
    rather than re-encoding the full text for every keyword.

    Args:
        text: Text to inspect; it is lower-cased before encoding.
        threshold: Minimum cosine similarity for a keyword to be kept.
        skill_keywords: Optional iterable of candidate skills. Defaults to the
            built-in list below.

    Returns:
        Matching keywords without duplicates, in candidate-list order. An empty
        list when the text is empty, BERT is unavailable, the model is not
        loaded, or encoding fails.
    """
    if not text or not BERT_AVAILABLE or bert_model is None:
        return []

    if skill_keywords is None:
        skill_keywords = [
            'python', 'java', 'javascript', 'react', 'angular', 'vue', 'node.js',
            'machine learning', 'deep learning', 'artificial intelligence', 'ai',
            'data science', 'data analysis', 'statistics', 'sql', 'database',
            'aws', 'azure', 'docker', 'kubernetes', 'git', 'github',
            'project management', 'agile', 'scrum', 'leadership', 'communication',
            'marketing', 'sales', 'finance', 'accounting', 'human resources',
            'design', 'ui', 'ux', 'photoshop', 'illustrator', 'figma',
            'mobile development', 'ios', 'android', 'swift', 'kotlin',
            'web development', 'frontend', 'backend', 'full stack', 'devops'
        ]

    # De-duplicate while preserving order so the result is deterministic.
    candidates = list(dict.fromkeys(skill_keywords))
    if not candidates:
        return []

    try:
        text_embedding = bert_model.encode([text.lower()])
        skill_embeddings = bert_model.encode(candidates)
        similarities = cosine_similarity(text_embedding, skill_embeddings).flatten()
        return [
            skill
            for skill, score in zip(candidates, similarities)
            if score >= threshold
        ]
    except Exception as e:
        print(f"BERT skill extraction error: {e}")
        return []


‚úÖ BERT models loaded on cpu


In [8]:
# Setup caching system
import os
from pathlib import Path

# Folder (relative to the working directory) that holds the preprocessed frames.
CACHE_DIR = Path("cache")
CACHE_DIR.mkdir(exist_ok=True)

# Pickle files written and read by the preprocessing cell.
CACHE_JOBS = CACHE_DIR.joinpath("df_jobs_clean.pkl")
CACHE_RESUMES = CACHE_DIR.joinpath("df_resumes_clean.pkl")

# Show where the cache lives and whether each file is already present.
print(f"üì¶ Cache directory: {CACHE_DIR.absolute()}")
print(f"   Jobs cache: {CACHE_JOBS.exists()}")
print(f"   Resumes cache: {CACHE_RESUMES.exists()}")


üì¶ Cache directory: /Users/sadmanrahin/Documents/btt_AI/cache
   Jobs cache: False
   Resumes cache: False


In [9]:
# BERT Matching Functions
def find_best_matches_bert(job_index, df_jobs, df_resumes, top_n=5):
    """Rank resumes against one job by cosine similarity of sentence embeddings.

    Args:
        job_index: Positional index of the job row in ``df_jobs``.
        df_jobs: Job frame with a 'CleanText' column.
        df_resumes: Resume frame with 'CleanText', 'ID', 'Category' and
            'Resume_str' columns.
        top_n: Maximum number of matches to return; zero or negative values
            yield an empty frame.

    Returns:
        DataFrame with columns Rank, Resume_ID, Category, BERT_Similarity_Score
        and Resume_Text (first 200 characters plus '...'), best match first.
        An empty DataFrame when BERT is unavailable or an error occurs.
    """
    if not BERT_AVAILABLE or bert_model is None:
        return pd.DataFrame()

    try:
        # Clamp first: slicing with ``[-top_n:]`` would return every row for top_n == 0.
        top_k = max(int(top_n), 0)
        if top_k == 0:
            return pd.DataFrame()

        job_text = df_jobs.iloc[job_index]['CleanText']
        resume_texts = df_resumes['CleanText'].tolist()

        job_embedding = bert_model.encode([job_text])
        resume_embeddings = bert_model.encode(resume_texts)

        similarities = cosine_similarity(job_embedding, resume_embeddings).flatten()
        # Descending order, then keep the first top_k positions.
        top_indices = similarities.argsort()[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices, start=1):
            resume = df_resumes.iloc[idx]
            results.append({
                'Rank': rank,
                'Resume_ID': resume['ID'],
                'Category': resume['Category'],
                'BERT_Similarity_Score': similarities[idx],
                'Resume_Text': resume['Resume_str'][:200] + '...'
            })

        return pd.DataFrame(results)
    except Exception as e:
        print(f"BERT matching error: {e}")
        return pd.DataFrame()

def enhanced_matching_bert(job_index, df_jobs, df_resumes, top_n=5, skill_weight=0.3, bert_weight=0.7):
    """Rank resumes by a weighted mix of embedding similarity and skill overlap.

    The skill score of a resume is the fraction of the job's skills that also
    appear in the resume's skills (0 when either side has none).

    Args:
        job_index: Positional index of the job row in ``df_jobs``.
        df_jobs: Job frame with 'CleanText' and 'Skills' columns.
        df_resumes: Resume frame with 'CleanText', 'Skills', 'ID', 'Category'
            and 'Resume_str' columns.
        top_n: Maximum number of matches to return; zero or negative values
            yield an empty frame.
        skill_weight: Weight applied to the skill-overlap score.
        bert_weight: Weight applied to the embedding similarity.

    Returns:
        DataFrame with columns Rank, Resume_ID, Category, BERT_Similarity_Score,
        Skill_Overlap_Score, Combined_Score and Resume_Text, best match first.
        An empty DataFrame when BERT is unavailable or an error occurs.
    """
    if not BERT_AVAILABLE or bert_model is None:
        return pd.DataFrame()

    try:
        # Clamp first: slicing with ``[-top_n:]`` would return every row for top_n == 0.
        top_k = max(int(top_n), 0)
        if top_k == 0:
            return pd.DataFrame()

        job = df_jobs.iloc[job_index]
        job_text = job['CleanText']
        job_skills = set(job['Skills']) if job['Skills'] else set()

        job_embedding = bert_model.encode([job_text])
        resume_texts = df_resumes['CleanText'].tolist()
        resume_embeddings = bert_model.encode(resume_texts)

        bert_similarities = cosine_similarity(job_embedding, resume_embeddings).flatten()

        # One overlap score per resume, in row order.
        skill_scores = []
        for raw_skills in df_resumes['Skills']:
            resume_skills = set(raw_skills) if raw_skills else set()
            if job_skills and resume_skills:
                skill_scores.append(len(job_skills & resume_skills) / len(job_skills))
            else:
                skill_scores.append(0)

        combined_scores = bert_weight * bert_similarities + skill_weight * np.array(skill_scores)
        # Descending order, then keep the first top_k positions.
        top_indices = combined_scores.argsort()[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices, start=1):
            resume = df_resumes.iloc[idx]
            results.append({
                'Rank': rank,
                'Resume_ID': resume['ID'],
                'Category': resume['Category'],
                'BERT_Similarity_Score': bert_similarities[idx],
                'Skill_Overlap_Score': skill_scores[idx],
                'Combined_Score': combined_scores[idx],
                'Resume_Text': resume['Resume_str'][:200] + '...'
            })

        return pd.DataFrame(results)
    except Exception as e:
        print(f"Enhanced BERT matching error: {e}")
        return pd.DataFrame()

def search_jobs_by_keywords_bert(keywords, df_jobs, top_n=5):
    """Rank job posts against a free-text query using sentence embeddings.

    Args:
        keywords: Query text; it is passed through ``clean_text`` before encoding.
        df_jobs: Job frame with 'CleanText', 'Title', 'Company', 'Location'
            and 'JobDescription' columns.
        top_n: Maximum number of jobs to return; zero or negative values yield
            an empty frame.

    Returns:
        DataFrame with columns Rank, Title, Company, Location,
        BERT_Similarity_Score and Description (first 200 characters plus '...',
        or 'N/A' when the description is missing), best match first. An empty
        DataFrame when BERT is unavailable or an error occurs.
    """
    if not BERT_AVAILABLE or bert_model is None:
        return pd.DataFrame()

    try:
        # Clamp first: slicing with ``[-top_n:]`` would return every row for top_n == 0.
        top_k = max(int(top_n), 0)
        if top_k == 0:
            return pd.DataFrame()

        clean_keywords = clean_text(keywords)
        keyword_embedding = bert_model.encode([clean_keywords])
        job_texts = df_jobs['CleanText'].tolist()
        job_embeddings = bert_model.encode(job_texts)

        similarities = cosine_similarity(keyword_embedding, job_embeddings).flatten()
        # Descending order, then keep the first top_k positions.
        top_indices = similarities.argsort()[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices, start=1):
            job = df_jobs.iloc[idx]
            description = job['JobDescription']
            results.append({
                'Rank': rank,
                'Title': job['Title'],
                'Company': job['Company'],
                'Location': job['Location'],
                'BERT_Similarity_Score': similarities[idx],
                'Description': description[:200] + '...' if pd.notna(description) else 'N/A'
            })

        return pd.DataFrame(results)
    except Exception as e:
        print(f"BERT job search error: {e}")
        return pd.DataFrame()


In [21]:
# Text preprocessing functions
def clean_text(text):
    """Normalise raw text to lower-case letters separated by single spaces.

    Missing values (as detected by ``pd.isna``) become an empty string.
    Anything between '<' and '>' on one line is dropped as markup, every
    character other than a-z or whitespace is replaced by a space, and
    whitespace runs are collapsed.
    """
    if pd.isna(text):
        return ""

    normalised = str(text).lower()
    # Applied in order: markup, non-letter characters, whitespace runs.
    for pattern in (r'<.*?>', r'[^a-z\s]', r'\s+'):
        normalised = re.sub(pattern, ' ', normalised)
    return normalised.strip()

def extract_skills(text, nlp_model):
    """Collect candidate skill terms from text using part-of-speech tags.

    A token is kept when it is tagged NOUN or PROPN, is not a stop word, is
    longer than two characters and is purely alphabetic. Its lower-cased lemma
    is recorded.

    Args:
        text: Text to analyse.
        nlp_model: Callable pipeline that turns text into tokens exposing
            ``pos_``, ``is_stop``, ``text`` and ``lemma_``.

    Returns:
        Unique lemmas in sorted order, so results are reproducible across
        kernel restarts. An empty list when the model or the text is missing.
    """
    if not nlp_model or not text:
        return []

    candidate_lemmas = {
        token.lemma_.lower()
        for token in nlp_model(text)
        if token.pos_ in ('NOUN', 'PROPN')
        and not token.is_stop
        and len(token.text) > 2
        and token.text.isalpha()
    }
    # Sorting removes the dependence on set iteration order.
    return sorted(candidate_lemmas)

def lemmatize_text(text, nlp_model):
    """Return the space-joined lemmas of the alphabetic, non-stop-word tokens.

    An empty string is returned when either the model or the text is missing.
    """
    if not nlp_model or not text:
        return ""

    kept_lemmas = []
    for token in nlp_model(text):
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

# TF-IDF matching function
def find_best_matches(job_index, resume_tfidf, job_tfidf, df_resumes, top_n=5):
    """Rank resumes against one job by cosine similarity of TF-IDF vectors.

    Args:
        job_index: Row index of the job in ``job_tfidf``.
        resume_tfidf: TF-IDF matrix with one row per resume, aligned with
            ``df_resumes`` by position.
        job_tfidf: TF-IDF matrix with one row per job.
        df_resumes: Resume frame with 'ID', 'Category' and 'Resume_str' columns.
        top_n: Maximum number of matches to return; zero or negative values
            yield an empty frame.

    Returns:
        DataFrame with columns Rank, Resume_ID, Category, Similarity_Score and
        Resume_Text (first 200 characters plus '...'), best match first. An
        empty DataFrame when an error occurs.
    """
    try:
        # Clamp first: slicing with ``[-top_n:]`` would return every row for top_n == 0.
        top_k = max(int(top_n), 0)
        if top_k == 0:
            return pd.DataFrame()

        job_vector = job_tfidf[job_index]
        similarities = cosine_similarity(job_vector, resume_tfidf).flatten()
        # Descending order, then keep the first top_k positions.
        top_indices = similarities.argsort()[::-1][:top_k]

        results = []
        for rank, idx in enumerate(top_indices, start=1):
            resume = df_resumes.iloc[idx]
            results.append({
                'Rank': rank,
                'Resume_ID': resume['ID'],
                'Category': resume['Category'],
                'Similarity_Score': similarities[idx],
                'Resume_Text': resume['Resume_str'][:200] + '...'
            })

        return pd.DataFrame(results)
    except Exception as e:
        print(f"TF-IDF matching error: {e}")
        return pd.DataFrame()


In [22]:
# Data preprocessing with caching
# Produces df_jobs_clean and df_resumes_clean, either by loading the two pickle
# files under cache/ or by rebuilding them from df_jobs / df_resumes.
# `cache_available` records which path was taken.
if CACHE_JOBS.exists() and CACHE_RESUMES.exists():
    print("üì¶ Loading preprocessed data from cache...")
    try:
        # NOTE(review): read_pickle can execute code from the file; only load caches this notebook wrote.
        df_jobs_clean = pd.read_pickle(CACHE_JOBS)
        df_resumes_clean = pd.read_pickle(CACHE_RESUMES)
        print(f"‚úÖ Loaded from cache: {len(df_jobs_clean)} jobs, {len(df_resumes_clean)} resumes")
        print("üí° To reprocess, delete cache/ folder or the .pkl files")
    except Exception as e:
        # A cache that cannot be read falls through to the rebuild branch below.
        print(f"‚ùå Error loading cache: {e}")
        print("üîÑ Reprocessing data...")
        cache_available = False
    else:
        cache_available = True
else:
    cache_available = False
    print("üì¶ No cache found. Processing data...")

# Process data if cache not available
if not cache_available and df_jobs is not None and df_resumes is not None:
    # Use subset for testing (remove this line for full processing)
    # The cache written below reflects these limits; delete it after changing them.
    df_jobs_subset = df_jobs.head(1000)  # Process only first 1000 jobs
    df_resumes_subset = df_resumes.head(5000)  # Process only first 5000 resumes
    
    print(f"Processing subset: {len(df_jobs_subset)} jobs, {len(df_resumes_subset)} resumes")
    
    # Clean job posts
    # Keep six columns, drop rows lacking a title or description, then
    # renumber rows from 0.
    job_columns = ['Title', 'Company', 'Location', 'JobDescription', 'JobRequirment', 'RequiredQual']
    df_jobs_clean = df_jobs_subset[job_columns].copy()
    df_jobs_clean = df_jobs_clean.dropna(subset=['Title', 'JobDescription'])
    df_jobs_clean = df_jobs_clean.reset_index(drop=True)
    
    # Create combined text
    # Title, description, requirements and qualifications joined with spaces;
    # missing fields contribute an empty string.
    df_jobs_clean['CombinedText'] = (
        df_jobs_clean['Title'].fillna('') + ' ' +
        df_jobs_clean['JobDescription'].fillna('') + ' ' +
        df_jobs_clean['JobRequirment'].fillna('') + ' ' +
        df_jobs_clean['RequiredQual'].fillna('')
    )
    
    # Clean text
    print("Cleaning job text...")
    df_jobs_clean['CleanText'] = df_jobs_clean['CombinedText'].apply(clean_text)
    
    # Extract skills
    # Without a spaCy model, Skills is an empty list per row and
    # LemmatizedText is a copy of CleanText.
    if nlp:
        print("Extracting job skills...")
        df_jobs_clean['Skills'] = df_jobs_clean['CleanText'].apply(lambda x: extract_skills(x, nlp))
        df_jobs_clean['LemmatizedText'] = df_jobs_clean['CleanText'].apply(lambda x: lemmatize_text(x, nlp))
    else:
        df_jobs_clean['Skills'] = [[] for _ in range(len(df_jobs_clean))]
        df_jobs_clean['LemmatizedText'] = df_jobs_clean['CleanText']
    
    # Clean resumes
    # Same three derived columns as for jobs, built from Resume_str.
    df_resumes_clean = df_resumes_subset.copy()
    print("Cleaning resume text...")
    df_resumes_clean['CleanText'] = df_resumes_clean['Resume_str'].apply(clean_text)
    
    if nlp:
        print("Extracting resume skills...")
        df_resumes_clean['Skills'] = df_resumes_clean['CleanText'].apply(lambda x: extract_skills(x, nlp))
        df_resumes_clean['LemmatizedText'] = df_resumes_clean['CleanText'].apply(lambda x: lemmatize_text(x, nlp))
    else:
        df_resumes_clean['Skills'] = [[] for _ in range(len(df_resumes_clean))]
        df_resumes_clean['LemmatizedText'] = df_resumes_clean['CleanText']
    
    # Save to cache
    print("üíæ Saving preprocessed data to cache...")
    df_jobs_clean.to_pickle(CACHE_JOBS)
    df_resumes_clean.to_pickle(CACHE_RESUMES)
    print(f"‚úÖ Saved to cache: {len(df_jobs_clean)} jobs, {len(df_resumes_clean)} resumes")
    
elif cache_available:
    print("‚úÖ Using cached preprocessed data")
elif df_jobs is None or df_resumes is None:
    print("‚ùå No data to process")

üì¶ Loading preprocessed data from cache...
‚úÖ Loaded from cache: 822 jobs, 2484 resumes
üí° To reprocess, delete cache/ folder or the .pkl files
‚úÖ Using cached preprocessed data


In [23]:
# Create TF-IDF vectors
if 'df_jobs_clean' not in locals() or 'df_resumes_clean' not in locals():
    print("‚ùå Cleaned data not available")
else:
    print("Creating TF-IDF vectors...")

    # Jobs first, then resumes: both share one vocabulary and the combined
    # matrix can be split back by row position.
    all_texts = [*df_jobs_clean['LemmatizedText'], *df_resumes_clean['LemmatizedText']]

    # Unigrams and bigrams, capped at 5000 features; terms must occur in at
    # least 2 documents and in no more than 80% of them.
    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.8,
    )
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    # The first n_jobs rows are job posts; the remainder are resumes.
    n_jobs = len(df_jobs_clean)
    job_tfidf, resume_tfidf = tfidf_matrix[:n_jobs], tfidf_matrix[n_jobs:]

    print(f"‚úÖ TF-IDF matrix created: {tfidf_matrix.shape}")


Creating TF-IDF vectors...
‚úÖ TF-IDF matrix created: (3306, 5000)


In [None]:
# Test BERT vs TF-IDF Performance Comparison
# Banner printed once when this cell starts running.
print("üß™ Testing BERT vs TF-IDF performance...")

def compare_matching_methods(job_index=0, top_n=5):
    """Run the TF-IDF, BERT and enhanced-BERT matchers on one job and print the results.

    ``job_index`` addresses a row of ``df_jobs_clean`` (and of ``job_tfidf``),
    which is the frame the matchers operate on.

    Args:
        job_index: Positional index of the job in ``df_jobs_clean``.
        top_n: Number of matches requested from each method.

    Returns:
        Dict with keys 'tfidf_matches', 'bert_matches' and
        'enhanced_bert_matches' holding each method's result frame (empty when
        the method failed), or None when the datasets are not loaded.
    """
    if df_jobs is None or df_resumes is None:
        print("‚ùå Datasets not loaded. Please run the data loading cell first.")
        return

    # Read the title from the cleaned frame: the raw frame was filtered and
    # re-indexed during preprocessing, so the same position there can be a
    # different job.
    job_title = df_jobs_clean.iloc[job_index]['Title']
    print(f"üîç Comparing matching methods for job: {job_title}")
    print("=" * 80)

    # Test TF-IDF method (original)
    print("üìä Testing TF-IDF method...")
    try:
        tfidf_matches = find_best_matches(job_index, resume_tfidf, job_tfidf, df_resumes_clean, top_n)
        print("‚úÖ TF-IDF method completed")
    except Exception as e:
        print(f"‚ùå TF-IDF method failed: {str(e)}")
        tfidf_matches = pd.DataFrame()

    # Test BERT method
    print("\nüß† Testing BERT method...")
    try:
        bert_matches = find_best_matches_bert(job_index, df_jobs_clean, df_resumes_clean, top_n)
        print("‚úÖ BERT method completed")
    except Exception as e:
        print(f"‚ùå BERT method failed: {str(e)}")
        bert_matches = pd.DataFrame()

    # Test Enhanced BERT method
    print("\nüöÄ Testing Enhanced BERT method...")
    try:
        enhanced_bert_matches = enhanced_matching_bert(job_index, df_jobs_clean, df_resumes_clean, top_n)
        print("‚úÖ Enhanced BERT method completed")
    except Exception as e:
        print(f"‚ùå Enhanced BERT method failed: {str(e)}")
        enhanced_bert_matches = pd.DataFrame()

    # Display results; a method that produced nothing is skipped.
    print("\nüìã RESULTS COMPARISON:")
    print("=" * 80)

    if not tfidf_matches.empty:
        print("\nüî§ TF-IDF Results:")
        print(tfidf_matches[['Rank', 'Resume_ID', 'Category', 'Similarity_Score']].to_string(index=False))

    if not bert_matches.empty:
        print("\nüß† BERT Results:")
        print(bert_matches[['Rank', 'Resume_ID', 'Category', 'BERT_Similarity_Score']].to_string(index=False))

    if not enhanced_bert_matches.empty:
        print("\nüöÄ Enhanced BERT Results:")
        print(enhanced_bert_matches[['Rank', 'Resume_ID', 'Category', 'BERT_Similarity_Score', 'Skill_Overlap_Score', 'Combined_Score']].to_string(index=False))

    return {
        'tfidf_matches': tfidf_matches,
        'bert_matches': bert_matches,
        'enhanced_bert_matches': enhanced_bert_matches
    }

def test_skill_extraction_comparison():
    """Print spaCy and BERT skill extractions for the first cleaned job, side by side.

    When spaCy produced at least one skill, the overlap count and Jaccard
    similarity between the two skill sets are printed as well.
    """
    if df_jobs is None or not BERT_AVAILABLE:
        print("‚ùå Datasets not loaded or BERT not available.")
        return

    print("üîç Comparing skill extraction methods...")
    print("=" * 60)

    # Test on a sample job from cleaned data
    has_clean_jobs = 'df_jobs_clean' in globals() and len(globals()['df_jobs_clean']) > 0
    if not has_clean_jobs:
        print("‚ùå Cleaned job data not available")
        return

    sample_job = df_jobs_clean.iloc[0]
    job_text = sample_job['CleanText']

    print(f"üìã Sample Job: {sample_job['Title']}")
    print(f"üìù Job Text: {job_text[:200]}...")

    # spaCy skill extraction (skipped when no model is loaded)
    spacy_skills = None
    if nlp:
        spacy_skills = extract_skills(job_text, nlp)
        print(f"\nüî§ spaCy Skills ({len(spacy_skills)}): {spacy_skills[:10]}")

    # BERT skill extraction
    bert_skills = extract_skills_bert(job_text)
    print(f"\nüß† BERT Skills ({len(bert_skills)}): {bert_skills}")

    # Compare only when spaCy returned something, so the union is non-empty.
    if not (nlp and spacy_skills):
        return

    spacy_set, bert_set = set(spacy_skills), set(bert_skills)
    overlap = len(spacy_set & bert_set)
    union = len(spacy_set | bert_set)

    print(f"\nüìä Skill Extraction Comparison:")
    print(f"   spaCy skills: {len(spacy_skills)}")
    print(f"   BERT skills: {len(bert_skills)}")
    print(f"   Overlap: {overlap}")
    print(f"   Jaccard similarity: {overlap/union:.3f}")

# Run comparison tests
print("üöÄ Running BERT integration tests...")

# Test matching methods
# Keep the returned dict of result frames so it can be inspected afterwards.
comparison_results = compare_matching_methods(job_index=0, top_n=5)

# Test skill extraction
# Prints its findings and returns nothing.
test_skill_extraction_comparison()

print("\n‚úÖ BERT integration testing completed!")


üß™ Testing BERT vs TF-IDF performance...
üöÄ Running BERT integration tests...
üîç Comparing matching methods for job: Chief Financial Officer
üìä Testing TF-IDF method...
‚úÖ TF-IDF method completed

üß† Testing BERT method...
‚úÖ BERT method completed

üöÄ Testing Enhanced BERT method...
‚úÖ Enhanced BERT method completed

üìã RESULTS COMPARISON:

üî§ TF-IDF Results:
 Rank  Resume_ID Category  Similarity_Score
    1   12071138  FINANCE          0.438454
    2   19234823 ADVOCATE          0.431891
    3   18636651  FINANCE          0.426520
    4   17392859  FINANCE          0.401932
    5   84356308  FINANCE          0.398916

üß† BERT Results:
 Rank  Resume_ID Category  BERT_Similarity_Score
    1   17392859  FINANCE               0.752344
    2   15891494  FINANCE               0.743289
    3   14722634  FINANCE               0.737250
    4   26767199  FINANCE               0.730880
    5   38441665  FINANCE               0.723948

üöÄ Enhanced BERT Results:
 Rank  Resum

In [25]:
# Test the system
# Quick smoke test: one similarity call and one spaCy skill extraction.
print("üß™ Testing system...")

# Test BERT similarity
# Skipped when the BERT libraries could not be imported.
if BERT_AVAILABLE:
    similarity = calculate_bert_similarity("python developer", "software engineer")
    print(f"BERT similarity test: {similarity:.3f}")

# Test skill extraction
# Skipped when no spaCy model is loaded.
if nlp:
    skills = extract_skills("I know Python and machine learning", nlp)
    print(f"Skills extracted: {skills}")

print("‚úÖ System test completed")


üß™ Testing system...
BERT similarity test: 0.494
Skills extracted: ['machine', 'learning', 'python']
‚úÖ System test completed


In [26]:
# Simple comparison test
def simple_test():
    """Run the TF-IDF and BERT matchers on job 0 and print each top match.

    Each matcher is exercised independently; an exception from one is printed
    and does not prevent the other from running.
    """
    datasets_loaded = df_jobs is not None and df_resumes is not None
    if not datasets_loaded:
        print("‚ùå Datasets not loaded")
        return

    print("üß™ Simple comparison test...")

    # TF-IDF matcher
    try:
        tfidf_results = find_best_matches(0, resume_tfidf, job_tfidf, df_resumes_clean, 3)
        print(f"‚úÖ TF-IDF: Found {len(tfidf_results)} matches")
        if len(tfidf_results.index) > 0 and len(tfidf_results.columns) > 0:
            best_tfidf = tfidf_results.iloc[0]
            print("Top TF-IDF match:", best_tfidf['Resume_ID'])
    except Exception as e:
        print(f"‚ùå TF-IDF error: {e}")

    # BERT matcher; also lists the result columns
    try:
        bert_results = find_best_matches_bert(0, df_jobs_clean, df_resumes_clean, 3)
        print(f"‚úÖ BERT: Found {len(bert_results)} matches")
        if len(bert_results.index) > 0 and len(bert_results.columns) > 0:
            best_bert = bert_results.iloc[0]
            print("Top BERT match:", best_bert['Resume_ID'])
            print("Available columns:", list(bert_results.columns))
    except Exception as e:
        print(f"‚ùå BERT error: {e}")

    print("‚úÖ Test completed")

# Run the test
# Executed immediately so the cell output shows both matchers' top hits.
simple_test()


üß™ Simple comparison test...
‚úÖ TF-IDF: Found 3 matches
Top TF-IDF match: 12071138
‚úÖ BERT: Found 3 matches
Top BERT match: 17392859
Available columns: ['Rank', 'Resume_ID', 'Category', 'BERT_Similarity_Score', 'Resume_Text']
‚úÖ Test completed


In [27]:
# Check data availability
# Reports which of the notebook's main variables exist in the current kernel.
print("üîç Checking data availability...")

_have_jobs = 'df_jobs' in locals() and df_jobs is not None
_have_resumes = 'df_resumes' in locals() and df_resumes is not None
_have_jobs_clean = 'df_jobs_clean' in locals()
_have_resumes_clean = 'df_resumes_clean' in locals()

print(f"df_jobs: {'‚úÖ Available' if _have_jobs else '‚ùå Not available'}")
print(f"df_resumes: {'‚úÖ Available' if _have_resumes else '‚ùå Not available'}")
print(f"df_jobs_clean: {'‚úÖ Available' if _have_jobs_clean else '‚ùå Not available'}")
print(f"df_resumes_clean: {'‚úÖ Available' if _have_resumes_clean else '‚ùå Not available'}")
print(f"job_tfidf: {'‚úÖ Available' if 'job_tfidf' in locals() else '‚ùå Not available'}")
print(f"resume_tfidf: {'‚úÖ Available' if 'resume_tfidf' in locals() else '‚ùå Not available'}")

# Shapes of the raw frames
if _have_jobs:
    print("\nOriginal data shapes:")
    print(f"  Jobs: {df_jobs.shape}")
    print(f"  Resumes: {df_resumes.shape if _have_resumes else 'N/A'}")

# Shapes of the preprocessed frames
if _have_jobs_clean:
    print("\nCleaned data shapes:")
    print(f"  Jobs: {df_jobs_clean.shape}")
    print(f"  Resumes: {df_resumes_clean.shape if _have_resumes_clean else 'N/A'}")

print("\nüí° If cleaned data is not available, run the data preprocessing cell (Cell 9) first!")


üîç Checking data availability...
df_jobs: ‚úÖ Available
df_resumes: ‚úÖ Available
df_jobs_clean: ‚úÖ Available
df_resumes_clean: ‚úÖ Available
job_tfidf: ‚úÖ Available
resume_tfidf: ‚úÖ Available

Original data shapes:
  Jobs: (19001, 24)
  Resumes: (2484, 4)

Cleaned data shapes:
  Jobs: (822, 10)
  Resumes: (2484, 7)

üí° If cleaned data is not available, run the data preprocessing cell (Cell 9) first!


In [28]:
# BERT Integration Demo and Usage Guide
# Title banner for this cell's output, underlined with 50 '=' characters.
print("üéØ BERT Integration Demo")
print("=" * 50)

def demo_bert_features():
    """
    Print a short tour of the BERT features: the feature list, the similarity
    of a few phrase pairs, and the skills extracted from a sample sentence.
    Stops after the feature list when BERT is not available.
    """
    intro_lines = (
        "üöÄ Welcome to the Enhanced AI Recruitment System with BERT!",
        "\nüìã Available Features:",
        "1. üß† BERT-based semantic matching",
        "2. üîç Enhanced skill extraction",
        "3. üìä Performance comparison (BERT vs TF-IDF)",
        "4. üéØ Improved job-resume matching accuracy",
    )
    for line in intro_lines:
        print(line)

    if not BERT_AVAILABLE:
        print("\n‚ùå BERT is not available. Please install transformers and torch.")
        return

    print("\n‚úÖ BERT Status: Available")
    # `device` only exists when the BERT initialisation cell completed successfully.
    print(f"üñ•Ô∏è Device: {device if 'device' in globals() else 'CPU'}")

    # Demo BERT similarity calculation
    print("\nüß™ Demo: BERT Semantic Similarity")
    print("-" * 40)

    phrase_pairs = (
        ("machine learning engineer", "data scientist"),
        ("python developer", "software engineer"),
        ("project manager", "team lead"),
        ("marketing specialist", "sales representative"),
    )
    for text1, text2 in phrase_pairs:
        similarity = calculate_bert_similarity(text1, text2)
        print(f"'{text1}' ‚Üî '{text2}': {similarity:.3f}")

    # Demo skill extraction
    print("\nüîç Demo: BERT Skill Extraction")
    print("-" * 40)

    sample_text = "I have experience with Python, machine learning, AWS, and project management using agile methodologies."
    skills = extract_skills_bert(sample_text)
    print(f"Text: {sample_text}")
    print(f"Extracted skills: {skills}")

def usage_guide():
    """
    Print a usage guide for the BERT features: setup steps, the available
    functions, a copy-pasteable example, and performance tips.
    """
    print("\nüìñ USAGE GUIDE")
    print("=" * 50)

    print("\nüîß How to use BERT features:")
    print("1. Load your datasets using the data loading cell")
    print("2. Initialize BERT models using the BERT initialization cell")
    print("3. Use the following functions:")

    print("\nüìã Available Functions:")
    print("‚Ä¢ find_best_matches_bert() - BERT-based resume matching")
    print("‚Ä¢ enhanced_matching_bert() - BERT + skill overlap matching")
    print("‚Ä¢ search_jobs_by_keywords_bert() - BERT-based job search")
    print("‚Ä¢ extract_skills_bert() - BERT-based skill extraction")
    print("‚Ä¢ calculate_bert_similarity() - Semantic similarity between texts")
    print("‚Ä¢ compare_matching_methods() - Compare BERT vs TF-IDF")

    print("\nüí° Example Usage:")
    print("```python")
    print("# Find best resumes for a job using BERT")
    # The job index is passed positionally: a keyword argument followed by
    # positional ones is a SyntaxError, so the example must not use job_index=0.
    print("bert_matches = find_best_matches_bert(0, df_jobs_clean, df_resumes_clean)")
    print("")
    print("# Search jobs using semantic understanding")
    print("job_results = search_jobs_by_keywords_bert('machine learning python', df_jobs_clean)")
    print("")
    print("# Extract skills using BERT")
    print("skills = extract_skills_bert('I know Python and machine learning')")
    print("```")

    print("\n‚ö° Performance Tips:")
    print("‚Ä¢ BERT is slower than TF-IDF but more accurate")
    print("‚Ä¢ Use batch processing for large datasets")
    print("‚Ä¢ Consider using GPU for faster processing")
    print("‚Ä¢ Adjust similarity thresholds based on your needs")

def system_status():
    """
    Print a status report for the recruitment system.

    Reports whether the datasets are loaded (``df_jobs`` / ``df_resumes`` are
    not None), which optional components are flagged as available
    (``BERT_AVAILABLE``, ``nlp``, ``FLASK_AVAILABLE``, ``DATABASE_AVAILABLE``),
    basic dataset counts when both datasets are present, and which matching
    methods can be used. All of those names are read from the notebook's
    global namespace. Returns None.
    """
    def mark(ok):
        # Glyph shown next to each capability: "yes" when truthy, "no" otherwise.
        return '‚úÖ' if ok else '‚ùå'

    print("\nüìä SYSTEM STATUS")
    print("=" * 50)

    datasets_loaded = df_jobs is not None and df_resumes is not None
    print("üì¶ Datasets loaded: " + mark(datasets_loaded))
    print("üß† BERT available: " + mark(BERT_AVAILABLE))
    print("üî§ spaCy available: " + mark(nlp is not None))
    print("üåê Flask available: " + mark(FLASK_AVAILABLE))
    print("üóÑÔ∏è Database available: " + mark(DATABASE_AVAILABLE))

    # Counts are only meaningful once both frames exist.
    if datasets_loaded:
        print("\nüìà Dataset Statistics:")
        print("   Job posts: {:,}".format(len(df_jobs)))
        print("   Resumes: {:,}".format(len(df_resumes)))
        print("   Resume categories:", df_resumes['Category'].nunique())

    print("\nüéØ Matching Methods Available:")
    print("   TF-IDF + Cosine Similarity: ‚úÖ")
    # Both BERT-based methods hinge on the same availability flag.
    bert_mark = mark(BERT_AVAILABLE)
    print("   BERT Semantic Matching: " + bert_mark)
    print("   Enhanced BERT + Skills: " + bert_mark)

# Run the full BERT walkthrough: feature demo, usage guide, then system status
if __name__ == "__main__":
    demo_bert_features()
    usage_guide()
    system_status()

    # Closing banner, emitted as two lines by a single print call.
    print(
        "\nüéâ BERT Integration Complete!",
        "Your AI recruitment system now has advanced semantic understanding capabilities!",
        sep="\n",
    )


üéØ BERT Integration Demo
üöÄ Welcome to the Enhanced AI Recruitment System with BERT!

üìã Available Features:
1. üß† BERT-based semantic matching
2. üîç Enhanced skill extraction
3. üìä Performance comparison (BERT vs TF-IDF)
4. üéØ Improved job-resume matching accuracy

‚úÖ BERT Status: Available
üñ•Ô∏è Device: cpu

üß™ Demo: BERT Semantic Similarity
----------------------------------------
'machine learning engineer' ‚Üî 'data scientist': 0.608
'python developer' ‚Üî 'software engineer': 0.494
'project manager' ‚Üî 'team lead': 0.309
'marketing specialist' ‚Üî 'sales representative': 0.598

üîç Demo: BERT Skill Extraction
----------------------------------------
Text: I have experience with Python, machine learning, AWS, and project management using agile methodologies.
Extracted skills: []

üìñ USAGE GUIDE

üîß How to use BERT features:
1. Load your datasets using the data loading cell
2. Initialize BERT models using the BERT initialization cell
3. Use the following fu

In [32]:
# Text preprocessing functions
def clean_text(text):
    """Normalise raw text to lowercase letters separated by single spaces.

    Values that ``pd.isna`` reports as missing yield an empty string. Anything
    else is converted with ``str()``, lowercased, stripped of HTML tags, reduced
    to the characters a-z plus whitespace, and whitespace-collapsed.
    """
    if pd.isna(text):
        return ""

    # Applied in order: drop HTML tags, blank out everything that is not a
    # lowercase letter or whitespace (digits and punctuation included), then
    # squeeze whitespace runs down to a single space.
    substitutions = (
        (r'<.*?>', ' '),
        (r'[^a-z\s]', ' '),
        (r'\s+', ' '),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def extract_skills(text, nlp_model):
    """Extract skills and important keywords from text.

    Runs ``nlp_model`` over ``text`` and keeps the lowercased lemma of every
    token that is tagged ``NOUN`` or ``PROPN``, is not a stop word, is longer
    than two characters and is purely alphabetic.

    Args:
        text: Text to analyse. A falsy value (``""``, ``None``) yields ``[]``.
        nlp_model: Callable that turns ``text`` into an iterable of tokens
            exposing ``pos_``, ``is_stop``, ``text`` and ``lemma_``. A falsy
            value yields ``[]``.

    Returns:
        list[str]: Unique lowercase lemmas in order of first appearance.
    """
    if not nlp_model or not text:
        return []

    doc = nlp_model(text)

    # Nouns and proper nouns are treated as potential skills.
    candidate_lemmas = (
        token.lemma_.lower()
        for token in doc
        if token.pos_ in ('NOUN', 'PROPN')
        and not token.is_stop
        and len(token.text) > 2
        and token.text.isalpha()
    )

    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # list(set(...)) ordering depended on string hash randomisation, so the same
    # input could yield a differently ordered list after every kernel restart.
    return list(dict.fromkeys(candidate_lemmas))

def lemmatize_text(text, nlp_model):
    """Return the space-joined lemmas of the alphabetic, non-stop-word tokens.

    ``nlp_model`` is called on ``text`` and must yield tokens exposing
    ``lemma_``, ``is_stop`` and ``is_alpha``. An empty string is returned when
    either ``nlp_model`` or ``text`` is falsy.
    """
    if nlp_model and text:
        kept_lemmas = (
            tok.lemma_
            for tok in nlp_model(text)
            if not tok.is_stop and tok.is_alpha
        )
        return " ".join(kept_lemmas)
    return ""


In [34]:
# Interactive Job Search Function
def search_jobs_by_keywords(keywords, df_jobs, job_tfidf, vectorizer, top_n=5):
    """Search for jobs by keywords using TF-IDF cosine similarity.

    The keywords are normalised with ``clean_text``, vectorised with
    ``vectorizer.transform`` and compared against ``job_tfidf`` with
    ``cosine_similarity``. The ``top_n`` highest-scoring rows are looked up
    positionally in ``df_jobs``.

    Args:
        keywords: Free-text query.
        df_jobs: Job postings; must provide the columns ``Title``, ``Company``,
            ``Location`` and ``JobDescription``. Rows are addressed with
            ``.iloc`` using row numbers of ``job_tfidf`` (assumes both are in
            the same order - TODO confirm where ``job_tfidf`` is built).
        job_tfidf: Matrix of job vectors passed to ``cosine_similarity``.
        vectorizer: Fitted object exposing ``transform``.
        top_n: Maximum number of results. Values <= 0 yield an empty result.

    Returns:
        pandas.DataFrame with columns ``Rank``, ``Title``, ``Company``,
        ``Location``, ``Similarity_Score`` and ``Description``, best match first.
    """
    columns = ['Rank', 'Title', 'Company', 'Location', 'Similarity_Score', 'Description']

    # Guard first: `[-0:]` slices the WHOLE array, so top_n=0 used to return
    # every job, and a negative top_n produced an arbitrary slice.
    if top_n <= 0:
        return pd.DataFrame(columns=columns)

    # Clean keywords
    clean_keywords = clean_text(keywords)

    # Transform keywords to TF-IDF
    keyword_vector = vectorizer.transform([clean_keywords])

    # Calculate similarity
    similarities = cosine_similarity(keyword_vector, job_tfidf).flatten()

    # Get top matches: argsort is ascending, so take the tail and reverse it.
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Create results
    results = []
    for rank, idx in enumerate(top_indices, start=1):
        job = df_jobs.iloc[idx]
        description = job['JobDescription']
        if pd.notna(description):
            # Only mark the preview with an ellipsis when text was actually cut.
            preview = description[:200] + ('...' if len(description) > 200 else '')
        else:
            preview = 'N/A'
        results.append({
            'Rank': rank,
            'Title': job['Title'],
            'Company': job['Company'],
            'Location': job['Location'],
            'Similarity_Score': similarities[idx],
            'Description': preview,
        })

    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(results, columns=columns)

# Smoke-test the keyword search against the cleaned job postings
print("Testing job search...")
search_results = search_jobs_by_keywords(
    keywords="software developer python",
    df_jobs=df_jobs_clean,
    job_tfidf=job_tfidf,
    vectorizer=vectorizer,
)
# Heading and table are emitted by one print call, separated by a newline.
print("\nJob search results for 'software developer python':", search_results, sep="\n")


Testing job search...

Job search results for 'software developer python':
   Rank                                          Title  \
0     1                  Software Developer/Programmer   
1     2                             Software developer   
2     3  Senior Software Developer (several positions)   
3     4                             Software Developer   
4     5                         Developers Team Leader   

                                       Company          Location  \
0                                      IIG LLC  Yerevan, Armenia   
1                                     Xalt LLC  Yerevan, Armenia   
2                                    ZenteX.AM  Yerevan, Armenia   
3  Synergy International Systems, Inc./Armenia  Yerevan, Armenia   
4                                    Zenteq.am  Yerevan, Armenia   

   Similarity_Score  \
0          0.436298   
1          0.419878   
2          0.412981   
3          0.345645   
4          0.313760   

                            

In [35]:
# Export results
print("Exporting results...")

# Names of the files actually written, so the report below cannot claim an
# export that was skipped because its source variable does not exist.
exported_files = []

# Export cleaned datasets
df_jobs_clean.to_csv('cleaned_job_posts.csv', index=False)
exported_files.append('cleaned_job_posts.csv')
df_resumes_clean.to_csv('cleaned_resumes.csv', index=False)
exported_files.append('cleaned_resumes.csv')

# Export matching results (optional: only present if the matching cells ran)
if 'enhanced_matches' in locals():
    enhanced_matches.to_csv('job_resume_matches.csv', index=False)
    exported_files.append('job_resume_matches.csv')

if 'search_results' in locals():
    search_results.to_csv('job_search_results.csv', index=False)
    exported_files.append('job_search_results.csv')

print("\nExported files:")
print("\n".join(f"- {name}" for name in exported_files))


Exporting results...

Exported files:
- cleaned_job_posts.csv
- cleaned_resumes.csv
- job_resume_matches.csv
- job_search_results.csv


In [36]:
# Summary statistics
print("=== AI RECRUITMENT SYSTEM SUMMARY ===")

# Headline counts taken from the cleaned job and resume frames
print("\nDataset Statistics:")
print("- Total Job Posts:", len(df_jobs_clean))
print("- Total Resumes:", len(df_resumes_clean))
print("- Resume Categories:", df_resumes_clean['Category'].nunique())
print("- Unique Companies:", df_jobs_clean['Company'].nunique())

# Static feature checklist, one entry per line
print(
    "\nSystem Features:",
    "‚úì Job posting analysis and cleaning",
    "‚úì Resume parsing and skill extraction",
    "‚úì TF-IDF based text similarity matching",
    "‚úì Skill-based enhanced matching",
    "‚úì Interactive job search by keywords",
    "‚úì Comprehensive analysis and visualization",
    "‚úì Results export to CSV",
    sep="\n",
)

# Suggested follow-up work, one entry per line
print(
    "\nNext Steps:",
    "1. Run individual cells to test specific features",
    "2. Modify matching parameters for better results",
    "3. Add more sophisticated ML models",
    "4. Create a web interface for the system",
    "5. Implement real-time matching API",
    sep="\n",
)


=== AI RECRUITMENT SYSTEM SUMMARY ===

Dataset Statistics:
- Total Job Posts: 822
- Total Resumes: 2484
- Resume Categories: 24
- Unique Companies: 416

System Features:
‚úì Job posting analysis and cleaning
‚úì Resume parsing and skill extraction
‚úì TF-IDF based text similarity matching
‚úì Skill-based enhanced matching
‚úì Interactive job search by keywords
‚úì Comprehensive analysis and visualization
‚úì Results export to CSV

Next Steps:
1. Run individual cells to test specific features
2. Modify matching parameters for better results
3. Add more sophisticated ML models
4. Create a web interface for the system
5. Implement real-time matching API


In [37]:
# Top-10 candidates for a given job (by index or title substring)
def top_candidates_for_job(job_index=None, title_contains=None, top_n=10):
    """Display and return the best-matching resumes for one job posting.

    The job is selected by its positional index in ``df_jobs_clean`` or, when
    ``job_index`` is None, as the first posting whose title contains
    ``title_contains``. BERT matching (``find_best_matches_bert``) is tried
    first when it is available; if it is unavailable or raises, TF-IDF
    matching (``find_best_matches``) is used instead.

    Reads these notebook globals: ``df_jobs_clean``, ``df_resumes_clean``;
    for BERT ``find_best_matches_bert``, ``BERT_AVAILABLE``, ``bert_model``;
    for TF-IDF ``job_tfidf``, ``resume_tfidf``, ``find_best_matches``.

    Args:
        job_index: Positional (``.iloc``) row number of the job. Takes
            precedence over ``title_contains`` when both are given.
        title_contains: Literal, case-insensitive substring to look for in the
            ``Title`` column. Used only when ``job_index`` is None.
        top_n: Number of candidates requested from the matcher.

    Returns:
        pandas.DataFrame with the columns ``Rank``, ``Resume_ID``,
        ``Category``, ``Score`` and ``Resume_Text``.

    Raises:
        RuntimeError: If the cleaned datasets, or the TF-IDF artifacts needed
            for the fallback, are missing from the notebook namespace.
        ValueError: If neither selector is given, or no title matches.
    """
    namespace = globals()

    # Preconditions
    for name in ('df_jobs_clean', 'df_resumes_clean'):
        if name not in namespace:
            raise RuntimeError(f"{name} not found. Run the preprocessing cells first.")
    if job_index is None and not title_contains:
        raise ValueError("Provide either job_index or title_contains.")

    # Pick job index
    if job_index is None:
        # regex=False: the argument is a plain substring, so titles such as
        # "C++" must not be interpreted (and rejected) as a regular expression.
        mask = df_jobs_clean['Title'].fillna('').str.contains(
            title_contains, case=False, na=False, regex=False
        )
        if not mask.any():
            raise ValueError(f"No job found with title containing: {title_contains}")
        # Position of the first match. idxmax() would return the index LABEL,
        # which is wrong for .iloc (and for the matchers below) whenever the
        # frame does not have a default 0..n-1 index.
        job_index = int(mask.to_numpy().argmax())

    job_row = df_jobs_clean.iloc[job_index]
    print(f"Job [{job_index}] ‚Äî {job_row['Title']} | {job_row.get('Company','N/A')} | {job_row.get('Location','N/A')}")

    # Try BERT first. Globals are looked up defensively so that a missing
    # BERT_AVAILABLE / bert_model falls through to TF-IDF instead of NameError.
    use_bert = (
        'find_best_matches_bert' in namespace
        and bool(namespace.get('BERT_AVAILABLE', False))
        and namespace.get('bert_model') is not None
    )
    results = None
    method = None

    if use_bert:
        try:
            results = find_best_matches_bert(job_index, df_jobs_clean, df_resumes_clean, top_n=top_n)
            results = results[['Rank','Resume_ID','Category','BERT_Similarity_Score','Resume_Text']]
            results = results.rename(columns={'BERT_Similarity_Score':'Score'})
            method = "BERT"
        except Exception as e:
            # Best effort: report the problem and fall back to TF-IDF.
            print(f"BERT matching unavailable: {e}")
            results = None

    # Fallback TF-IDF
    if results is None:
        required_tfidf = ['job_tfidf','resume_tfidf','find_best_matches']
        if not all(r in namespace for r in required_tfidf):
            raise RuntimeError("TF-IDF artifacts missing. Run the TF-IDF vectorization cell first.")
        results = find_best_matches(job_index, resume_tfidf, job_tfidf, df_resumes_clean, top_n=top_n)
        results = results[['Rank','Resume_ID','Category','Similarity_Score','Resume_Text']]
        results = results.rename(columns={'Similarity_Score':'Score'})
        method = "TF-IDF"

    print(f"\nTop {top_n} candidates ({method}):")
    display(results)
    return results

# Example invocations
# Select the job by its positional index:
top10 = top_candidates_for_job(0, top_n=10)

# Alternatively, select the job by a title substring:
# top10 = top_candidates_for_job(title_contains="data scientist", top_n=10)

Job [0] ‚Äî Chief Financial Officer | AMERIA Investment Consulting Company | Yerevan, Armenia

Top 10 candidates (BERT):


Unnamed: 0,Rank,Resume_ID,Category,Score,Resume_Text
0,1,17392859,FINANCE,0.752344,DIRECTOR OF FINANCE Professional Summary Seeking a position in financial/g...
1,2,15891494,FINANCE,0.743289,FINANCE OFFICER Summary Profile: An experience Accountant and data base worker...
2,3,14722634,FINANCE,0.73725,FINANCE DIRECTOR Summary Remarkably astute and analytical professional with ov...
3,4,26767199,FINANCE,0.73088,FINANCE MANAGER Summary Flexible Financial Manager with the ability to mult...
4,5,38441665,FINANCE,0.723948,"FINANCE DIRECTOR Professional Summary Results oriented, dependable and mo..."
5,6,38907798,FINANCE,0.723342,SENIOR FINANCE MANAGER Summary Highly driven finance professional with ove...
6,7,19234823,ADVOCATE,0.721044,FINANCE DIRECTOR Professional Summary To find a new and challenging position t...
7,8,88691367,CONSULTANT,0.716494,CONSULTANT Summary Accomplished and highly skilled Controller with a prove...
8,9,16449850,FINANCE,0.715431,DIRECTOR OF FINANCE Professional Summary Senior financial hospitality execut...
9,10,81677620,FINANCE,0.715356,"FINANCE MANAGER Summary preparing annual budgets, monitoring key accounts ..."
