PHASE 1: CLEANING & PREPROCESSING

In [None]:
import pandas as pd
import numpy as np

# Read the job-role table and the resume table; both CSVs are decoded as latin1.
job_df = pd.read_csv('IT_Job_Roles_Skills.csv', encoding='latin1')
resume_df = pd.read_csv('UpdatedResumeDataSet.csv', encoding='latin1')

# Structure summary and first rows of each table, jobs first.
for frame in (job_df, resume_df):
    print(frame.info())
    print(frame.head())

# Per-column count of missing values, jobs first.
for frame in (job_df, resume_df):
    print(frame.isnull().sum())

In [None]:
def clean_skills(skill_str):
    """Split a comma-separated skills cell into normalised skill names.

    Args:
        skill_str: Raw cell value; a missing value (None/NaN) is allowed.

    Returns:
        A list of stripped, lower-cased skill strings in their original
        order. Missing input gives an empty list. Blank entries (produced
        by trailing or doubled commas) are dropped, because an empty skill
        would later match every resume as a substring.
    """
    if pd.isna(skill_str):
        return []
    # str() keeps a non-string cell (e.g. a number) from crashing on .split
    tokens = (token.strip().lower() for token in str(skill_str).split(','))
    return [token for token in tokens if token]

# Parse each job's comma-separated skills into a list of normalised names.
job_df['Skills_clean'] = job_df['Skills'].apply(clean_skills)

# Similarly clean Job Titles and Descriptions
job_df['Job Title'] = job_df['Job Title'].str.strip().str.lower()
job_df['Job Description'] = job_df['Job Description'].fillna('').str.lower()

# Drop resumes without text. The explicit copy makes the column assignment
# below operate on an independent frame rather than on a dropna() result.
resume_df = resume_df.dropna(subset=['Resume']).copy()  # Assuming 'Resume' column name
resume_df['Resume'] = resume_df['Resume'].str.lower()

# The resume table only gets a 'Skills_clean' column when it ships a raw
# 'Skills' column; remember that so the summary below never reads a column
# that was not created.
has_resume_skills = 'Skills' in resume_df.columns
if has_resume_skills:
    resume_df['Skills_clean'] = resume_df['Skills'].apply(clean_skills)

# Summary stats (set comprehensions avoid the quadratic sum(list_of_lists, []))
print("Unique skills in jobs:",
      {skill for skills in job_df['Skills_clean'] for skill in skills})
if has_resume_skills:
    print("Unique skills in resumes:",
          {skill for skills in resume_df['Skills_clean'] for skill in skills})
else:
    print("Unique skills in resumes: not available (resume data has no 'Skills' column)")


In [None]:

# Normalise the job title text and blank out missing descriptions.
stripped_titles = job_df['Job Title'].str.strip()
job_df['Job Title'] = stripped_titles.str.lower()
job_df['Job Description'] = job_df['Job Description'].fillna('')

# Vocabulary: every distinct skill listed by at least one job.
all_job_skills = set()
for skills_list in job_df['Skills_clean']:
    all_job_skills.update(skills_list)

def extract_skills_from_resume(resume_text):
    """Return the job-vocabulary skills that are mentioned in a resume.

    Args:
        resume_text: Resume text; a missing value (None/NaN) is allowed.

    Returns:
        A list of skills from the module-level ``all_job_skills`` set that
        occur in the text as whole terms, in alphabetical order. Missing
        input gives an empty list.
    """
    import re  # local import keeps this function self-contained

    if pd.isna(resume_text):
        return []

    resume_text = str(resume_text).lower()
    found_skills = []

    # Sorted iteration makes the result order reproducible; iterating the
    # set directly would let the order vary from run to run.
    for skill in sorted(all_job_skills):
        # Cheap substring pre-filter; an empty skill would match any text.
        if not skill or skill not in resume_text:
            continue
        # Require a non-word character (or the text edge) on both sides so
        # "java" is not reported for "javascript". Lookarounds are used
        # instead of \b so skills ending in symbols, e.g. "c++", still match.
        if re.search(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', resume_text):
            found_skills.append(skill)

    return found_skills

def _summarise_top_skills(skills):
    """Join the first five extracted skills, or report that none matched."""
    if len(skills) > 0:
        return ', '.join(skills[:5])
    return "No matching skills found"


# Skills found in each resume, how many there are, and a short text summary.
extracted_skills = resume_df['Resume'].apply(extract_skills_from_resume)
resume_df['Skills_extracted'] = extracted_skills
resume_df['Skills_count'] = resume_df['Skills_extracted'].apply(len)
resume_df['Top_skill_areas'] = resume_df['Skills_extracted'].apply(_summarise_top_skills)

print(f"Jobs dataset shape: {job_df.shape}")
print(f"Resume dataset shape: {resume_df.shape}")
print("\nSample of extracted skills from resumes:")
preview_columns = ['Category', 'Skills_extracted', 'Skills_count']
print(resume_df[preview_columns].head())

PHASE 2: FEATURE ENGINEERING

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sentence_transformers import SentenceTransformer

import scipy.sparse as sparse

# Fixed, alphabetically ordered vocabulary: position i is column i of every
# skill vector built below.
skill_vocabulary = sorted(all_job_skills)
# Skill -> column lookup, so each skill costs one dict access instead of a
# linear scan of the vocabulary list.
skill_index = {skill: column for column, skill in enumerate(skill_vocabulary)}
print(f"Total unique skills in vocabulary: {len(skill_vocabulary)}")

def create_skill_vector(skills_list):
    """Encode a list of skills as a 1 x len(skill_vocabulary) sparse row.

    Args:
        skills_list: Iterable of skill names; names outside the vocabulary
            are ignored.

    Returns:
        A ``scipy.sparse.csr_matrix`` of dtype int8 with a 1 in the column
        of every recognised skill. Repeated skills are counted once and the
        column indices are stored in ascending order.
    """
    n_columns = len(skill_vocabulary)
    # The set removes repeated skills, which would otherwise become
    # duplicate column entries in the CSR row.
    indices = sorted({skill_index[skill] for skill in skills_list if skill in skill_index})
    if not indices:
        return sparse.csr_matrix((1, n_columns), dtype=np.int8)

    data = np.ones(len(indices), dtype=np.int8)
    indptr = np.array([0, len(indices)])
    return sparse.csr_matrix((data, indices, indptr), shape=(1, n_columns))

# Apply vectorization
# Stack one sparse skill row per job / per resume into two matrices.
job_skill_rows = list(map(create_skill_vector, job_df['Skills_clean']))
job_skill_vectors = sparse.vstack(job_skill_rows)
resume_skill_rows = list(map(create_skill_vector, resume_df['Skills_extracted']))
resume_skill_vectors = sparse.vstack(resume_skill_rows)

print(f"Job skill vectors shape: {job_skill_vectors.shape}")
print(f"Resume skill vectors shape: {resume_skill_vectors.shape}")

# Jobs are one-hot encoded by category only when that column is present.
if 'Category' in job_df.columns:
    job_encoder = OneHotEncoder(sparse_output=True)
    job_category_frame = job_df[['Category']]
    job_categories = job_encoder.fit_transform(job_category_frame)
    print(f"Job categories encoded shape: {job_categories.shape}")

# Resumes are always one-hot encoded by their 'Category' column.
resume_encoder = OneHotEncoder(sparse_output=True)
resume_category_frame = resume_df[['Category']]
resume_categories = resume_encoder.fit_transform(resume_category_frame)
print(f"Resume categories encoded shape: {resume_categories.shape}")

print("Creating semantic embeddings with Sentence Transformer...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed only a small random sample of each table. The sample size is capped
# by BOTH tables so that .sample() never asks for more rows than exist.
sample_size = min(50, len(job_df), len(resume_df))
sample_jobs = job_df.sample(sample_size)
sample_resumes = resume_df.sample(sample_size)

# Encode the sampled job descriptions and resume texts.
sample_job_embeddings = model.encode(sample_jobs['Job Description'].tolist(),
                                    show_progress_bar=True,
                                    batch_size=16)
sample_resume_embeddings = model.encode(sample_resumes['Resume'].tolist(),
                                       show_progress_bar=True,
                                       batch_size=16)

print(f"Job embedding dimensions: {sample_job_embeddings.shape}")
print(f"Resume embedding dimensions: {sample_resume_embeddings.shape}")

def _count_words(text):
    """Number of whitespace-separated tokens in the string form of *text*."""
    return len(str(text).split())


# Simple size statistics for each resume.
resume_df['Text_Length'] = resume_df['Resume'].str.len()
resume_df['Word_Count'] = resume_df['Resume'].apply(_count_words)
# Clip the denominator at 1 so a resume with no words does not divide by zero.
words_at_least_one = resume_df['Word_Count'].clip(lower=1)
resume_df['Skills_Density'] = resume_df['Skills_count'] / words_at_least_one

for summary_line in (
    "\nFeature engineering complete!",
    "Features created:",
    "1. Sparse skill vectors for efficient representation",
    "2. Category encodings for both resumes and jobs",
    "3. Semantic embeddings using Sentence Transformer",
    "4. Text statistics (length, word count, skills density)",
):
    print(summary_line)

PHASE 3: PAIR GENERATION AND MATCH SCORING FOR JOB-RESUME MATCHING

In [None]:
import random
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
# Create pairs of resume-job data with similarity metrics

n_resumes = len(resume_df)
n_jobs = len(job_df)

max_pairs = 10000
total_pairs = n_resumes * n_jobs
# Guard the division: an empty table means there is nothing to sample from.
sample_ratio = min(1.0, max_pairs / total_pairs) if total_pairs else 1.0

pairs = []

resume_indices = list(range(n_resumes))
job_indices = list(range(n_jobs))

if sample_ratio < 1.0:
    # More candidate pairs than the budget: draw max_pairs DISTINCT pairs.
    # Each flat number k stands for the pair (k // n_jobs, k % n_jobs), so
    # sampling flat numbers without replacement cannot repeat a pair.
    flat_choices = random.sample(range(total_pairs), max_pairs)
    selected_pairs = [divmod(flat, n_jobs) for flat in flat_choices]
else:
    # Every possible pair fits within the budget, so use all of them.
    selected_pairs = [(i, j) for i in resume_indices for j in job_indices]

# Compute the similarity features and the heuristic match score per pair.
for resume_idx, job_idx in selected_pairs:
    resume_row = resume_df.iloc[resume_idx]
    job_row = job_df.iloc[job_idx]

    resume_skills = set(resume_row['Skills_extracted'])
    job_skills = set(job_row['Skills_clean'])
    shared_skills = resume_skills & job_skills
    common_skills_count = len(shared_skills)

    # Jaccard overlap of the two skill sets; 0.0 when either side is empty.
    if resume_skills and job_skills:
        jaccard_similarity = common_skills_count / len(resume_skills | job_skills)
    else:
        jaccard_similarity = 0.0

    resume_vec = resume_skill_vectors[resume_idx]
    job_vec = job_skill_vectors[job_idx]

    # Cosine similarity of the sparse skill rows; 0.0 when either row is all zeros.
    if resume_vec.nnz != 0 and job_vec.nnz != 0:
        skill_vector_similarity = cosine_similarity(resume_vec, job_vec)[0][0]
    else:
        skill_vector_similarity = 0.0

    # Constant placeholder: no content similarity is computed in this loop.
    content_similarity = 0.0

    # Weighted blend: 0.3 Jaccard + 0.2 share of job skills covered + 0.5 cosine.
    job_skill_coverage = common_skills_count / max(1, len(job_skills))
    match_score = (0.3 * jaccard_similarity +
                   0.2 * job_skill_coverage +
                   0.5 * skill_vector_similarity)

    pairs.append({
        'resume_idx': resume_idx,
        'job_idx': job_idx,
        'resume_category': resume_row['Category'],
        'job_title': job_row['Job Title'],
        'jaccard_similarity': jaccard_similarity,
        'common_skills_count': common_skills_count,
        'content_similarity': content_similarity,
        'skill_vector_similarity': skill_vector_similarity,
        'match_score': match_score,
        'skills_density': resume_row['Skills_Density'],
        'resume_skills_count': len(resume_skills),
        'job_skills_count': len(job_skills),
    })

# One row per evaluated resume-job pair.
pairs_df = pd.DataFrame(pairs)

def categorize_match(score):
    """Map a numeric match score to a coarse match-level label.

    Args:
        score: Match score (the scores built in this notebook lie in [0, 1]).

    Returns:
        ``float('nan')`` when the score is exactly 0 or is itself missing
        (NaN/None); otherwise one of "Very Low" (< 0.1), "Low" (< 0.25),
        "Medium" (< 0.5), "High" (< 0.75) or "Very High" (>= 0.75).
    """
    # A NaN score fails every "<" comparison below and would otherwise fall
    # through to "Very High", so treat it like the unlabelled zero case.
    if pd.isna(score) or score == 0:
        return float('nan')
    if score < 0.1:
        return "Very Low"
    if score < 0.25:
        return "Low"
    if score < 0.5:
        return "Medium"
    if score < 0.75:
        return "High"
    return "Very High"

# Label every pair with its match level and store the labels as a categorical.
match_levels = pairs_df['match_score'].apply(categorize_match)
pairs_df['match_level'] = match_levels.astype('category')

# Index and numeric feature columns only.
model_columns = [
    'resume_idx', 'job_idx', 'jaccard_similarity',
    'common_skills_count', 'content_similarity',
    'skill_vector_similarity', 'match_score',
]
pairs_df_model = pairs_df[model_columns]

print(f"Created {len(pairs_df)} resume-job pairs for analysis")
print(f"Match level distribution:\n{pairs_df['match_level'].value_counts(dropna=False)}")
print(f"Match score statistics:\n{pairs_df['match_score'].describe()}")

PHASE 4: MODEL TRAINING

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import joblib
import matplotlib.pyplot as plt

print("Preparing features for model training...")

# Work on a copy so the pair table itself is left untouched.
model_data = pairs_df.copy()

# Derived features: absolute gap between the two skill counts, and the share
# of the job's skills covered by the resume (denominator clipped to >= 1).
skill_count_gap = model_data['resume_skills_count'] - model_data['job_skills_count']
model_data['skill_count_diff'] = skill_count_gap.abs()
job_skills_at_least_one = model_data['job_skills_count'].clip(lower=1)
model_data['skill_count_ratio'] = model_data['common_skills_count'] / job_skills_at_least_one

features = [
    'jaccard_similarity',
    'common_skills_count',
    'skill_vector_similarity',
    'skill_count_diff',
    'skill_count_ratio',
    'skills_density',
]

# Keep only rows where every model feature is present.
model_data = model_data.dropna(subset=features)

X = model_data[features].to_numpy()
y = model_data['match_score'].to_numpy()

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training models with hyperparameter tuning...")

# Random forest: 3-fold grid search scored by negative MSE.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)
rf = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
rf_grid.fit(X_train_scaled, y_train)

# Gradient boosting: same search setup with its own grid.
gb_params = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)
gb = GradientBoostingRegressor(random_state=42)
gb_grid = GridSearchCV(gb, gb_params, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
gb_grid.fit(X_train_scaled, y_train)

# Best estimator of each search, scored on the held-out split.
rf_best, gb_best = rf_grid.best_estimator_, gb_grid.best_estimator_
rf_pred = rf_best.predict(X_test_scaled)
gb_pred = gb_best.predict(X_test_scaled)

rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))

print(f"Random Forest RMSE: {rf_rmse:.4f}")
print(f"Gradient Boosting RMSE: {gb_rmse:.4f}")

# Lower test RMSE wins; a tie goes to the random forest.
random_forest_wins = rf_rmse <= gb_rmse
if random_forest_wins:
    best_model, best_pred = rf_best, rf_pred
    print("Random Forest selected as best model")
else:
    best_model, best_pred = gb_best, gb_pred
    print("Gradient Boosting selected as best model")

# Feature importance analysis: pair each feature name with the selected
# model's importance value and list the most important first.
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

# Evaluation metrics of the selected model on the held-out test split.
print("\nModel Evaluation:")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, best_pred)):.4f}")
print(f"MAE: {mean_absolute_error(y_test, best_pred):.4f}")
print(f"R² Score: {r2_score(y_test, best_pred):.4f}")

# Visualize predictions vs actual values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, best_pred, alpha=0.5)
# Red dashed diagonal from (0, 0) to (1, 1): predicted equals actual on this line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('Actual Match Score')
plt.ylabel('Predicted Match Score')
plt.title('Predicted vs Actual Match Scores')
plt.tight_layout()
# Save before show() so the image is written from the still-open figure.
plt.savefig('match_score_prediction.png')
plt.show()

# Save the model for future use, together with the fitted scaler and the
# ordered feature-name list.
joblib.dump(best_model, 'resume_job_matching_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
joblib.dump(features, 'model_features.pkl')

print("\nModel saved as 'resume_job_matching_model.pkl'")
print("Feature scaler saved as 'feature_scaler.pkl'")
print("Training complete!")

PHASE 5: TESTING MODEL PERFORMANCE AND VISUALIZATION

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import joblib
import matplotlib.pyplot as plt

# Reload the persisted regressor, feature scaler and feature-name list, in
# that order. Note that `model` is rebound here: from this point on it is the
# regressor, not the SentenceTransformer created during feature engineering.
artifact_paths = (
    'resume_job_matching_model.pkl',
    'feature_scaler.pkl',
    'model_features.pkl',
)
model, scaler, features = map(joblib.load, artifact_paths)

def predict_match_score(resume_idx, job_idx):
    """Predict match score between a resume and job.

    Args:
        resume_idx: Positional row index into ``resume_df`` (and into
            ``resume_skill_vectors``).
        job_idx: Positional row index into ``job_df`` (and into
            ``job_skill_vectors``).

    Returns:
        dict with the resume category, the job title, the model's predicted
        ``match_score``, the list of ``common_skills``, and the
        ``jaccard_similarity`` and ``skill_vector_similarity`` features.

    Raises:
        KeyError: if the loaded ``features`` list names a feature this
            function does not compute.
    """
    resume_row = resume_df.iloc[resume_idx]
    job_row = job_df.iloc[job_idx]

    resume_skills = set(resume_row['Skills_extracted'])
    job_skills = set(job_row['Skills_clean'])
    # Computed once and reused for the count, the Jaccard ratio and the result.
    common_skills = resume_skills & job_skills
    common_skills_count = len(common_skills)

    # Jaccard overlap; stays 0.0 when either skill set is empty.
    jaccard_similarity = 0.0
    if resume_skills and job_skills:
        jaccard_similarity = common_skills_count / len(resume_skills | job_skills)

    resume_vec = resume_skill_vectors[resume_idx]
    job_vec = job_skill_vectors[job_idx]

    # Cosine similarity of the sparse rows; stays 0.0 when either row is all zeros.
    skill_vector_similarity = 0.0
    if not (resume_vec.nnz == 0 or job_vec.nnz == 0):
        skill_vector_similarity = cosine_similarity(resume_vec, job_vec)[0][0]

    feature_values = {
        'jaccard_similarity': jaccard_similarity,
        'common_skills_count': common_skills_count,
        'skill_vector_similarity': skill_vector_similarity,
        'skill_count_diff': abs(len(resume_skills) - len(job_skills)),
        'skill_count_ratio': common_skills_count / max(1, len(job_skills)),
        'skills_density': resume_row['Skills_Density'],
    }

    # Order the columns by the persisted feature list, so the vector always
    # lines up with what the scaler and model were fitted on instead of
    # relying on a hand-maintained ordering here.
    X = np.array([[feature_values[name] for name in features]])

    X_scaled = scaler.transform(X)

    match_score = model.predict(X_scaled)[0]

    return {
        'resume_category': resume_row['Category'],
        'job_title': job_row['Job Title'],
        'match_score': match_score,
        'common_skills': list(common_skills),
        'jaccard_similarity': jaccard_similarity,
        'skill_vector_similarity': skill_vector_similarity
    }

# Fixed seed so the same 100 random resume-job pairs are drawn on every run.
np.random.seed(42)
test_size = 100
test_pairs = []

for _ in range(test_size):
    # Draw the resume index first, then the job index.
    resume_idx = np.random.randint(0, len(resume_df))
    job_idx = np.random.randint(0, len(job_df))

    resume_row = resume_df.iloc[resume_idx]
    job_row = job_df.iloc[job_idx]
    resume_skills = set(resume_row['Skills_extracted'])
    job_skills = set(job_row['Skills_clean'])
    shared_skills = resume_skills & job_skills
    common_skills_count = len(shared_skills)

    # Jaccard overlap; 0.0 when either skill set is empty.
    if resume_skills and job_skills:
        jaccard_similarity = common_skills_count / len(resume_skills | job_skills)
    else:
        jaccard_similarity = 0.0

    resume_vec = resume_skill_vectors[resume_idx]
    job_vec = job_skill_vectors[job_idx]

    # Cosine similarity of the sparse rows; 0.0 when either row is all zeros.
    if resume_vec.nnz != 0 and job_vec.nnz != 0:
        skill_vector_similarity = cosine_similarity(resume_vec, job_vec)[0][0]
    else:
        skill_vector_similarity = 0.0

    # Reference score: the same weighted blend used to build the pair table.
    job_skill_coverage = common_skills_count / max(1, len(job_skills))
    actual_score = (0.3 * jaccard_similarity +
                    0.2 * job_skill_coverage +
                    0.5 * skill_vector_similarity)

    prediction = predict_match_score(resume_idx, job_idx)
    predicted_score = prediction['match_score']

    test_pairs.append({
        'resume_idx': resume_idx,
        'job_idx': job_idx,
        'resume_category': resume_row['Category'],
        'job_title': job_row['Job Title'],
        'actual_score': actual_score,
        'predicted_score': predicted_score,
        'error': predicted_score - actual_score,
    })

test_df = pd.DataFrame(test_pairs)

# Error metrics over the sampled pairs (error = predicted - actual).
errors = test_df['error']
actual_scores = test_df['actual_score']
mae = np.mean(np.abs(errors))
rmse = np.sqrt(np.mean(np.square(errors)))
residual_sum_squares = np.sum(np.square(errors))
total_sum_squares = np.sum(np.square(actual_scores - np.mean(actual_scores)))
r2 = 1 - (residual_sum_squares / total_sum_squares)

print(f"Model Performance on Test Set:")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Figure 1: predicted vs. actual match score for the sampled pairs.
plt.figure(figsize=(10, 6))
plt.scatter(test_df['actual_score'], test_df['predicted_score'], alpha=0.7)
# Red dashed diagonal from (0, 0) to (1, 1): predicted equals actual on this line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('Actual Match Score')
plt.ylabel('Predicted Match Score')
plt.title('Model Prediction Accuracy: Actual vs Predicted Match Scores')
plt.grid(True, alpha=0.3)

# Metric summary box; transAxes places it in axes-relative coordinates
# (0.05, 0.95), independent of the data range.
plt.text(0.05, 0.95, f"MAE: {mae:.4f}\nRMSE: {rmse:.4f}\nR²: {r2:.4f}", 
         transform=plt.gca().transAxes, bbox=dict(facecolor='white', alpha=0.8))

plt.tight_layout()
# Save before show() so the image is written from the still-open figure.
plt.savefig('model_prediction_accuracy.png')
plt.show()

# Figure 2: histogram (with KDE curve) of the signed prediction errors.
plt.figure(figsize=(10, 6))
sns.histplot(test_df['error'], bins=20, kde=True)
# Vertical reference line at zero error.
plt.axvline(x=0, color='r', linestyle='--')
plt.xlabel('Prediction Error (Predicted - Actual)')
plt.ylabel('Frequency')
plt.title('Distribution of Prediction Errors')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('error_distribution.png')
plt.show()

# Mean and standard deviation of the signed error per resume category,
# ordered by mean error.
category_errors = test_df.groupby('resume_category')['error'].agg(['mean', 'std']).sort_values('mean')

# Create the axes explicitly and hand them to pandas: DataFrame.plot opens a
# figure of its own when no `ax` is given, which would leave a separately
# created figure blank and showing as an empty extra plot.
fig, ax = plt.subplots(figsize=(12, 8))
category_errors.plot(kind='bar', y='mean', yerr='std', capsize=4, ax=ax)
# Horizontal reference line at zero error.
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Mean Prediction Error by Resume Category')
plt.ylabel('Mean Error')
plt.xticks(rotation=90)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('category_error_analysis.png')
plt.show()

# Pair each loaded feature name with the loaded model's importance value,
# most important first.
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

# Horizontal bar chart: one bar per feature, bar length = importance.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature')
plt.title('Feature Importance in Match Score Prediction')
plt.grid(True, alpha=0.3)
plt.tight_layout()
# Save before show() so the image is written from the still-open figure.
plt.savefig('feature_importance.png')
plt.show()

print("\nTest Summary Statistics:")
score_columns = ['actual_score', 'predicted_score', 'error']
print(test_df[score_columns].describe())

print("\nExample Job-Resume Matches:")
# The five sampled pairs with the highest predicted score.
top_matches = test_df.nlargest(5, 'predicted_score')
for match in top_matches.to_dict('records'):
    print(f"\nResume Category: {match['resume_category']}")
    print(f"Job Title: {match['job_title']}")
    print(f"Predicted Match Score: {match['predicted_score']:.4f}")
    print(f"Actual Match Score: {match['actual_score']:.4f}")

    # Look the two skill sets up again to list (at most five) shared skills.
    resume_row = resume_df.iloc[int(match['resume_idx'])]
    job_row = job_df.iloc[int(match['job_idx'])]
    resume_skills = set(resume_row['Skills_extracted'])
    job_skills = set(job_row['Skills_clean'])
    common = resume_skills & job_skills
    if not common:
        print("No common skills found")
    else:
        print(f"Common Skills: {', '.join(list(common)[:5])}")