In [5]:
# ========================
# 1. Install dependencies
# ========================
# `%pip` (rather than `!pip`) installs into the environment of the running kernel;
# `-q` keeps pip's output short.
# NOTE(review): the versions are unpinned, so a re-run at a later date may pull
# different releases. `sentence-transformers` and `tqdm` are installed here but are
# not imported by any cell visible in this notebook -- confirm they are still needed.
%pip install -q sentence-transformers scikit-learn pandas numpy faiss-cpu joblib tqdm


Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:

# ===============================
# Job Role Recommender System
# ===============================

# 1. Install required libraries
# NOTE(review): all four packages are also listed in the `%pip install -q ...`
# cell at the top of the notebook, so this line is redundant once that cell has run.
%pip install pandas scikit-learn faiss-cpu joblib

# -------------------------------
# 2. Imports
# -------------------------------
import os
import pickle
import re

import faiss
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# -------------------------------
# 3. Load dataset
# -------------------------------
# Location of the resume CSV. Set the RESUME_DATASET_PATH environment variable to
# point at your copy; the original hard-coded location is kept as the default so
# existing setups keep working unchanged.
DATA_PATH = os.environ.get("RESUME_DATASET_PATH", r"D:\project\UpdatedResumeDataSet.csv")
df = pd.read_csv(DATA_PATH)

# Fail early with a clear message if the file does not have the two columns
# that every later step reads.
missing_columns = {"Category", "Resume"} - set(df.columns)
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}")

print("Dataset shape:", df.shape)
print(df.head())

# -------------------------------
# 4. Preprocessing
# -------------------------------
def clean_text(text):
    """Normalise arbitrary input to lowercase a-z words separated by single spaces.

    The input is converted with ``str()`` first, so non-string values (numbers,
    ``None``, NaN) are accepted. Every character that is neither a lowercase
    ASCII letter nor whitespace is turned into a space, whitespace runs are
    collapsed to one space, and leading/trailing whitespace is removed.

    Returns:
        str: the normalised text (possibly empty).
    """
    normalised = str(text).lower()
    # First pass blanks out everything except a-z and whitespace; second pass
    # squeezes each whitespace run down to one space.
    for pattern in (r'[^a-z\s]', r'\s+'):
        normalised = re.sub(pattern, ' ', normalised)
    return normalised.strip()

# The raw "Resume" column is left untouched; the normalised text is stored
# alongside it in "cleaned".
df["cleaned"] = df["Resume"].map(clean_text)

# -------------------------------
# 5. Encode target labels
# -------------------------------
# Fit the encoder on the category names, then map each row to its integer id.
# `le.classes_` keeps the id -> name lookup for later reporting.
le = LabelEncoder().fit(df["Category"])
df["label"] = le.transform(df["Category"])

# -------------------------------
# 6. Train-test split
# -------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned"], df["label"], test_size=0.2, random_state=42
)

# -------------------------------
# 7. Vectorization
# -------------------------------
# The vocabulary and IDF weights are learned from the training rows only, then
# applied unchanged to the test rows.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# -------------------------------
# 8. Train classifier
# -------------------------------
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
# `labels` is passed explicitly so the report always has one row per encoded
# class. Without it, classification_report raises a ValueError whenever the
# random split leaves a class out of the test set, because the number of labels
# found no longer matches `target_names`. `zero_division=0` reports such
# classes as 0 instead of emitting warnings.
print("\nClassifier Report:\n", classification_report(
    y_test,
    y_pred,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
    zero_division=0,
))

# -------------------------------
# 9. FAISS Index
# -------------------------------
# Vectorise every row of the dataset (training and test rows alike) with the
# already-fitted TF-IDF vocabulary, so row i of the index is row i of `df`.
X_vec_all = vectorizer.transform(df["cleaned"])
# Densify as float32 before handing the matrix to FAISS.
X_faiss = X_vec_all.astype(np.float32).toarray()

# One flat (exhaustive) L2 index over all vectors; its dimensionality is the
# number of TF-IDF features.
d = X_faiss.shape[-1]
index = faiss.IndexFlatL2(d)
index.add(X_faiss)

print("FAISS index size:", index.ntotal)

# -------------------------------
# 10. Build job skills dictionary
# -------------------------------
# Map each role to its 50 most frequent content words across that role's resumes.
# English stop words ("and", "the", "of", ...) are removed before counting:
# they are the most frequent tokens in every category, so without the filter
# the per-role lists are largely the same function words and the resulting
# "skill overlap" says nothing about skills.
job_skills_dict = {}
for role in df["Category"].unique():
    role_texts = df[df["Category"] == role]["cleaned"]
    words = [w for w in " ".join(role_texts).split() if w not in ENGLISH_STOP_WORDS]
    # dtype is given explicitly so an empty word list still builds a valid Series.
    top_words = pd.Series(words, dtype="object").value_counts().head(50).index.tolist()
    job_skills_dict[role] = top_words

# -------------------------------
# 11. Recommend function
# -------------------------------
def recommend_roles(resume_text, model, vectorizer, index, job_labels, job_skills_dict, top_n=3,
                    index_labels=None):
    """Rank job roles for a resume by blending three signals.

    For every role in ``job_labels`` the function combines
    ``0.5 * classifier probability + 0.3 * nearest-neighbour similarity
    + 0.2 * skill-word overlap`` and returns the ``top_n`` best roles.

    Args:
        resume_text: Raw resume text. It is normalised with ``clean_text`` (the
            same preprocessing the training data went through) before being
            vectorised and tokenised.
        model: Fitted classifier exposing ``predict_proba``; column ``i`` of its
            output is taken as the probability of ``job_labels[i]``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        index: FAISS index searched with the resume vector for ``top_n * 2``
            neighbours.
        job_labels: Role names, ordered like the classifier's classes.
        job_skills_dict: Mapping role -> iterable of characteristic words.
        top_n: Number of roles to return.
        index_labels: Role of each vector stored in ``index``, in insertion
            order (``index_labels[k]`` is the role of FAISS id ``k``). When
            omitted, ``job_labels`` is used, which is only meaningful if the
            index holds exactly one vector per role in ``job_labels`` order.

    Returns:
        list[dict]: up to ``top_n`` dicts sorted by ``combined_score``
        (descending) with keys ``role``, ``classifier_score``,
        ``similarity_score``, ``skill_overlap``, ``combined_score`` and
        ``percentage`` (share of the returned roles' total score, 0-100).

    Notes:
        The similarity assumes L2-normalised vectors (the ``TfidfVectorizer``
        default). ``IndexFlatL2`` reports *squared* distances, and for unit
        vectors ``d^2 = 2 - 2*cos``, so ``cos = 1 - d^2 / 2``. The value is
        clamped to [0, 1] so that a distant neighbour can never score worse
        than having no neighbour at all.
    """
    if index_labels is None:
        index_labels = job_labels

    # Apply the training-time normalisation so inference sees the same kind of text.
    cleaned_resume = clean_text(resume_text)
    vec = vectorizer.transform([cleaned_resume])
    probs = model.predict_proba(vec)[0]

    # Best (closest) neighbour similarity per role. FAISS returns neighbours in
    # ascending distance order, so the first hit for a role is its best one.
    best_similarity = {}
    query_vec = vec.astype('float32').toarray()
    if query_vec.any():  # an all-zero query (no known term) has no meaningful neighbours
        D, I = index.search(query_vec, k=top_n * 2)
        for dist, idx in zip(D[0], I[0]):
            idx = int(idx)
            # FAISS pads missing results with -1; also ignore ids we cannot map.
            if not 0 <= idx < len(index_labels):
                continue
            neighbour_role = index_labels[idx]
            if neighbour_role not in best_similarity:
                best_similarity[neighbour_role] = min(1.0, max(0.0, 1.0 - float(dist) / 2.0))

    # Tokenise once, with punctuation already stripped, so "django," matches "django".
    resume_tokens = set(cleaned_resume.split())

    results = []
    for i, role in enumerate(job_labels):
        classifier_prob = float(probs[i])
        similarity_score = best_similarity.get(role, 0.0)

        role_skills = set(job_skills_dict.get(role, []))
        skill_overlap = len(resume_tokens & role_skills) / max(1, len(role_skills))

        combined_score = 0.5 * classifier_prob + 0.3 * similarity_score + 0.2 * skill_overlap

        results.append({
            "role": role,
            "classifier_score": classifier_prob,
            "similarity_score": similarity_score,
            "skill_overlap": skill_overlap,
            "combined_score": combined_score
        })

    sorted_results = sorted(results, key=lambda x: x['combined_score'], reverse=True)[:top_n]

    # Normalize scores so top_n add up to 100%
    total_score = sum(r["combined_score"] for r in sorted_results)
    for r in sorted_results:
        r["percentage"] = round((r["combined_score"] / total_score) * 100, 2) if total_score > 0 else 0

    return sorted_results

# -------------------------------
# 12. Test the recommender
# -------------------------------
job_labels = list(le.classes_)
# Role of every resume row, in the same order the rows were added to the FAISS
# index, so a neighbour id returned by the index can be translated into a role.
index_labels = df["Category"].tolist()

resume_example = "Experienced Python developer with Django, Flask, REST APIs, PostgreSQL, AWS, and Docker."
top_roles = recommend_roles(
    resume_example, clf, vectorizer, index, job_labels, job_skills_dict,
    top_n=3, index_labels=index_labels,
)

print("\nTop recommended roles with percentages:")
for r in top_roles:
    print(f"{r['role']} - {r['percentage']}% match")

# -------------------------------
# 13. Save models for deployment
# -------------------------------
# Save with joblib (optional)
joblib.dump(clf, "jobrole_classifier.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")
faiss.write_index(index, "faiss_index.idx")

# Save as a single pickle file for Flask
model_data = {
    "vectorizer": vectorizer,
    "model": clf,
    "job_labels": job_labels,
    "faiss_index": index,
    # Category of each vector in the FAISS index, in insertion order. The index
    # only returns row ids, so without this list a consumer of the bundle cannot
    # tell which role a neighbour belongs to.
    "index_labels": df["Category"].tolist(),
    "job_skills_dict": job_skills_dict
}

with open("recommender_model.pkl", "wb") as f:
    pickle.dump(model_data, f)

print("✅ Model and components saved as recommender_model.pkl")



Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Dataset shape: (962, 2)
       Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2  Data Science  Areas of Interest Deep Learning, Control Syste...
3  Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4  Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...

Classifier Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
      

In [8]:
import pickle

# Save every component the recommender needs at serving time.
# This cell writes to the same file as the training cell, so it must store the
# same components: previously it kept only the vectorizer, the classifier and
# the labels, silently replacing a bundle that also held the FAISS index and
# the skills dictionary.
model_data = {
    "vectorizer": vectorizer,                  # your TF-IDF vectorizer
    "model": clf,                              # your trained classifier
    "job_labels": job_labels,                  # list of job categories
    "faiss_index": index,                      # nearest-neighbour index over all resumes
    "index_labels": df["Category"].tolist(),   # category of each row stored in the index
    "job_skills_dict": job_skills_dict,        # role -> characteristic words
}

with open("recommender_model.pkl", "wb") as f:
    pickle.dump(model_data, f)

print("✅ Model saved as recommender_model.pkl")


✅ Model saved as recommender_model.pkl
