In [64]:
import pandas as pd

df = pd.read_csv('jobs.csv')  # Replace with your dataset file path
print(df.head())

   Unnamed: 0                    Job Salary Job Experience Required  \
0           0   Not Disclosed by Recruiter               5 - 10 yrs   
1           1   Not Disclosed by Recruiter                2 - 5 yrs   
2           2   Not Disclosed by Recruiter                0 - 1 yrs   
3           3       2,00,000 - 4,00,000 PA.               0 - 5 yrs   
4           4   Not Disclosed by Recruiter                2 - 5 yrs   

                                          Key Skills  \
0                      Media Planning| Digital Media   
1   pre sales| closing| software knowledge| clien...   
2   Computer science| Fabrication| Quality check|...   
3                                  Technical Support   
4   manual testing| test engineering| test cases|...   

                                Role Category  \
0                                 Advertising   
1                                Retail Sales   
2                                         R&D   
3  Admin/Maintenance/Security/Datawareho

In [65]:
df.shape

(27010, 8)

In [66]:
print(df.isnull().sum())

Unnamed: 0                 0
Job Salary                 0
Job Experience Required    0
Key Skills                 0
Role Category              0
Functional Area            0
Industry                   0
Job Title                  0
dtype: int64


In [67]:
df.shape

(27010, 8)

In [68]:
# Check data types of each column
print(df.dtypes)


Unnamed: 0                  int64
Job Salary                 object
Job Experience Required    object
Key Skills                 object
Role Category              object
Functional Area            object
Industry                   object
Job Title                  object
dtype: object


In [69]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert Experience to Numeric (average of the range)
def extract_experience(exp):
    if isinstance(exp, str) and '-' in exp:
        years = exp.split('-')
        return (int(years[0].strip()) + int(years[1].split()[0].strip())) / 2
    return np.nan

df['Experience (Years)'] = df['Job Experience Required'].apply(extract_experience)

# Fill missing values
df['Experience (Years)'].fillna(df['Experience (Years)'].median(), inplace=True)
df['Key Skills'].fillna("", inplace=True)

# Convert Key Skills into TF-IDF Vectors
vectorizer = TfidfVectorizer(stop_words='english')
skills_matrix = vectorizer.fit_transform(df['Key Skills'])

print("TF-IDF Matrix Shape:", skills_matrix.shape)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Experience (Years)'].fillna(df['Experience (Years)'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Key Skills'].fillna("", inplace=True)


TF-IDF Matrix Shape: (27010, 6372)


In [70]:
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler

# Scale experience and reshape it
scaler = StandardScaler()
experience_scaled = scaler.fit_transform(df[['Experience (Years)']])

# Combine skills and experience into a single feature matrix
job_features = hstack((skills_matrix, experience_scaled)).tocsr()

print("Final Feature Matrix Shape:", job_features.shape)


Final Feature Matrix Shape: (27010, 6373)


In [71]:
from sklearn.neighbors import NearestNeighbors

# Train KNN model
knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
knn_model.fit(job_features)

print("KNN Model Trained!")


KNN Model Trained!


In [72]:
def recommend_jobs(job_title, top_n=5):
    if job_title not in df['Job Title'].values:
        return "Job not found in dataset"

    # Find job index
    job_idx = df[df['Job Title'] == job_title].index[0]

    # Find K nearest jobs
    distances, indices = knn_model.kneighbors(job_features[job_idx], n_neighbors=top_n+1)

    # Get recommended jobs
    recommended_jobs = df.iloc[indices[0][1:]][['Job Title', 'Key Skills', 'Experience (Years)']]
    
    return recommended_jobs

# Example Usage
recommend_jobs("Technical Support Engineer")


Unnamed: 0,Job Title,Key Skills,Experience (Years)
14585,Associate/Senior Associate -(Technical),Technical Support,2.5
2974,Technical Support Engineer,Technical Support,2.5
15254,Technical Support Engineer,technical support representative| technical s...,2.0
20174,Post Sales Consultant,Technical Support,4.0
16670,Associate/Senior Associate -(Technical),technical support| technical support voice,3.0


In [77]:
def recommend_jobs_by_skills(user_skills, experience, top_n=5):
    user_skills_vector = vectorizer.transform([user_skills])
    user_experience_scaled = scaler.transform([[experience]])

    # Combine user input into feature matrix
    user_features = hstack((user_skills_vector, user_experience_scaled))

    # Find K nearest jobs
    distances, indices = knn_model.kneighbors(user_features, n_neighbors=top_n)

    # Get recommended jobs
    recommended_jobs = df.iloc[indices[0]][['Job Title', 'Key Skills', 'Experience (Years)']]
    
    return recommended_jobs

# Example Usage
recommend_jobs_by_skills("Python", 5)




Unnamed: 0,Job Title,Key Skills,Experience (Years)
3284,Software Developer,python| c++,5.5
1699,Software Developer,c++|python,5.5
5622,Software Developer,Python,3.5
13298,Software Developer,Python| Python Developer,5.0
14422,Software Developer,python,7.5


In [74]:
def evaluate_knn_recommendations(n_samples=100, k=5):
    import numpy as np

    relevant_jobs = 0
    total_jobs = 0

    sample_indices = np.random.choice(df.index, n_samples, replace=False)

    for idx in sample_indices:
        job_title = df.iloc[idx]['Job Title']
        recommendations = recommend_jobs(job_title, k)

        if job_title in recommendations['Job Title'].values:
            relevant_jobs += 1
        total_jobs += 1

    precision = relevant_jobs / (total_jobs * k)
    recall = relevant_jobs / total_jobs

    return precision, recall

# Run evaluation
precision, recall = evaluate_knn_recommendations()
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}")


Precision: 0.1260, Recall: 0.6300
