In [52]:
# Load necessary packages
import numpy as np
import pandas as pd

#to visualise all the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import precision_score, recall_score, f1_score

## **Data Exploration**

In [53]:
# 1. Education dataset
ed = pd.read_csv('education.csv')
ed.head()

Unnamed: 0,Id,primarySchool,primaryPercentage,primaryPassoutYear,secondarySchool,secondaryPercentage,secondaryPassoutYear,graduation,graduationPercentage,graduationPassoutYear,orgId,applicantId,certificates,name,degree,fieldOfStudy,percentage,certificate,createdOn
0,966,,0,0,,0.0,0,,0.0,0,,AIJP1291,,,,,,,48:16.3
1,967,Marks High school,86,1999,Marks High school,80.0,2001,,0.0,0,,AIJV7014,,,,,,,48:16.3
2,968,,0,0,,0.0,0,,0.0,0,,AIAA7617,,,,,,,48:16.3
3,969,,0,0,,0.0,0,,0.0,0,,AIRK6266,,,,,,,48:16.3
4,976,,0,0,,0.0,0,,0.0,0,,AIDK9811,,,,,,,48:16.3


In [54]:
# Total no.of columns and rows
ed.shape

(188, 19)

In [55]:
# List of columns
ed.columns

Index(['Id', 'primarySchool', 'primaryPercentage', 'primaryPassoutYear',
       'secondarySchool', 'secondaryPercentage', 'secondaryPassoutYear',
       'graduation', 'graduationPercentage', 'graduationPassoutYear', 'orgId',
       'applicantId', 'certificates', 'name', 'degree', 'fieldOfStudy',
       'percentage', 'certificate', 'createdOn'],
      dtype='object')

In [56]:
# Check datatypes of all the columns
ed.dtypes

Id                         int64
primarySchool             object
primaryPercentage          int64
primaryPassoutYear         int64
secondarySchool           object
secondaryPercentage      float64
secondaryPassoutYear       int64
graduation                object
graduationPercentage     float64
graduationPassoutYear      int64
orgId                    float64
applicantId               object
certificates              object
name                     float64
degree                   float64
fieldOfStudy             float64
percentage               float64
certificate              float64
createdOn                 object
dtype: object

In [57]:
# Information of the dataset
ed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     188 non-null    int64  
 1   primarySchool          11 non-null     object 
 2   primaryPercentage      188 non-null    int64  
 3   primaryPassoutYear     188 non-null    int64  
 4   secondarySchool        8 non-null      object 
 5   secondaryPercentage    188 non-null    float64
 6   secondaryPassoutYear   188 non-null    int64  
 7   graduation             9 non-null      object 
 8   graduationPercentage   188 non-null    float64
 9   graduationPassoutYear  188 non-null    int64  
 10  orgId                  0 non-null      float64
 11  applicantId            188 non-null    object 
 12  certificates           15 non-null     object 
 13  name                   0 non-null      float64
 14  degree                 0 non-null      float64
 15  fieldO

In [58]:
# Statistical info of the dataset
ed.describe()

Unnamed: 0,Id,primaryPercentage,primaryPassoutYear,secondaryPercentage,secondaryPassoutYear,graduationPercentage,graduationPassoutYear,orgId,name,degree,fieldOfStudy,percentage,certificate
count,188.0,188.0,188.0,188.0,188.0,188.0,188.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1101.771277,2.085106,77.218085,2.026596,66.574468,1.979255,88.207447,,,,,,
std,66.721032,11.914275,373.435188,11.550028,346.151925,11.504272,400.824477,,,,,,
min,966.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
25%,1057.75,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
50%,1110.5,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
75%,1157.25,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,
max,1204.0,89.0,2017.0,90.0,2019.0,89.0,2023.0,,,,,,


In [59]:
# Check for missing values
ed.isnull().sum()

Id                         0
primarySchool            177
primaryPercentage          0
primaryPassoutYear         0
secondarySchool          180
secondaryPercentage        0
secondaryPassoutYear       0
graduation               179
graduationPercentage       0
graduationPassoutYear      0
orgId                    188
applicantId                0
certificates             173
name                     188
degree                   188
fieldOfStudy             188
percentage               188
certificate              188
createdOn                  0
dtype: int64

In [60]:
# Dropping columns that are not contibuting for further analysis
ed = ed.drop(['primarySchool', 'secondarySchool', 'graduation', 'primaryPassoutYear', 'secondaryPassoutYear', 'graduationPassoutYear', 'orgId', 'certificates', 'name', 'degree', 'fieldOfStudy', 'percentage', 'certificate', 'createdOn'], axis=1)
ed.columns

Index(['Id', 'primaryPercentage', 'secondaryPercentage',
       'graduationPercentage', 'applicantId'],
      dtype='object')

In [61]:
# Check string columns
string_cols_ed = ed.select_dtypes(include='object')
string_cols_ed.columns

Index(['applicantId'], dtype='object')

In [62]:
# Check numerical columns
numerical_cols_ed = ed.select_dtypes(include='number')
numerical_cols_ed.columns

Index(['Id', 'primaryPercentage', 'secondaryPercentage',
       'graduationPercentage'],
      dtype='object')

In [63]:
# 2. Employment dataset
emp = pd.read_csv('employment.csv')
emp.head()

Unnamed: 0,employmentId,uploadResume,currentCompany,currentDesignation,annualSalary,industry,functionalArea,role,currentLocation,orgId,noticePeriod,skills,totalWorkExp,minExp,maxExp,applicantId,preferedLocation,holdingAnyOffer,expectedCtc,feedback,comment,isCurrent,companyEmail,packageOffered,lastWorkingDay,remarks,previousCompany,offerLetter,experienceLetter,relieveingLetter,isRecent,paySlips,createdOn
0,48,,ESMOB Technologies,React JS Developer,3.5,1.0,Software Developer,,Hyderabad,1.0,,"{css,js}",2.75,,,AISB2256,,,,,,,,,,,,,,,,,49:44.0
1,61,,Lejara Global IT Solutions Pvt. Ltd,RPA Developer,4.5,1.0,Software Developer,,Hyderabad,1.0,,"{RPA,python}",6.0,,,AIPH2278,,,,,,,,,,,,,,,,,49:44.0
2,62,,"Dhatri Info Solutions Pvt. Ltd,",Java Developer,360000.0,1.0,Software Developer,,Hyderabad,1.0,,"{java,spring,hibernate}",5.0,,,AIST2288,,,,,,,,,,,,,,,,,49:44.0
3,63,,IT Contractors,Java Developer,490000.0,1.0,IT,,Bangalore,1.0,,"{Java,Springboot}",3.0,,,AIVV2277,,,,,,,,,,,,,,,,,49:44.0
4,64,,Novell Logic,Java Developer,500000.0,1.0,IT,,Bangalore,1.0,,"{Java,springboot}",3.0,,,AIMV2241,,,,,,,,,,,,,,,,,49:44.0


In [64]:
# Total no.of columns and rows
emp.shape

(2099, 33)

In [65]:
# List of columns
emp.columns

Index(['employmentId', 'uploadResume', 'currentCompany', 'currentDesignation',
       'annualSalary', 'industry', 'functionalArea', 'role', 'currentLocation',
       'orgId', 'noticePeriod', 'skills', 'totalWorkExp', 'minExp', 'maxExp',
       'applicantId', 'preferedLocation', 'holdingAnyOffer', 'expectedCtc',
       'feedback', 'comment', 'isCurrent', 'companyEmail', 'packageOffered',
       'lastWorkingDay', 'remarks', 'previousCompany', 'offerLetter',
       'experienceLetter', 'relieveingLetter', 'isRecent', 'paySlips',
       'createdOn'],
      dtype='object')

In [66]:
# Check datatypes of all the columns
emp.dtypes

employmentId            int64
uploadResume           object
currentCompany         object
currentDesignation     object
annualSalary           object
industry              float64
functionalArea         object
role                  float64
currentLocation        object
orgId                 float64
noticePeriod           object
skills                 object
totalWorkExp          float64
minExp                float64
maxExp                float64
applicantId            object
preferedLocation       object
holdingAnyOffer        object
expectedCtc            object
feedback               object
comment                object
isCurrent              object
companyEmail           object
packageOffered        float64
lastWorkingDay         object
remarks                object
previousCompany       float64
offerLetter           float64
experienceLetter      float64
relieveingLetter      float64
isRecent              float64
paySlips              float64
createdOn              object
dtype: obj

In [67]:
# Information of the dataset
emp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2099 entries, 0 to 2098
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   employmentId        2099 non-null   int64  
 1   uploadResume        1600 non-null   object 
 2   currentCompany      2002 non-null   object 
 3   currentDesignation  2056 non-null   object 
 4   annualSalary        1994 non-null   object 
 5   industry            2004 non-null   float64
 6   functionalArea      1986 non-null   object 
 7   role                0 non-null      float64
 8   currentLocation     1999 non-null   object 
 9   orgId               1953 non-null   float64
 10  noticePeriod        1919 non-null   object 
 11  skills              2099 non-null   object 
 12  totalWorkExp        2002 non-null   float64
 13  minExp              0 non-null      float64
 14  maxExp              0 non-null      float64
 15  applicantId         2099 non-null   object 
 16  prefer

In [68]:
# Statistical info of the dataset
emp.describe()

Unnamed: 0,employmentId,industry,role,orgId,totalWorkExp,minExp,maxExp,packageOffered,previousCompany,offerLetter,experienceLetter,relieveingLetter,isRecent,paySlips
count,2099.0,2004.0,0.0,1953.0,2002.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1200.003811,2.056886,,1.0,11.116773,,,560008.2,,,,,,
std,627.419107,4.399741,,0.0,104.7665,,,637834.7,,,,,,
min,48.0,0.0,,1.0,0.0,,,12.0,,,,,,
25%,679.5,1.0,,1.0,2.0,,,17.0,,,,,,
50%,1212.0,1.0,,1.0,3.4,,,480000.0,,,,,,
75%,1737.5,1.0,,1.0,6.0,,,925000.0,,,,,,
max,2276.0,165.0,,1.0,3278.0,,,1900000.0,,,,,,


In [69]:
# Check for missing values
emp.isnull().sum()

employmentId             0
uploadResume           499
currentCompany          97
currentDesignation      43
annualSalary           105
industry                95
functionalArea         113
role                  2099
currentLocation        100
orgId                  146
noticePeriod           180
skills                   0
totalWorkExp            97
minExp                2099
maxExp                2099
applicantId              0
preferedLocation       563
holdingAnyOffer        585
expectedCtc            560
feedback              2078
comment               1940
isCurrent              906
companyEmail          1405
packageOffered        2084
lastWorkingDay        2078
remarks               2040
previousCompany       2099
offerLetter           2099
experienceLetter      2099
relieveingLetter      2099
isRecent              2099
paySlips              2099
createdOn                0
dtype: int64

In [70]:
emp = emp.drop(['currentCompany', 'currentDesignation', 'currentDesignation', 'annualSalary', 'functionalArea', 'role', 'currentLocation', 'orgId', 'noticePeriod', 'totalWorkExp', 'minExp', 'maxExp', 'preferedLocation', 'holdingAnyOffer', 'expectedCtc', 'feedback', 'comment', 'isCurrent', 'companyEmail', 'packageOffered', 'lastWorkingDay', 'remarks', 'previousCompany', 'offerLetter', 'experienceLetter', 'relieveingLetter', 'isRecent', 'paySlips', 'createdOn'], axis=1)

In [71]:
emp.columns

Index(['employmentId', 'uploadResume', 'industry', 'skills', 'applicantId'], dtype='object')

In [72]:
# 3. Jobs dataset
jobs = pd.read_csv('jobs.csv')
jobs.head()

Unnamed: 0,jobTitle,position,minBudget,maxBudget,travelReq,location,description,attachment,orgId,jobType,skills,recruiter,client,minExp,maxExp,noticePeriod,status,jobId,createDate,vacancies,priority,priceType,workType,comment,subvendorId,Questionnaires,country,department,eligibility,responsibilities
0,Plant Maintenance Engineer,Plant Maintenance Engineer,300000.0,600000.0,True,{Hyderabad},"Read and understand the data from P&ID, PEFS, ...",,1,Permanent,"{""Plant Maintenance""}",DB413862,CIHI7757,2.0,5.0,15-30 days,Active,JDA&5399,2023-01-31 11:09:26.36189,2.0,High,INR,Work from Office,,,,,,,
1,Adobe Analytics developer,AdobeAnalyticsdeveloper,14.0,15.0,True,{ChennaiBangaloreHyderabad},An understanding of the marketing technologies...,,1,Permanent,"{""Adobe anaytics"",Agile,tableau,Marketo,""power...",Venkata Sai Rohith,CIHW9344,3.0,4.0,15-30 days,Active,JDAA1476,2022-03-30 11:20:03.031659,1.0,High,,,,,,,,,
2,AEM architect,Aemarchitect,12.0,15.0,True,{bangalore},"To analyze, understand the client’s business u...",,1,Permanent,"{JIRA,Confluence,AEM}",Venkata Sai Rohith,CIHW9344,4.0,6.0,30-45 days,Active,JDAA5139,2022-03-07 13:21:41.393509,,Low,,,,,,,,,
3,Azure Administrator,Azure Administrator,14.0,16.0,False,"{""Pan India""}",1.\tArchitecture and Design for large scale Az...,,1,Permanent,"{""Azure Administrator ""}",PC138515,CIQC2196,6.0,8.0,45-60 days,Active,JDAA6310,2023-05-04 12:23:28.16582,1.0,High,INR,Work from Office,,,,,,,
4,AEM architect,Aemarchitect,30.0,32.0,True,{bangalore},"Should have experience in java, OSGI, Componen...",https://storage.googleapis.com/hrm-musquare/jo...,1,Permanent,"{sling,jackrabbit,OSGI,felix,JCR}",Venkata Sai Rohith,CIHW9344,9.96,15.0,30-45 days,Active,JDAA7883,2022-03-07 13:35:36.779486,,,,,,,,,,,


In [73]:
# Total no.of columns and rows
jobs.shape

(307, 30)

In [74]:
# List of columns
jobs.columns

Index(['jobTitle', 'position', 'minBudget', 'maxBudget', 'travelReq',
       'location', 'description', 'attachment', 'orgId', 'jobType', 'skills',
       'recruiter', 'client', 'minExp', 'maxExp', 'noticePeriod', 'status',
       'jobId', 'createDate', 'vacancies', 'priority', 'priceType', 'workType',
       'comment', 'subvendorId', 'Questionnaires', 'country', 'department',
       'eligibility', 'responsibilities'],
      dtype='object')

In [75]:
# Check datatypes of all the columns
jobs.dtypes

jobTitle             object
position             object
minBudget           float64
maxBudget           float64
travelReq              bool
location             object
description          object
attachment           object
orgId                 int64
jobType              object
skills               object
recruiter            object
client               object
minExp              float64
maxExp              float64
noticePeriod         object
status               object
jobId                object
createDate           object
vacancies           float64
priority             object
priceType            object
workType             object
comment              object
subvendorId         float64
Questionnaires      float64
country              object
department          float64
eligibility         float64
responsibilities    float64
dtype: object

In [76]:
# Information of the dataset
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307 entries, 0 to 306
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   jobTitle          307 non-null    object 
 1   position          307 non-null    object 
 2   minBudget         306 non-null    float64
 3   maxBudget         306 non-null    float64
 4   travelReq         307 non-null    bool   
 5   location          307 non-null    object 
 6   description       307 non-null    object 
 7   attachment        68 non-null     object 
 8   orgId             307 non-null    int64  
 9   jobType           307 non-null    object 
 10  skills            307 non-null    object 
 11  recruiter         307 non-null    object 
 12  client            307 non-null    object 
 13  minExp            307 non-null    float64
 14  maxExp            307 non-null    float64
 15  noticePeriod      307 non-null    object 
 16  status            307 non-null    object 
 1

In [77]:
# Statistical info of the dataset
jobs.describe()

Unnamed: 0,minBudget,maxBudget,orgId,minExp,maxExp,vacancies,subvendorId,Questionnaires,department,eligibility,responsibilities
count,306.0,306.0,307.0,307.0,307.0,270.0,0.0,0.0,0.0,0.0,0.0
mean,458916.0,874875.8,1.0,4.339707,7.091466,2.266667,,,,,
std,1532037.0,2216359.0,0.0,6.940431,9.066452,3.757916,,,,,
min,0.0,2.0,1.0,0.0,0.0,-1.0,,,,,
25%,6.0,15.0,1.0,2.0,5.0,1.0,,,,,
50%,24.5,275001.0,1.0,4.0,6.0,1.0,,,,,
75%,600000.0,1200000.0,1.0,5.0,8.0,2.0,,,,,
max,25000000.0,30000000.0,1.0,108.0,144.0,50.0,,,,,


In [78]:
# Check for missing values
jobs.isnull().sum()

jobTitle              0
position              0
minBudget             1
maxBudget             1
travelReq             0
location              0
description           0
attachment          239
orgId                 0
jobType               0
skills                0
recruiter             0
client                0
minExp                0
maxExp                0
noticePeriod          0
status                0
jobId                 0
createDate            0
vacancies            37
priority             45
priceType            50
workType             48
comment             302
subvendorId         307
Questionnaires      307
country             292
department          307
eligibility         307
responsibilities    307
dtype: int64

In [79]:
jobs = jobs.drop(['position', 'travelReq', 'attachment', 'orgId', 'recruiter', 'client', 'priority', 'priceType', 'workType', 'comment', 'subvendorId', 'Questionnaires', 'country', 'department', 'eligibility', 'responsibilities', 'minBudget', 'maxBudget', 'minExp', 'maxExp', 'noticePeriod', 'status', 'createDate', 'vacancies'], axis=1)

In [80]:
jobs.columns

Index(['jobTitle', 'location', 'description', 'jobType', 'skills', 'jobId'], dtype='object')

# **Data Cleaning**

In [81]:
# Initialize the SimpleImputer with a strategy (e.g., 'most_frequent' for categorical, 'mean' for numerical)
imputer = SimpleImputer(strategy='most_frequent')

In [82]:
# Impute missing values in each dataset
ed = pd.DataFrame(imputer.fit_transform(ed), columns=ed.columns)
emp = pd.DataFrame(imputer.fit_transform(emp), columns=emp.columns)
jobs = pd.DataFrame(imputer.fit_transform(jobs), columns=jobs.columns)

In [83]:
print("Missing values after imputation:")
print("Education missing values:\n", ed.isnull().sum())
print("Employment missing values:\n", emp.isnull().sum())
print("Jobs missing values:\n", jobs.isnull().sum())

Missing values after imputation:
Education missing values:
 Id                      0
primaryPercentage       0
secondaryPercentage     0
graduationPercentage    0
applicantId             0
dtype: int64
Employment missing values:
 employmentId    0
uploadResume    0
industry        0
skills          0
applicantId     0
dtype: int64
Jobs missing values:
 jobTitle       0
location       0
description    0
jobType        0
skills         0
jobId          0
dtype: int64


In [84]:
# Standardize the format of the 'skills' column across all datasets
def standardize_skills(skills):
    return skills.lower().replace(',', ' ').replace(';', ' ').replace('/', ' ').split()

emp['skills'] = emp['skills'].apply(standardize_skills)
jobs['skills'] = jobs['skills'].apply(standardize_skills)

# **Data Integration**

In [85]:
# Merge the education and employment datasets on applicantId
merged_df = pd.merge(ed, emp, on='applicantId', how='inner')
merged_df.columns

Index(['Id', 'primaryPercentage', 'secondaryPercentage',
       'graduationPercentage', 'applicantId', 'employmentId', 'uploadResume',
       'industry', 'skills'],
      dtype='object')

# **Feature Engineering**

In [86]:
# Convert all entries in the 'skills' columns to strings, then to lowercase, and fill missing values
merged_df['skills'] = merged_df['skills'].astype(str).str.lower().fillna('')
jobs['skills'] = jobs['skills'].astype(str).str.lower().fillna('')

In [87]:
# Create a function to check if any of the applicant's skills match the job skills
def match_skills(applicant_skills, job_skills):
    applicant_skills_set = set(applicant_skills.split(', '))
    job_skills_set = set(job_skills.split(', '))
    return bool(applicant_skills_set & job_skills_set)

In [88]:
# Filter the jobs dataframe
filtered_jobs = jobs[jobs['skills'].apply(lambda x: any(
    match_skills(applicant_skill, x) for applicant_skill in merged_df['skills']
))]

# Display the filtered jobs
filtered_jobs.head()

Unnamed: 0,jobTitle,location,description,jobType,skills,jobId
3,Azure Administrator,"{""Pan India""}",1.\tArchitecture and Design for large scale Az...,Permanent,"['{""azure', 'administrator', '""}']",JDAA6310
5,Assistant Brand Manager,{Ghargaon},Classification: All - Finance / Accounts / Inv...,Permanent,"['{sales', '""social', 'media', 'marketing""', '...",JDAB3826
6,Azure BI Developer - Excellerate,{Hyderabad},Resource must have 5+ years of hands on experi...,Permanent,"['{azure', '""', 'synapse""', '""', 'adf""}']",JDAB7664
7,Angular Developer,{Hyderabad},Requirements\nHands-on experience in UI develo...,Permanent,"['{angular', '""rest', 'api""}']",JDAD1928
8,Angular Developer,{Hyderabad},Minimum Qualifications:\nBachelor's degree in ...,Permanent,['{angular}'],JDAD1983


# **Model Building**

### Step 1: Combine Job Descriptions and Skills into a Single Feature

In [89]:
# Combine the job description and skills into a single feature
jobs['combined_features'] = jobs['description'] + ' ' + jobs['skills']
jobs[['jobTitle', 'combined_features']].head()

Unnamed: 0,jobTitle,combined_features
0,Plant Maintenance Engineer,"Read and understand the data from P&ID, PEFS, ..."
1,Adobe Analytics developer,An understanding of the marketing technologies...
2,AEM architect,"To analyze, understand the client’s business u..."
3,Azure Administrator,1.\tArchitecture and Design for large scale Az...
4,AEM architect,"Should have experience in java, OSGI, Componen..."


### Step 2: Vectorize the Combined Features

In [90]:
# Initialize TF-IDF Vectorizer and transform the job skills
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [91]:
# Fit and transform the combined features
tfidf_matrix = tfidf_vectorizer.fit_transform(jobs['combined_features'])
tfidf_matrix.shape

(307, 3672)

### Step 3: Compute the Cosine Similarity Matrix

In [92]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.08676152, 0.03274689, ..., 0.        , 0.0494567 ,
        0.01844573],
       [0.08676152, 1.        , 0.08878978, ..., 0.04140992, 0.08685324,
        0.        ],
       [0.03274689, 0.08878978, 1.        , ..., 0.04224572, 0.0406752 ,
        0.        ],
       ...,
       [0.        , 0.04140992, 0.04224572, ..., 1.        , 0.12658502,
        0.        ],
       [0.0494567 , 0.08685324, 0.0406752 , ..., 0.12658502, 1.        ,
        0.01468348],
       [0.01844573, 0.        , 0.        , ..., 0.        , 0.01468348,
        1.        ]])

### Step 4: Create a Function to Recommend Jobs

In [93]:
# Create a reverse mapping of job titles to index
job_indices = pd.Series(jobs.index, index=jobs['jobTitle']).drop_duplicates()

def get_skill_based_recommendations(skills, tfidf_matrix=tfidf_matrix, tfidf_vectorizer=tfidf_vectorizer, jobs_df=jobs, top_n=4):
    # Vectorize the input skills using the same TF-IDF vectorizer
    skill_vector = tfidf_vectorizer.transform([skills])

    # Compute the cosine similarity between the input skills and all job postings
    skill_sim = cosine_similarity(skill_vector, tfidf_matrix)

    # Get the pairwise similarity scores of all jobs with the input skills
    sim_scores = list(enumerate(skill_sim[0]))

    # Filter out jobs where the required skills are not mentioned in the job description or skills
    filtered_scores = []
    for idx, score in sim_scores:
        job_skills = jobs_df['skills'].iloc[idx]
        if any(skill.lower() in job_skills.lower() for skill in skills.split()):
            filtered_scores.append((idx, score))

    # Sort the jobs based on the similarity scores
    filtered_scores = sorted(filtered_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top N most similar jobs
    top_scores = filtered_scores[:top_n]

    # Get the job indices
    job_indices_list = [i[0] for i in top_scores]

    # Return the top N most similar jobs
    return jobs_df.iloc[job_indices_list]

In [99]:
# Test the recommendation function
recommended_jobs = get_skill_based_recommendations('Python, Machine Learning, Data Analyst')

print('Top job recommendations based on your skills:\n')

# Display the output neatly
for job in recommended_jobs.itertuples():
    print(f"Job ID: {job.jobId}\n")
    print(f"Job Title: {job.jobTitle}\n")

    # Remove all curly braces, square brackets, and quotes from skills
    skills = job.skills.replace('{', '').replace('}', '').replace('[', '').replace(']', '').replace('"', '').replace("'", '')
    print(f"Skills Required: {skills}\n")

    print(f"Description: {job.description}\n")
    print(f"Job Type: {job.jobType}\n")

    # Remove curly braces from location
    location = job.location.strip('{}')
    print(f"Location: {location}\n")
    print("\n\n")

Top job recommendations based on your skills:

Job ID: JDDS6273

Job Title: Data Scientist

Skills Required: sql, python, machine, learning, natural, language, processing

Description: Classification: All - Finance / Accounts / Investment Banking

Skills Required: SQL, Python, Machine Learning, Natural Language Processing (NLP),NoSQL, PyTorch Software Library, Scikit-learn, TensorFlow Software Library
Job Description

Experience: 3-5 Years
Location: Bangalore
Notice Period: Immediate-30 days
Interview process: Round 1: Assignment and Round 2: Technical Interview

Responsibilities:

1.Design, Implement and Evaluate models and Design machine learning systems(NLP, text analytics, information retrieval, search, and recommendation systems, Knowledge graph, conversational system, Time-series based modelling, forecasting)
2.Design, development, evaluate and deploy innovative and highly scalable ML to improve the quality of products.
3.You should be passionate about working with data sets and 

In [95]:
# Split the data into training and testing sets
jobs_train, jobs_test = train_test_split(jobs, test_size=0.2, random_state=42)

# Recreate the TF-IDF vectorizer and model using the training data
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf_vectorizer.fit_transform(jobs_train['skills'])

# Function to get recommendations
def get_recommendations(skills, tfidf_matrix, tfidf_vectorizer, jobs_df, top_n=5):
    skills_tfidf = tfidf_vectorizer.transform([skills])
    cosine_similarities = linear_kernel(skills_tfidf, tfidf_matrix).flatten()
    related_job_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return jobs_df.iloc[related_job_indices]

# Evaluate the model's performance
y_true = []
y_pred = []

for i, row in jobs_test.iterrows():
    skills = row['skills']
    recommended_jobs = get_recommendations(skills, tfidf_matrix_train, tfidf_vectorizer, jobs_train)

    true_job_title = row['jobTitle']
    pred_job_titles = recommended_jobs['jobTitle'].values

    y_true.append(true_job_title)
    y_pred.append(pred_job_titles[0] if len(pred_job_titles) > 0 else 'No Recommendation')

# Calculate precision, recall, and F1-score
precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=1)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Precision: 0.9811827956989246
Recall: 0.06451612903225806
F1 Score: 0.056451612903225805


# **Model Deployment**

### Save the Model Components

In [96]:
import joblib

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Save the job dataset
jobs_train.to_csv('jobs_train.csv', index=False)

Further step requires to create Streamlit App