In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk

In [3]:
# Download NLTK data (only need to run once)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:

def clean_job_description(text):
    # Step 1: Lowercasing
    text = text.lower()
    # Step 2: Remove HTML Tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Step 3: Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def replace_range_with_average(match):
        num1, num2 = map(int, match.group().split('-'))
        return str((num1 + num2) // 2)
    text = re.sub(r'\b\d+-\d+\b', replace_range_with_average, text)

    # Step 4: Remove Non-Alphanumeric Characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)    
    # Step 5: Tokenization
    tokens = word_tokenize(text)
    # Step 6: Remove Stop Words
    stop_words = set(stopwords.words('english'))
    custom_stop_words = {"details", "visit", "looking", "we", "are", "must", "have"}
    stop_words.update(custom_stop_words)
    tokens = [word for word in tokens if word not in stop_words]
    # Step 8: Join Tokens
    cleaned_text = ' '.join(tokens)
    # Step 9: Capitalize First Letter of Each Word
    capitalized_text = cleaned_text.title()
    return capitalized_text


In [5]:
df = pd.read_csv(r'.\data\job_data.csv')
df['Processed Job Description'] = df['Description'].apply(clean_job_description)
df.head()

Unnamed: 0,Title,Description,Detail URL,Location,Company Name,Company Logo,Company Apply Url,Processed Job Description,Required Experience,skills
0,Fullstack Developer,"Introduction\n\nAt IBM, work is more than a jo...",https://www.linkedin.com/jobs/view/4025815970/,"Bengaluru East, Karnataka, India",IBM,https://media.licdn.com/dms/image/v2/D560BAQGi...,https://IBM.contacthr.com/142516460?Codes=SN_L...,Introduction Ibm Work Job Calling Build Design...,8.0,Typescript Api Java Javascript Ai Rest Css Nos...
1,Remote Data Scientist,A leading US client is seeking Data Scientists...,https://www.linkedin.com/jobs/view/4040359115/,India,Turing,https://media.licdn.com/dms/image/v2/D4D0BAQFw...,https://www.turing.com/remote-developer-jobs/j...,Leading Us Client Seeking Data Scientists Data...,2.0,Python Sql Engineering
2,AWS DevOps Engineer,Job Title: Sr. AWS DevOps Engineer\n Job Locat...,https://www.linkedin.com/jobs/view/4057613660/,"Bengaluru, Karnataka, India",Centilytics | Intelligent Cloud Management,https://media.licdn.com/dms/image/v2/C510BAQFW...,https://www.linkedin.com/job-apply/4057613660,Job Title Sr Aws Devops Engineer Job Location ...,7.5,Scripting Shell Azure Python Mysql Aws Cloud Git
3,Data Scientist,ABOUT THE POSITION:\n\nThis position will be w...,https://www.linkedin.com/jobs/view/4054420394/,"Bengaluru, Karnataka, India",Fluence,https://media.licdn.com/dms/image/v2/D4E0BAQFX...,https://jobs.lever.co/fluence/58068d9c-5e16-49...,Position Position Within Fluence Data Science ...,5.0,Pandas Engineering Python Cloud Sql Numpy Ai
4,Data Scientist III,Intuit Credit Karma is a mission-driven compan...,https://www.linkedin.com/jobs/view/4019123395/,"Bengaluru, Karnataka, India",Credit Karma,https://media.licdn.com/dms/image/v2/D560BAQHE...,https://www.linkedin.com/job-apply/4019123395,Intuit Credit Karma Mission Driven Company Foc...,7.0,Engineering Python Tensorflow Cloud Sql


In [6]:
def average_work_experience(job_description):  
    pattern = re.compile(r'''  
        (?<!\d)  
        (?:(\d)\s*(?:-\s*|\s*to\s*)(\d)\s*years?| 
        (\d)\s*years?|   
        (?:minimum|at least)\s*(\d)\s*years?|    
        (\d)\s*[^-]*?(?:experience|exp|needed|required|or|and)?)(?!\d)
    ''', re.VERBOSE | re.IGNORECASE)  

    matches = pattern.findall(job_description)  
    years = []  

    for match in matches:  
        # Collect years from the captures  
        if match[0] and match[1]: 
            low_exp = int(match[0])  
            high_exp = int(match[1])   
            if low_exp < 10 and high_exp < 10:    
                years.append((low_exp + high_exp) / 2) 
        elif match[2]:   
            years.append(int(match[2]))   
        elif match[3]:   
            years.append(int(match[3]))   
        elif match[4]:   
            years.append(int(match[4])) 
    
    if years:  
        max_year = max(years)
        return max_year
    return 0


In [7]:
df['Required Experience'] = df['Description'].apply(average_work_experience)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,Title,Description,Detail URL,Location,Company Name,Company Logo,Company Apply Url,Processed Job Description,Required Experience,skills
0,Applications Developer 3,"Job Description\n\nAnalyze, design develop, tr...",https://www.linkedin.com/jobs/view/4074922230/,"Bengaluru, Karnataka, India",Oracle,https://media.licdn.com/dms/image/v2/D4E0BAQHY...,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,Job Description Analyze Design Develop Trouble...,6.0,Java Saas Git Cloud
1,C++ Software Engineer - Gaming,Description\n\nThe Team and Role\n\nThe Gaming...,https://www.linkedin.com/jobs/view/4075387645/,"Chennai, Tamil Nadu, India",Logitech,https://media.licdn.com/dms/image/v2/D560BAQGZ...,https://jobs.jobvite.com/logitech/job/ob9Oufwm...,Description Team Role Gaming Brand Logitech G ...,7.0,C++ Python C React
2,Application Engineer - II,Role Description\n\nFlipkart is shaping the e-...,https://www.linkedin.com/jobs/view/3927701360/,"Bengaluru, Karnataka, India",Flipkart,https://media.licdn.com/dms/image/v2/C560BAQF6...,http://www.flipkartcareers.com/#!/job-view/app...,Role Description Flipkart Shaping E Commerce E...,4.5,Scripting Engineering Java Javascript Python L...
3,Data Scientists,AI Engineer: Design and implement generative A...,https://www.linkedin.com/jobs/view/4042203122/,"Bangalore Urban, Karnataka, India",Virtusa,https://media.licdn.com/dms/image/v2/D560BAQGE...,https://www.virtusa.com/careers/in/bangalore/a...,Ai Engineer Design Implement Generative Ai Mod...,0.0,Keras Pytorch Api Java Azure Python Opencv Ten...
4,"Software Engineer I, ITC","Become a Part of the NIKE, Inc. Team\n\nNIKE, ...",https://www.linkedin.com/jobs/view/4076648322/,"Karnataka, India",Nike,https://media.licdn.com/dms/image/v2/C560BAQFc...,https://click.appcast.io/track/l4rehql-org?cs=...,Become Part Nike Inc Team Nike Inc Outfit Worl...,1.0,Engineering Javascript Python Nosql React Clou...


In [8]:
df.to_csv(r'.\data\job_data.csv', index=False, header=True)

In [11]:
def get_unique_skills(s):
    df = pd.read_csv(r'.\data\job_data.csv')
    # Filter rows where title contains 'data scientist' (case insensitive)
    filtered_df = df[df['Title'].str.contains(s, case=False)]

    # Extract skills from the filtered rows and split into individual skills
    skills = filtered_df['skills'].str.split().explode()

    # Get unique skills
    unique_skills = skills.unique()

    # Print the unique skills
    print(f"Unique skills for jobs with {s} in the title:")
    print(unique_skills)
    
get_unique_skills('Engineer')

Unique skills for jobs with Engineer in the title:
['C++' 'Python' 'C' 'React' 'Scripting' 'Engineering' 'Java' 'Javascript'
 'Linux' 'Nosql' 'Cloud' 'Aws' 'Git' 'Azure' 'Postgres' 'Mysql' 'Rest'
 'Ai' 'Html' 'Css' 'Coding' 'Postgresql' 'Sql' 'Mongodb' 'Api'
 'Typescript' 'Shell' 'Saas']
