In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk

In [2]:
# Download the NLTK data packages this notebook requests (only need to run once
# per machine; re-running is harmless because NLTK reports packages that are
# already present as up-to-date).
# Tokenizer data, presumably what word_tokenize needs (confirm against NLTK docs).
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read via stopwords.words('english') in clean_job_description.
nltk.download('stopwords')
# NOTE(review): nothing in the visible cells uses WordNet; confirm whether a
# later cell needs it. This call is the cell's last expression, so its return
# value is what the notebook displays as the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:

def clean_job_description(text):
    """Normalise a raw job description into a title-cased string of keywords.

    The text is lower-cased, stripped of HTML markup and URLs, numeric ranges
    such as ``5-7`` are collapsed to their integer midpoint (``6``), every
    remaining non-alphanumeric character becomes a space, and English stop
    words (plus a few posting-specific filler words) are dropped.

    Args:
        text: The raw description. Non-string values (for example the float
            ``NaN`` pandas produces for empty CSV cells, or ``None``) are
            treated as an empty description instead of raising.

    Returns:
        str: The surviving tokens joined by single spaces and capitalised with
        ``str.title()``; ``''`` when there is nothing to clean.
    """
    # Missing cells arrive as float NaN / None, which have no ``.lower()``.
    if not isinstance(text, str):
        return ''

    # Step 1: Lowercasing
    text = text.lower()
    # Step 2: Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Step 3: Remove URLs. The scheme ("http://", "https://") or the "www."
    # prefix is required so that ordinary words which merely start with
    # "http"/"www" (e.g. "https", "httpclient", "wwwroot") are kept.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Step 4: Collapse numeric ranges ("5-7") to their integer midpoint ("6").
    # This must happen before punctuation is stripped, otherwise the hyphen
    # that identifies the range is already gone.
    def replace_range_with_average(match):
        num1, num2 = map(int, match.group().split('-'))
        return str((num1 + num2) // 2)
    text = re.sub(r'\b\d+-\d+\b', replace_range_with_average, text)

    # Step 5: Replace non-alphanumeric characters with spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Step 6: Tokenization
    tokens = word_tokenize(text)
    # Step 7: Remove stop words (NLTK English list plus posting-specific filler)
    stop_words = set(stopwords.words('english'))
    custom_stop_words = {"details", "visit", "looking", "we", "are", "must", "have"}
    stop_words.update(custom_stop_words)
    tokens = [word for word in tokens if word not in stop_words]
    # Step 8: Join tokens
    cleaned_text = ' '.join(tokens)
    # Step 9: Capitalize the first letter of each word
    return cleaned_text.title()


In [4]:
# Load the job postings. Forward slashes keep this relative path valid on
# Windows as well as Linux/macOS; the backslash form only resolves on Windows.
df = pd.read_csv('./data/job_data.csv')
# Add a cleaned, title-cased keyword version of every description.
df['Processed Job Description'] = df['Description'].apply(clean_job_description)
df.head()

Unnamed: 0,Title,Description,Detail URL,Location,Company Name,Company Logo,Company Apply Url,Processed Job Description,work experience
0,Full-Stack Developer [1 month NP MAX],Project Description:Luxoft has been asked to c...,https://www.linkedin.com/jobs/view/3918301973/,"Bengaluru, Karnataka, India",Luxoft,https://media.licdn.com/dms/image/v2/C560BAQGE...,https://www.linkedin.com/job-apply/3918301973,Project Description Luxoft Asked Contract Full...,5.0
1,Full Stack Developer,"Introduction\n\nAt IBM, work is more than a jo...",https://www.linkedin.com/jobs/view/4035894613/,"Bengaluru East, Karnataka, India",IBM,https://media.licdn.com/dms/image/v2/D560BAQGi...,https://IBM.contacthr.com/142997828?Codes=SN_L...,Introduction Ibm Work Job Calling Build Design...,5.0
2,Senior Frontend Developer,Exp Level: 5-7 yearsProject:Enterprise Portal ...,https://www.linkedin.com/jobs/view/4075231539/,Greater Bengaluru Area,Recro,https://media.licdn.com/dms/image/v2/D560BAQH7...,https://www.linkedin.com/job-apply/4075231539,Exp Level 6 Yearsproject Enterprise Portal Dev...,6.0
3,Web Backend Developer,"Who We Are\n\nAt Kyndryl, we design, build, ma...",https://www.linkedin.com/jobs/view/4041912483/,"Bengaluru, Karnataka, India",Kyndryl India,https://media.licdn.com/dms/image/v2/C4E0BAQGW...,https://kyndryl.wd5.myworkdayjobs.com/KyndrylP...,Kyndryl Design Build Manage Modernize Mission ...,7.5
4,Web Developer - Intelligent Automation,We require someone with 5-7 years experience i...,https://www.linkedin.com/jobs/view/4045773818/,"Bengaluru, Karnataka, India",BSH Home Appliances India,https://media.licdn.com/dms/image/v2/D4D0BAQGD...,https://jobs.bsh-group.de/34629-web-developer-...,Require Someone 6 Years Experience Web Develop...,6.0


In [5]:
# Persist the frame with its new column. NOTE: this writes to the same file the
# notebook loads from, so the input CSV is overwritten in place. The path uses
# forward slashes so it resolves on every platform, not only on Windows.
df.to_csv('./data/job_data.csv', index=False, header=True)

In [6]:
# Single-digit year figures that are explicitly tied to a "year(s)"/"yr(s)" unit.
# Compiled once at definition time instead of on every call.
_YEARS_OF_EXPERIENCE_RE = re.compile(r'''
    (?<!\d)                                        # not the tail of a longer number
    (?:
        (?P<low>\d)\s*(?:[-–]|to)\s*(?P<high>\d)   # range: "2-5 years", "2 to 5 years"
      | (?P<single>\d)                             # single value: "5 years"
    )
    \s*\+?\s*                                      # optional plus sign: "5+ years"
    (?:years?|yrs?)                                # the unit is mandatory
''', re.VERBOSE | re.IGNORECASE)


def average_work_experience(job_description):
    """Return the years of experience a job description asks for.

    Despite the name, the result is the *largest* figure found in the text,
    not an average over all mentions. Only a range is averaged: "5-7 years"
    contributes its midpoint, 6.0.

    Only one-digit figures attached to a unit are considered ("3 years",
    "5+ yrs", "2 to 4 years", "minimum 3 years", "at least 3 years").
    Digits without a unit ("Python 3", "24/7") and multi-digit numbers
    ("10 years") are ignored rather than being read as years.

    Args:
        job_description: Raw description text. Non-string values (such as the
            float ``NaN`` pandas uses for empty cells) yield 0.

    Returns:
        int | float: The largest figure found -- an ``int`` for a single
        value, a ``float`` midpoint for a range -- or ``0`` when the text
        states no usable figure.
    """
    # Regex search on NaN/None would raise TypeError; treat as "nothing stated".
    if not isinstance(job_description, str):
        return 0

    years = []
    for match in _YEARS_OF_EXPERIENCE_RE.finditer(job_description):
        single = match.group('single')
        if single is not None:
            years.append(int(single))
        else:
            # Range match: use the midpoint of the two bounds.
            years.append((int(match.group('low')) + int(match.group('high'))) / 2)

    return max(years, default=0)

In [7]:
# Derive the experience figure for every posting from its raw description,
# then attach it to the frame as a new column and preview the result.
required_experience = df['Description'].apply(average_work_experience)
df['Required Experience'] = required_experience
df.head()

Unnamed: 0,Title,Description,Detail URL,Location,Company Name,Company Logo,Company Apply Url,Processed Job Description,work experience,Required Experience
0,Full-Stack Developer [1 month NP MAX],Project Description:Luxoft has been asked to c...,https://www.linkedin.com/jobs/view/3918301973/,"Bengaluru, Karnataka, India",Luxoft,https://media.licdn.com/dms/image/v2/C560BAQGE...,https://www.linkedin.com/job-apply/3918301973,Project Description Luxoft Asked Contract Full...,5.0,5.0
1,Full Stack Developer,"Introduction\n\nAt IBM, work is more than a jo...",https://www.linkedin.com/jobs/view/4035894613/,"Bengaluru East, Karnataka, India",IBM,https://media.licdn.com/dms/image/v2/D560BAQGi...,https://IBM.contacthr.com/142997828?Codes=SN_L...,Introduction Ibm Work Job Calling Build Design...,5.0,5.0
2,Senior Frontend Developer,Exp Level: 5-7 yearsProject:Enterprise Portal ...,https://www.linkedin.com/jobs/view/4075231539/,Greater Bengaluru Area,Recro,https://media.licdn.com/dms/image/v2/D560BAQH7...,https://www.linkedin.com/job-apply/4075231539,Exp Level 6 Yearsproject Enterprise Portal Dev...,6.0,6.0
3,Web Backend Developer,"Who We Are\n\nAt Kyndryl, we design, build, ma...",https://www.linkedin.com/jobs/view/4041912483/,"Bengaluru, Karnataka, India",Kyndryl India,https://media.licdn.com/dms/image/v2/C4E0BAQGW...,https://kyndryl.wd5.myworkdayjobs.com/KyndrylP...,Kyndryl Design Build Manage Modernize Mission ...,7.5,7.5
4,Web Developer - Intelligent Automation,We require someone with 5-7 years experience i...,https://www.linkedin.com/jobs/view/4045773818/,"Bengaluru, Karnataka, India",BSH Home Appliances India,https://media.licdn.com/dms/image/v2/D4D0BAQGD...,https://jobs.bsh-group.de/34629-web-developer-...,Require Someone 6 Years Experience Web Develop...,6.0,6.0


In [8]:
# Persist the frame including the experience column. NOTE: this writes to the
# same file the notebook loads from, so the input CSV is overwritten in place.
# The path uses forward slashes so it resolves on every platform, not only on
# Windows.
df.to_csv('./data/job_data.csv', index=False, header=True)