# Forecasting AI and ML Job Trends

At this stage, we clean the text data in preparation for training

## Dependencies

In [1]:
import re
import pandas as pd

from tqdm.notebook import tqdm  # Or use tqdm instead of tqdm.notebook

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## Data Loading

In [None]:
filename = "data/b_job_postings_labelled.parquet"
data = pd.read_parquet(filename)

In [3]:
print(f"{len(data):,} job postings loaded from {filename}")
data.sample(5)

1,296,381 job postings loaded from ../data/b_job_postings_labelled.parquet


Unnamed: 0,job_description,label
498029,campus ambassador- front desk receptionist \n ...,0
550128,rn - big sky family medicine clinic (ft- 0.9 f...,0
280939,automotive c level technician \n assemb...,0
589332,stu engage coord/residence dir \n direc...,0
347389,store manager in training \n equal-oppo...,0


## Text Cleaning

Download necessary NLTK data

In [4]:
nltk.download('punkt')
nltk.download('punkt_tab') 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/mzitoh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mzitoh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mzitoh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mzitoh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
class TextCleaner:
    def __init__(self):
        self.stop_words = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        # Remove newline characters and extra spaces
        text = re.sub(r"\s+", " ", text)

        # Convert text to lowercase
        text = text.lower()

        # Remove special characters and punctuation
        text = re.sub(r"[^\w\s]", "", text)

        # Tokenize the text
        tokens = word_tokenize(text)

        # Remove stopwords
        tokens = [word for word in tokens if word not in self.stop_words]

        # Lemmatize the words
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]

        # Join tokens back into a single string
        cleaned_text = " ".join(tokens)

        return cleaned_text

    def clean_dataframe(self, df, text_column):
        tqdm.pandas()
        cleaned_texts = df[text_column].progress_apply(self.clean_text)
        cleaned_data = pd.DataFrame({'text': cleaned_texts, 'label': df['label']})
        return cleaned_data

In [6]:
text_cleaner = TextCleaner()
cleaned_data = text_cleaner.clean_dataframe(data, 'job_description')

  0%|          | 0/1296381 [00:00<?, ?it/s]

In [7]:
cleaned_data.sample(5)

Unnamed: 0,text,label
977955,residential manager stock fitter western state...,0
709925,retail key holderjefferson mall manager custom...,0
867375,state local tax financial service manager mana...,0
325765,sql database reliability engineer agricultural...,0
669320,crna needed locum tenens coverage facility nea...,0


## Save cleaned data

In [None]:
cleaned_filename = "data/c_job_postings_dataset_cleaned.parquet"
cleaned_data.to_parquet(cleaned_filename, index=False)

print(f"Saved cleaned data to {cleaned_filename}")

Saved cleaned data to ../data/c_job_postings_dataset_cleaned.parquet
