## Download and load the dataset

In [6]:
import kagglehub
import os
import pandas as pd

# Fetch the latest dataset version; the call returns the local directory holding the files
DATASET_HANDLE = "arshkon/linkedin-job-postings"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")
dataset_files = os.listdir(path)
print(f"List of files in the dataset: {dataset_files}")

Path to dataset files: /home/gabriel/.cache/kagglehub/datasets/arshkon/linkedin-job-postings/versions/13
List of files in the dataset: ['postings.csv', 'mappings', 'jobs', 'companies']


**Drop rows with NaN values in `company_name`, `description` or `title`**

In [7]:
# Build the CSV path with os.path.join instead of a hard-coded "/" separator
postings_path = os.path.join(path, "postings.csv")
postings_df = pd.read_csv(
    postings_path, usecols=["title", "location", "company_name", "description"]
)

# Columns that must be filled for a posting to be kept (location is not required)
required_columns = ["company_name", "description", "title"]

# Count rows where any of the required columns is NaN
rows_with_any_nan = postings_df[required_columns].isna().any(axis=1).sum()
print(f"Rows with at least one NaN value: {rows_with_any_nan}")

# Drop rows with NaN in any required column (plain assignment rather than inplace=True)
print(f"Number of rows before dropping NaN values: {postings_df.shape[0]}")
postings_df = postings_df.dropna(subset=required_columns)

print(f"Number of rows after dropping NaN values: {postings_df.shape[0]}")

Rows with at least one NaN value: 1725
Number of rows before dropping NaN values: 123849
Number of rows after dropping NaN values: 122124


In [8]:
# dropna left gaps in the index; renumber the rows from 0 and discard the old labels
postings_df = postings_df.reset_index(drop=True)

In [9]:
# Preview the frame after dropping incomplete rows and resetting the index
postings_df

Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,"Princeton, NJ"
1,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,"Cincinnati, OH"
2,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,"New Hyde Park, NY"
3,Downtown Raleigh Alliance,Economic Development and Planning Intern,Job summary:The Economic Development & Plannin...,"Raleigh, NC"
4,Raw Cereal,Producer,Company DescriptionRaw Cereal is a creative de...,United States
...,...,...,...,...
122119,Lozano Smith,Title IX/Investigations Attorney,Our Walnut Creek office is currently seeking a...,"Walnut Creek, CA"
122120,Pinterest,"Staff Software Engineer, ML Serving Platform",About Pinterest:\n\nMillions of people across ...,United States
122121,EPS Learning,"Account Executive, Oregon/Washington",Company Overview\n\nEPS Learning is a leading ...,"Spokane, WA"
122122,Trelleborg Applied Technologies,Business Development Manager,The Business Development Manager is a 'hunter'...,"Texas, United States"


**Clean the descriptions: remove URLs, e-mail addresses and unwanted characters such as emojis**

In [10]:
import re


# Text cleaning with camelCase splitting
def clean_text(text: str) -> str:
    """Normalise a raw description into lowercase, whitespace-collapsed text.

    Steps, in order:
      1. Remove URLs (anything starting with ``http`` or ``www``, any case)
         and e-mail addresses.
      2. Insert a space at every lowercase->uppercase boundary, which splits
         camelCase and words glued together (``descriptionA`` -> ``description A``).
      3. Lowercase the text.
      4. Remove runs of 10 or more digits.
      5. Replace every character other than ASCII letters, digits, whitespace
         and ``. , ! ?`` with a space (this is what removes emojis).
      6. Collapse whitespace runs to a single space and strip both ends.

    URLs and e-mails are removed *before* the camelCase split on purpose:
    splitting first inserts spaces inside tokens such as
    ``JohnDoe@example.com`` or ``https://bit.ly/3xYzAbC``, so only part of the
    token would match the removal patterns and fragments would leak into the
    result. The trade-off is that a word glued directly onto the end of a URL
    or e-mail (no whitespace in between) is removed together with it.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Case-insensitive because the text has not been lowercased yet.
    # ``http\S+`` already covers ``https...``, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\S+@\S+", "", text)

    # Zero-width match between a lowercase and an uppercase letter
    text = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", text)

    text = text.lower()
    text = re.sub(r"\d{10,}", "", text)
    # Text is lowercase at this point, so only a-z has to be allowed for letters
    text = re.sub(r"[^a-z0-9\s.,!?]", " ", text)
    return re.sub(r"\s+", " ", text).strip()


# Clean every description and write the cleaned values back into the same column
cleaned_descriptions = postings_df["description"].apply(clean_text)
postings_df["description"] = cleaned_descriptions

postings_df

Unnamed: 0,company_name,title,description,location
0,Corcoran Sawyer Smith,Marketing Coordinator,job description a leading real estate firm in ...,"Princeton, NJ"
1,The National Exemplar,Assitant Restaurant Manager,the national exemplar is accepting application...,"Cincinnati, OH"
2,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,senior associate attorney elder law trusts and...,"New Hyde Park, NY"
3,Downtown Raleigh Alliance,Economic Development and Planning Intern,job summary the economic development planning ...,"Raleigh, NC"
4,Raw Cereal,Producer,company description raw cereal is a creative d...,United States
...,...,...,...,...
122119,Lozano Smith,Title IX/Investigations Attorney,our walnut creek office is currently seeking a...,"Walnut Creek, CA"
122120,Pinterest,"Staff Software Engineer, ML Serving Platform",about pinterest millions of people across the ...,United States
122121,EPS Learning,"Account Executive, Oregon/Washington",company overview eps learning is a leading k 1...,"Spokane, WA"
122122,Trelleborg Applied Technologies,Business Development Manager,the business development manager is a hunter t...,"Texas, United States"
