#### notebooks/02_Data_Preprocessing.ipynb

In [1]:
#Import libraries
import os
import sys
import pandas as pd
# Lift the column-width cap so long text cells are not truncated when frames are displayed
pd.set_option('display.max_colwidth', None)

from IPython.display import display
# Make the parent of the notebook's working directory importable so that the
# `src` package used by the imports below can be resolved.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import the preprocessing functions
from src.preprocess import clean_data, save_clean_data
from src.config import SKILLS_LIST, JOB_CATEGORIES, CLUSTER_COLORS


In [2]:
# Load the raw job postings. The path is anchored on project_root (set in the
# first cell) so it is built the same way as the output paths in the save cells.
raw_data_path = os.path.join(project_root, 'data', 'raw', 'jobs_raw_full.csv')
df_raw = pd.read_csv(raw_data_path)

# Run the preprocessing pipeline from src.preprocess on the raw frame;
# df_raw itself is kept under its own name so this cell can be re-run.
df_clean = clean_data(df_raw)

# The cleaned frame is written to data/processed/jobs_clean.csv by the
# dedicated save cell further down, so it is not written a second time here.


Performing advanced skill extraction...
Jobs with no skills after first pass: 1040
Applying fallback categorization...
Jobs with no skills after fallback: 0


In [3]:
# Quick look at the cleaned frame: dimensions, column names, then a two-row preview
frame_shape = df_clean.shape
column_names = df_clean.columns.tolist()

print("Cleaned data shape:", frame_shape)
print("\nColumns after cleaning:", column_names, sep="\n")
print("\nFirst few rows of cleaned data:")

# Last expression of the cell, so the preview is rendered as a rich table
df_clean.head(2)

Cleaned data shape: (1783, 12)

Columns after cleaning:
['title', 'description', 'salary_min', 'salary_max', 'latitude', 'longitude', 'salary_mid', 'country', 'region', 'county', 'city', 'skills_extracted']

First few rows of cleaned data:


Unnamed: 0,title,description,salary_min,salary_max,latitude,longitude,salary_mid,country,region,county,city,skills_extracted
0,Data Engineer,"At NTT DATA, we know that with the right people on board, anything is possible. The quality, integrity, and commitment of our employees are key factors in our company’s growth, market presence and our ability to help our clients stay a step ahead of the competition. By hiring the best people and helping them grow both professionally and personally, we ensure a bright future for NTT DATA and for the people who work here. NTT DATA is currently looking for a Data Engineer for our growing team in t…",60358.53,60358.53,55.86026,-4.25596,60358.53,UK,Scotland,Glasgow,Unknown,general tech skills
1,Senior Data Analyst,"Competitive salary | UK/Glasgow: hybrid working model (2-3 days on site) At NTT DATA, we know that with the right people on board, anything is possible. The quality, integrity, and commitment of our employees are key factors in our company’s growth, market presence and our ability to help our clients stay a step ahead of the competition. By hiring the best people and helping them grow both professionally and personally, we ensure a bright future for NTT DATA and for the people who work here. NT…",72252.8,72252.8,55.86026,-4.25596,72252.8,UK,Scotland,Glasgow,Unknown,general tech skills


In [4]:
# Schema summary: index range, per-column non-null counts, dtypes and memory usage
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1783 entries, 0 to 1782
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             1783 non-null   object 
 1   description       1783 non-null   object 
 2   salary_min        1783 non-null   float64
 3   salary_max        1783 non-null   float64
 4   latitude          1783 non-null   float64
 5   longitude         1783 non-null   float64
 6   salary_mid        1783 non-null   float64
 7   country           1783 non-null   object 
 8   region            1783 non-null   object 
 9   county            1783 non-null   object 
 10  city              1783 non-null   object 
 11  skills_extracted  1783 non-null   object 
dtypes: float64(5), object(7)
memory usage: 167.3+ KB


In [5]:
# Count the NaN entries left in each column after cleaning
null_counts = df_clean.isna().sum()
print("\nMissing values after cleaning:", null_counts, sep="\n")


Missing values after cleaning:
title               0
description         0
salary_min          0
salary_max          0
latitude            0
longitude           0
salary_mid          0
country             0
region              0
county              0
city                0
skills_extracted    0
dtype: int64


In [6]:
# Save the fully cleaned data under <project_root>/data/processed/
processed_dir = os.path.join(project_root, 'data', 'processed')
clean_data_path = os.path.join(processed_dir, 'jobs_clean.csv')
save_clean_data(df_clean, clean_data_path)

In [7]:
# Save a 50-row sample of the cleaned data for demonstration purposes.
# NOTE(review): how save_clean_data picks the sample (and whether it is seeded)
# is not visible from this notebook — confirm in src.preprocess.
processed_dir = os.path.join(project_root, 'data', 'processed')
sample_data_path = os.path.join(processed_dir, 'jobs_clean_sample.csv')
save_clean_data(df_clean, sample_data_path, sample_size=50)


In [8]:

# Headline figures: number of postings, plus mean and extremes of the salary midpoint.
# NOTE(review): the recorded output shows a minimum of £12.00, which may mean
# non-annual rates are mixed into salary_mid — confirm upstream.
salary_mid_values = df_clean['salary_mid']
lowest_salary, highest_salary = salary_mid_values.min(), salary_mid_values.max()

print(f"\nNumber of jobs: {len(df_clean)}")
print(f"Average salary: £{salary_mid_values.mean():.2f}")
print(f"Salary range: £{lowest_salary:.2f} - £{highest_salary:.2f}")



Number of jobs: 1783
Average salary: £54516.29
Salary range: £12.00 - £312000.00


In [9]:
# Tally postings per region, most frequent first
region_counts = df_clean['region'].value_counts()

print("\nJobs by region:")
display(region_counts)


Jobs by region:


region
London                      344
South East England          306
North West England          195
Eastern England             191
South West England          171
West Midlands               129
Yorkshire And The Humber    123
Scotland                    111
East Midlands                88
Wales                        64
North East England           38
Northern Ireland             22
Channel Isles                 1
Name: count, dtype: int64