#### notebooks/01_Data_Collection.ipynb

In [11]:
# Import libraries
import os
import sys
import pandas as pd
# Show full cell contents when rendering DataFrames (no column-width truncation).
# The option only needs to be set once per kernel session.
pd.set_option('display.max_colwidth', None)
from IPython.display import display

from dotenv import load_dotenv

# Make the project root importable so the `src` package below can be resolved.
# The root is computed as the parent of the current working directory, so this
# relies on the kernel having been started from inside the notebooks folder.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '..')
)
# Register the root only once, so re-running this cell does not pile up duplicates.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import the function from the module
from src.data_collection import fetch_multiple_categories 
from src.config import SKILLS_LIST, JOB_CATEGORIES, CLUSTER_COLORS

In [12]:
# Load the Adzuna API credentials.
# load_dotenv is pointed at <project_root>/.env; the two values are then read
# from the process environment and are None when a variable is not set.
# Do not print or display these values: notebook outputs are saved with the file.
load_dotenv(os.path.join(project_root, '.env'))
APP_ID, APP_KEY = os.environ.get('ADZUNA_APP_ID'), os.environ.get('ADZUNA_APP_KEY')

In [13]:
# Validate credentials: stop here, before the fetch cell runs, when either
# value is missing or empty. The message names the exact environment variables
# that are absent so the .env file can be corrected without guessing.
missing_credentials = [
    env_name
    for env_name, value in (('ADZUNA_APP_ID', APP_ID), ('ADZUNA_APP_KEY', APP_KEY))
    if not value
]
if missing_credentials:
    raise RuntimeError(
        f"Missing {' and '.join(missing_credentials)}. Check your .env file."
    )

In [14]:
# Define the search queries.
# `queries` is bound to the JOB_CATEGORIES object imported from src.config
# (an alias, not a copy); it is what gets passed to the fetch call.
queries = JOB_CATEGORIES

In [15]:

# Fetch all jobs for the categories in `queries`, passing the two credentials
# that were read from the environment.
# NOTE(review): presumably this issues live API requests, so the result may
# differ between runs — confirm in src.data_collection.
# The result is used as a pandas DataFrame by the cells that follow
# (head / columns / to_csv / sample / shape are called on it).
df = fetch_multiple_categories(queries, APP_ID, APP_KEY)

Fetching data jobs...
Fetching ai jobs...
Fetching healthcare jobs...
Fetched 2967 unique jobs across all categories


In [16]:
# Quick sanity check on the fetched data: report the row count, then preview
# the first three rows via the cell's rich display.
print(
    f"Final dataset: {len(df)} unique jobs",
    "\nFirst few rows:",
    sep="\n",
)
df.head(n=3)

Final dataset: 2967 unique jobs

First few rows:


Unnamed: 0,company,salary_max,salary_is_predicted,title,description,category,location,__CLASS__,created,salary_min,id,redirect_url,contract_time,adref,job_type,longitude,latitude,contract_type
0,"{'__CLASS__': 'Adzuna::API::Response::Company', 'display_name': 'VAST Data'}",58175.97,1,Senior Systems Engineer,"VAST Data is looking to hire a Senior Systems Engineer! This is a great opportunity to be part of one of the fastest-growing infrastructure companies in history, an organization that is in the center of the hurricane being created by the revolution in artificial intelligence. ""VASTs data management vision is the future of the market.""- Forbes VAST Data is the data platform company for the AI era. We are building the enterprise software infrastructure to capture, catalog, refine, enrich, and pro…","{'__CLASS__': 'Adzuna::API::Response::Category', 'label': 'IT Jobs', 'tag': 'it-jobs'}","{'display_name': 'London, UK', '__CLASS__': 'Adzuna::API::Response::Location', 'area': ['UK', 'London']}",Adzuna::API::Response::Job,2025-08-30T14:53:47Z,58175.97,5375120328,https://www.adzuna.co.uk/jobs/details/5375120328?utm_medium=api&utm_source=2ffe0866,full_time,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTM3NTEyMDMyOCIsInMiOiJZcnFIZlVTbjhCRzR2dlBVblhFbVpBIn0.pr-oQA8ijM13Pr6GahDaV8iE3OI5KEpjnRO_VGDtQts,data,,,
1,"{'display_name': 'VAST Data', '__CLASS__': 'Adzuna::API::Response::Company'}",54171.54,1,Sales Engineer,"This is a great opportunity to be part of one of the fastest-growing infrastructure companies in history, an organization that is in the centre of the hurricane being created by the revolution in artificial intelligence. ""VASTs data management vision is the future of the market.""- Forbes VAST Data is the data platform company for the AI era. We are building the enterprise software infrastructure to capture, catalog, refine, enrich, and protect massive datasets and make them available for real-t…","{'__CLASS__': 'Adzuna::API::Response::Category', 'label': 'IT Jobs', 'tag': 'it-jobs'}","{'area': ['UK', 'London'], 'display_name': 'London, UK', '__CLASS__': 'Adzuna::API::Response::Location'}",Adzuna::API::Response::Job,2025-09-03T14:19:03Z,54171.54,5381176581,https://www.adzuna.co.uk/jobs/details/5381176581?utm_medium=api&utm_source=2ffe0866,full_time,eyJhbGciOiJIUzI1NiJ9.eyJzIjoiWXJxSGZVU244Qkc0dnZQVW5YRW1aQSIsImkiOiI1MzgxMTc2NTgxIn0.R6EfeR8RUWUFIJf0Q2KWSj18C44l3ln1VvG2uusOOi0,data,,,
2,"{'__CLASS__': 'Adzuna::API::Response::Company', 'display_name': 'VAST Data'}",60681.4,1,Sales Engineer,"This is a great opportunity to be part of one of the fastest-growing infrastructure companies in history, an organization that is in the centre of the hurricane being created by the revolution in artificial intelligence. ""VASTs data management vision is the future of the market.""- Forbes VAST Data is the data platform company for the AI era. We are building the enterprise software infrastructure to capture, catalog, refine, enrich, and protect massive datasets and make them available for real-t…","{'label': 'IT Jobs', '__CLASS__': 'Adzuna::API::Response::Category', 'tag': 'it-jobs'}","{'area': ['UK'], 'display_name': 'UK', '__CLASS__': 'Adzuna::API::Response::Location'}",Adzuna::API::Response::Job,2025-09-03T14:19:03Z,60681.4,5381176580,https://www.adzuna.co.uk/jobs/details/5381176580?utm_medium=api&utm_source=2ffe0866,full_time,eyJhbGciOiJIUzI1NiJ9.eyJpIjoiNTM4MTE3NjU4MCIsInMiOiJZcnFIZlVTbjhCRzR2dlBVblhFbVpBIn0.ITpGfFXMvdJzdfddQNdhxqGhONntxaffMiT8NOEs__U,data,,,


In [17]:
# Data columns: display the column labels of the fetched frame to see which
# fields are available.
df.columns

Index(['company', 'salary_max', 'salary_is_predicted', 'title', 'description',
       'category', 'location', '__CLASS__', 'created', 'salary_min', 'id',
       'redirect_url', 'contract_time', 'adref', 'job_type', 'longitude',
       'latitude', 'contract_type'],
      dtype='object')

In [18]:
# Persist the FULL raw dataset (this is the file the real analysis reads).
full_data_path = os.path.join(project_root, 'data', 'raw', 'jobs_raw_full.csv')
# Create the target folder on first run; exist_ok makes re-runs a no-op.
os.makedirs(os.path.split(full_data_path)[0], exist_ok=True)
# index=False keeps the DataFrame index out of the CSV.
df.to_csv(path_or_buf=full_data_path, index=False)

In [19]:
# Save a small random sample of the raw data for future use/reference.
# random_state is fixed so the same rows are drawn from the same input frame.
SAMPLE_SIZE = 50
# Cap the request at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the row count and sampling is without replacement.
random_sample = (
    df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    .reset_index(drop=True)
)

# Build the output path once so the directory that gets created is guaranteed
# to be the one the file is written into.
sample_path = os.path.join(project_root, 'data', 'raw', 'jobs_raw.csv')
os.makedirs(os.path.dirname(sample_path), exist_ok=True)
random_sample.to_csv(sample_path, index=False)
print("Sample data saved to ../data/raw/jobs_raw.csv")

Sample data saved to ../data/raw/jobs_raw.csv


In [20]:
# Final sanity check: (rows, columns) of the full fetched dataset.
df.shape

(2967, 18)