# NZ Job Market Analysis (Tech Roles)

This notebook performs an exploratory analysis of tech job postings in New Zealand.
Replace `data/jobs_sample.csv` with your real dataset when ready.


In [None]:
# Setup
import os
import pathlib
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd

from src.utils import normalize_city, salary_midpoint

# Location of the postings CSV; point this at a different file to analyse
# another dataset.
DATA_PATH = pathlib.Path('data/jobs_sample.csv')

# Read the postings, asking pandas to parse `date_posted` as datetimes.
df = pd.read_csv(
    DATA_PATH,
    parse_dates=['date_posted'],
)

# Plain-text preview of the first five rows as loaded.
print(df.head(5))

## Cleaning

In [None]:
# Basic cleaning
def _split_skills(raw):
    """Split a ';'-separated skills value into stripped, non-empty tokens."""
    stripped = (piece.strip() for piece in str(raw).split(';'))
    return [token for token in stripped if token]


# `normalize_city` and `salary_midpoint` come from src.utils; what they do is
# not visible in this notebook.
df['city'] = df['city'].apply(normalize_city)

# Title-case the seniority labels (e.g. "senior" -> "Senior").
df['seniority'] = df['seniority'].str.title()

# Row-wise: each row is passed to `salary_midpoint` *after* the city and
# seniority columns above have been rewritten.
df['salary_mid'] = df.apply(salary_midpoint, axis=1)

# Missing skills become '' first, so they end up as an empty list.
raw_skills = df['skills'].fillna('')
df['skills_list'] = raw_skills.apply(_split_skills)

# Per-column count of missing values, then a rich preview of the frame.
missing_per_column = df.isna().sum()
print(missing_per_column)
df.head()

## Overview: Roles and Locations

In [None]:
# Role counts
# Tally postings per role, ordered from most to least frequent.
role_counts = df['role'].value_counts()
role_counts = role_counts.sort_values(ascending=False)
print(role_counts)

# Draw the tallies as a bar chart on an explicitly created figure/axes pair.
fig, ax = plt.subplots()
role_counts.plot.bar(ax=ax)
ax.set_title('Role Counts')
ax.set_xlabel('Role')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# City counts
# Number of postings per city, largest first.
city_counts = df['city'].value_counts()
city_counts = city_counts.sort_values(ascending=False)
print(city_counts)

# Bar chart of postings per city.
fig, ax = plt.subplots()
city_counts.plot.bar(ax=ax)
ax.set(title='City Counts', xlabel='City', ylabel='Count')
fig.tight_layout()
plt.show()

## Salary Distributions

In [None]:
# Salary midpoint by role
# Summary statistics (count, mean, std, quartiles, ...) of the midpoint
# salary within each role.
salary_by_role = df.groupby('role')['salary_mid'].describe()
print(salary_by_role)

# `DataFrame.boxplot(by=...)` opens a brand-new figure unless it is handed an
# Axes, so a bare `plt.figure()` before it is left behind as an empty extra
# figure in the cell output. Create the figure/axes pair explicitly and pass
# `ax=` so exactly one figure is drawn.
fig, ax = plt.subplots()
df.boxplot(column='salary_mid', by='role', rot=45, ax=ax)
ax.set_title('Salary Midpoint by Role')
fig.suptitle('')  # drop the automatic "Boxplot grouped by role" heading
ax.set_xlabel('Role')
ax.set_ylabel('Salary (NZD)')
fig.tight_layout()
plt.show()

## Skills Frequency

In [None]:
# Flatten skills
# Count how often each skill occurs across every posting's `skills_list`.
# A generator feeds the Counter directly, so no intermediate flat list is built.
skills = Counter(skill for row in df['skills_list'] for skill in row)

# Ten most frequent skills as a Series (index = skill, value = occurrences).
top_skills = pd.Series(dict(skills.most_common(10)))

print(top_skills)

# Bar chart of the ten most frequent skills.
fig, ax = plt.subplots()
top_skills.plot.bar(ax=ax)
ax.set_title('Top Skills (Top 10)')
ax.set_xlabel('Skill')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

## Time Trend (Postings per Day)

In [None]:
# Postings over time
# Number of postings for each distinct `date_posted` value.
daily = df.groupby(by='date_posted').size()
print(daily)

# Line chart of the counts, one marker per `date_posted` value present.
fig, ax = plt.subplots()
daily.plot.line(ax=ax, marker='o')
ax.set(title='Job Postings Over Time', xlabel='Date', ylabel='Count')
fig.tight_layout()
plt.show()

## Save Outputs

In [None]:
# Save cleaned dataset & figures (optional)
# Output locations; both directories are created if they do not exist yet.
out_data = pathlib.Path('data/processed')
fig_dir = pathlib.Path('figures')
out_data.mkdir(parents=True, exist_ok=True)
fig_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): `fig_dir` is created but nothing in this cell writes a figure
# into it -- confirm whether figure export is still planned.

# Build the destination once so the file written and the path reported below
# cannot drift apart.
# `skills_list` holds Python lists; they are written to the CSV as their
# string representation.
cleaned_csv = out_data / 'jobs_cleaned.csv'
df.to_csv(cleaned_csv, index=False)
print('Saved:', cleaned_csv)