# Part 1: Data Preparation

 - Objective: Analyze changes in Data Science and Analytics (DSA) job content over the past 10 months in Taipei.

 - Data Collection: Compilation of job data in Taipei, focusing on positions related to Data Science and Analytics.

 - Data Scope: The dataset includes some positions that may not be directly relevant to DSA.

 - Initial Step: Filter out job postings that are not pertinent to DSA.
 
 - Analysis Method: Employ dynamic topic modeling to identify shifts in the content of DSA job descriptions during the 10-month period.

# Environment Setup

In [None]:
import utils, json
import pandas as pd
import numpy as np

In [2]:
# Create the BigQuery client that the query cell passes to utils.get_bigquery_query.
# NOTE(review): "job" is forwarded to a project helper; presumably it selects a
# credential/config profile — confirm in utils.
bigquery_client = utils.get_bigquery_client("job")

# Data Acquisition

## From BigQuery

In [3]:
# SQL used to pull the job postings for this analysis.
#   * raw_data: numbers the rows of each `id` by `update_date` descending.
#   * latest:   keeps only row 1 per `id`, i.e. the most recently updated record.
#   * final select: returns id, job_name and job_desc, keeping rows whose salary
#     text contains 面議 (negotiable), 月薪 (monthly salary) or 年薪 (annual salary)
#     and dropping rows whose lower-cased job_name contains 實習 (internship),
#     工讀 (work-study), 打工 (part-time job), 接案 (freelance) or the English
#     part-time / intern keywords.
# NOTE(review): this is a plain string (not an f-string), so the
# `{{project}}.{{dataset}}.{{datatable}}` placeholders are sent to the helper
# verbatim; presumably utils.get_bigquery_query substitutes them — confirm.
query = """
with
  raw_data as 
    (
      select
        *,
        row_number() over(partition by id order by update_date desc) as order_id
      from `{{project}}.{{dataset}}.{{datatable}}`
    ),
  latest as 
    (
      select
        * except(order_id)
      from raw_data
      where order_id = 1
    )
  select
    id,
    job_name,
    job_desc
  from latest
  where ( salary like any ("%面議%", "%月薪%", "%年薪%") ) 
    and lower(job_name) not like any ("%實習%", "%工讀%", "%打工%", "%接案%", "%parttime%", "%part time%", "%part-time%", "%intern%")
"""
# Run the query through the project helper with the client created earlier.
# NOTE(review): the result is later used as a pandas DataFrame (.info(), .to_csv()).
data = utils.get_bigquery_query(query, bigquery_client)

In [4]:
# Sanity-check the query result by printing its summary.
# NOTE(review): assumes `data` is a pandas DataFrame, so this shows columns,
# non-null counts and dtypes — confirm the helper's return type.
data.info()

In [5]:
# Snapshot the query result to CSV (index=False omits the row index column)
# so the data can be reloaded from disk instead of re-running the query.
data.to_csv("data/20231119_job_market_job.csv", index=False)

## Get Embeddings

In [6]:
# Split the job titles into chunks via the project helper (2000 is presumably
# the chunk size — confirm in utils), request embeddings chunk by chunk, and
# keep only the `.embedding` attribute of every item returned.
list_job_name = utils.get_splited_list(data["job_name"].tolist(), 2000)
list_job_name_embeddings = [
    item.embedding
    for batch in list_job_name
    for item in utils.get_openai_embedding(batch)
]

In [7]:
# Cache the embeddings on disk as JSON. The context manager guarantees the file
# is flushed and closed as soon as the dump finishes, so the complete file is
# on disk before anything reads it back.
with open(
    "data/20231119_job_market_job_name_embeddings.json", "w", encoding="utf-8"
) as fp:
    json.dump(list_job_name_embeddings, fp)

## Load

In [8]:
# Reload the cached job postings and their job-name embeddings from disk.
data = pd.read_csv("data/20231119_job_market_job.csv")
# Open the JSON cache in a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(
    "data/20231119_job_market_job_name_embeddings.json", "r", encoding="utf-8"
) as fp:
    list_job_name_embeddings = json.load(fp)