# Tables for constructing a star schema


In [76]:
import pandas as pd
import numpy as np

# Lift pandas' display limits so the very wide frames printed below are not truncated.
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': 1000,
    'display.max_colwidth': None,
}.items():
    pd.set_option(option_name, option_value)

In [77]:
# Load the cleaned dataset that every dimension table and the fact table are derived from.
CLEANED_CSV = "../cleaned data/cleaned_df.csv"
df = pd.read_csv(CLEANED_CSV)
print(df.head(n=10))

      id                                 title                       company       type                                industry           created_at  is_active sub_region      region salary_bucket accredited_label  Excel  Power BI  Tableau  Looker  Qlik  SPSS  Stata  Python  R  Java  JavaScript  TypeScript  C++  C  Ruby  PHP  Scala  SQL  NoSQL  MySQL  PostgreSQL  SQLite  MongoDB  Oracle  T-SQL  PL/SQL  AWS  Azure  Google Cloud  GCP  Firebase  Snowflake  Databricks  BigQuery  Git  GitHub  Docker  Kubernetes  Jenkins  VS Code  IntelliJ  Eclipse  Confluence  Jira  Selenium  Postman  Cypress  JUnit  TestNG  Pytest  Mocha  Chai  scikit-learn  TensorFlow  Keras  PyTorch  OpenCV  Pandas  NumPy  Airflow  Apache Spark  Kafka  Talend  Informatica  HTML  CSS  React  Angular  Vue.js  Bootstrap  jQuery  SASS  REST API  GraphQL  Salesforce  Microsoft Office  Outlook  SAP  Xero  QuickBooks  Dynamics 365
0  21070                 Deafblind Coordinator           Blind Low Vision NZ  Full time        Com

## Dimension Tables

### 1. dim_skills

In [78]:
# Take every column from 'Excel' through the last column of df as job_table.
# .copy() makes job_table an independent frame: the next cell adds columns to it, and
# assigning into a bare iloc slice of df can trigger SettingWithCopyWarning.
start_col = 'Excel'
if start_col in df.columns:
    start_idx = df.columns.get_loc(start_col)
    job_table = df.iloc[:, start_idx:].copy()
    print(job_table.head(10))


   Excel  Power BI  Tableau  Looker  Qlik  SPSS  Stata  Python  R  Java  JavaScript  TypeScript  C++  C  Ruby  PHP  Scala  SQL  NoSQL  MySQL  PostgreSQL  SQLite  MongoDB  Oracle  T-SQL  PL/SQL  AWS  Azure  Google Cloud  GCP  Firebase  Snowflake  Databricks  BigQuery  Git  GitHub  Docker  Kubernetes  Jenkins  VS Code  IntelliJ  Eclipse  Confluence  Jira  Selenium  Postman  Cypress  JUnit  TestNG  Pytest  Mocha  Chai  scikit-learn  TensorFlow  Keras  PyTorch  OpenCV  Pandas  NumPy  Airflow  Apache Spark  Kafka  Talend  Informatica  HTML  CSS  React  Angular  Vue.js  Bootstrap  jQuery  SASS  REST API  GraphQL  Salesforce  Microsoft Office  Outlook  SAP  Xero  QuickBooks  Dynamics 365
0      0         0        0       0     0     0      0       0  0     0           0           0    0  0     0    0      0    0      0      0           0       0        0       0      0       0    0      0             0    0         0          0           0         0    0       0       0           0        0  

In [79]:
# Give every distinct combination of skill-column values its own skill_id and keep one row
# per id; the result is the skills dimension table.
if 'Excel' in df.columns:
    # job_table was produced by slicing df. Rebinding it to an independent copy means the
    # column assignments below never write into a slice (avoids SettingWithCopyWarning).
    job_table = job_table.copy()
    # Read the skill columns from job_table itself and skip the helper columns added below,
    # so re-running this cell does not depend on which columns df has gained in the meantime.
    skill_cols = [col for col in job_table.columns if col not in ('skill_combo', 'skill_id')]
    # Serialise each row's values into one '-'-joined string so equal combinations compare equal.
    job_table['skill_combo'] = job_table[skill_cols].astype(str).agg('-'.join, axis=1)
    # Number the combinations 1..n in order of first appearance.
    combo2id = {combo: idx + 1 for idx, combo in enumerate(job_table['skill_combo'].unique())}
    job_table['skill_id'] = job_table['skill_combo'].map(combo2id)
    # Keep one representative row per skill_id, with skill_id as the leading column.
    dim_skills = job_table.drop_duplicates(subset=['skill_id'])[['skill_id'] + list(skill_cols)]
    print(dim_skills.head(10))
else:
    print('未找到Excel列，无法生成技能组合id')

    skill_id  Excel  Power BI  Tableau  Looker  Qlik  SPSS  Stata  Python  R  Java  JavaScript  TypeScript  C++  C  Ruby  PHP  Scala  SQL  NoSQL  MySQL  PostgreSQL  SQLite  MongoDB  Oracle  T-SQL  PL/SQL  AWS  Azure  Google Cloud  GCP  Firebase  Snowflake  Databricks  BigQuery  Git  GitHub  Docker  Kubernetes  Jenkins  VS Code  IntelliJ  Eclipse  Confluence  Jira  Selenium  Postman  Cypress  JUnit  TestNG  Pytest  Mocha  Chai  scikit-learn  TensorFlow  Keras  PyTorch  OpenCV  Pandas  NumPy  Airflow  Apache Spark  Kafka  Talend  Informatica  HTML  CSS  React  Angular  Vue.js  Bootstrap  jQuery  SASS  REST API  GraphQL  Salesforce  Microsoft Office  Outlook  SAP  Xero  QuickBooks  Dynamics 365
0          1      0         0        0       0     0     0      0       0  0     0           0           0    0  0     0    0      0    0      0      0           0       0        0       0      0       0    0      0             0    0         0          0           0         0    0       0       0 

In [80]:
# Write the skills dimension to CSV in the working directory, omitting the index column.
dim_skills.to_csv(path_or_buf='dim_skills.csv', index=False)

### 2. dim_locations

In [81]:
# Location dimension: one row per distinct (sub_region, region) pair present in df.
dim_locations = df[['sub_region', 'region']].drop_duplicates(ignore_index=True)
# Surrogate key 1..n in order of first appearance, inserted as the leading column.
dim_locations.insert(0, 'location_id', np.arange(1, len(dim_locations) + 1))
print(dim_locations.head(20))

    location_id    sub_region         region
0             1       Unknown       Auckland
1             2       Unknown     Canterbury
2             3       Unknown          Otago
3             4       Unknown     Wellington
4             5       Unknown       Taranaki
5             6       Unknown       Manawatu
6             7       Unknown        Waikato
7             8       Unknown      Northland
8             9       Unknown  Bay of Plenty
9            10       Unknown      Southland
10           11       Unknown       Gisborne
11           12       Unknown     Hawkes Bay
12           13       Unknown         Tasman
13           14       Unknown     West Coast
14           15       Unknown    Marlborough
15           16       Unknown    New Zealand
16           17        Hornby     Canterbury
17           18      Onehunga       Auckland
18           19       Porirua     Wellington
19           20  Auckland CBD       Auckland


In [82]:
# Write the location dimension to CSV in the working directory, omitting the index column.
dim_locations.to_csv(path_or_buf='dim_locations.csv', index=False)

### 3. dim_titles

In [83]:
# Title dimension: one row per distinct value of the title column.
dim_titles = df[['title']].drop_duplicates(ignore_index=True)
# Surrogate key 1..n in order of first appearance, inserted as the leading column.
dim_titles.insert(0, 'title_id', np.arange(1, len(dim_titles) + 1))
print(dim_titles.head(10))

   title_id                                 title
0         1                 Deafblind Coordinator
1         2  Site supervisor for structural steel
2         3      Project manager structural steel
3         4          Medical Device Kit Assembler
4         5                Holiday Park Assistant
5         6             Venue Manager - Bakehouse
6         7                 Senior Analyst Tester
7         8                        Analyst Tester
8         9                   Construction Lawyer
9        10               Legal Executive Estates


In [84]:
# Write the title dimension to CSV in the working directory, omitting the index column.
dim_titles.to_csv(path_or_buf='dim_titles.csv', index=False)

### 4. dim_companies

In [85]:
# Company dimension: one row per distinct value of the company column.
dim_companies = df[['company']].drop_duplicates(ignore_index=True)
# Surrogate key 1..n in order of first appearance, inserted as the leading column.
dim_companies.insert(0, 'company_id', np.arange(1, len(dim_companies) + 1))
print(dim_companies.head(10))

   company_id                       company
0           1           Blind Low Vision NZ
1           2  Grayson Engineering 2015 Ltd
2           3                       Stryker
3           4       Lakes Edge Holiday Park
4           5      Ayrburn Precinct Limited
5           6     FNZ Services (NZ) Limited
6           7         Forte Recruitment Ltd
7           8         Nicholsons Solicitors
8           9            Star Personnel Ltd
9          10         McAlpine Hussmann Ltd


In [86]:
# Write the company dimension to CSV in the working directory, omitting the index column.
dim_companies.to_csv(path_or_buf='dim_companies.csv', index=False)

### 5. dim_job_type

In [87]:
# Job-type dimension: one row per distinct value of the type column.
dim_job_type = df[['type']].drop_duplicates(ignore_index=True)
# Surrogate key 1..n in order of first appearance, inserted as the leading column.
dim_job_type.insert(0, 'job_type_id', np.arange(1, len(dim_job_type) + 1))
print(dim_job_type.head(10))

   job_type_id             type
0            1        Full time
1            2        Part time
2            3    Contract/Temp
3            4  Casual/Vacation


In [88]:
# Write the job-type dimension to CSV in the working directory, omitting the index column.
dim_job_type.to_csv(path_or_buf='dim_job_type.csv', index=False)

### 6. dim_industries

In [89]:
# Industry dimension: one row per distinct value of the industry column.
dim_industries = df[['industry']].drop_duplicates(ignore_index=True)
# Surrogate key 1..n in order of first appearance, inserted as the leading column.
dim_industries.insert(0, 'industry_id', np.arange(1, len(dim_industries) + 1))
print(dim_industries.head(10))

   industry_id                                industry
0            1        Community Services & Development
1            2                            Construction
2            3                    Healthcare & Medical
3            4                   Hospitality & Tourism
4            5  Information & Communication Technology
5            6                                   Legal
6            7    Manufacturing, Transport & Logistics
7            8                       Trades & Services
8            9           Human Resources & Recruitment
9           10            Banking & Financial Services


In [90]:
# Write the industry dimension to CSV in the working directory, omitting the index column.
dim_industries.to_csv(path_or_buf='dim_industries.csv', index=False)

### 7. dim_datetime

In [91]:
# Parse the posting timestamps; values that cannot be parsed become NaT.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

# One calendar row per day, from the earliest created_at date up to the day this cell runs.
start_date = df['created_at'].min().normalize()
end_date = pd.Timestamp.today().normalize()
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Build the dimension with its columns already in their final order.
days = pd.Series(date_range)
dim_datetime = pd.DataFrame({
    'datetime_id': days.dt.strftime('%Y%m%d').astype(int),  # integer key in YYYYMMDD form
    'date': days,
    'year': days.dt.year,
    'quarter': days.dt.quarter,
    'month': days.dt.month,
    'week': days.dt.isocalendar().week,  # ISO calendar week number
    'day': days.dt.day,
    'weekday': days.dt.weekday + 1,  # 1=Monday
})

print(dim_datetime.head(10))

   datetime_id       date  year  quarter  month  week  day  weekday
0     20240824 2024-08-24  2024        3      8    34   24        6
1     20240825 2024-08-25  2024        3      8    34   25        7
2     20240826 2024-08-26  2024        3      8    35   26        1
3     20240827 2024-08-27  2024        3      8    35   27        2
4     20240828 2024-08-28  2024        3      8    35   28        3
5     20240829 2024-08-29  2024        3      8    35   29        4
6     20240830 2024-08-30  2024        3      8    35   30        5
7     20240831 2024-08-31  2024        3      8    35   31        6
8     20240901 2024-09-01  2024        3      9    35    1        7
9     20240902 2024-09-02  2024        3      9    36    2        1


In [92]:
# Write the date dimension to CSV in the working directory, omitting the index column.
dim_datetime.to_csv(path_or_buf='dim_datetime.csv', index=False)

### 8. dim_salary

In [93]:
# Salary dimension: one row per distinct value of the salary_bucket column.
dim_salary = df[['salary_bucket']].drop_duplicates(ignore_index=True)
# Surrogate key 1..n in order of first appearance, inserted as the leading column.
dim_salary.insert(0, 'salary_id', np.arange(1, len(dim_salary) + 1))
print(dim_salary.head(10))

   salary_id salary_bucket
0          1    Negotiable
1          2         Other
2          3      50k-100k
3          4         100k+
4          5         0-50k
5          6   Competitive


In [94]:
# Write the salary dimension to CSV in the working directory, omitting the index column.
dim_salary.to_csv(path_or_buf='dim_salary.csv', index=False)

### 9. dim_accredited

In [95]:
# Accreditation dimension: one row per distinct value of the accredited_label column.
dim_accredited = df[['accredited_label']].drop_duplicates(ignore_index=True)
# Surrogate key 1..n in order of first appearance, inserted as the leading column.
dim_accredited.insert(0, 'accredited_id', np.arange(1, len(dim_accredited) + 1))
print(dim_accredited.head(10))

   accredited_id accredited_label
0              1   Not Accredited
1              2       Accredited
2              3          Unknown


In [96]:
# Write the accreditation dimension to CSV in the working directory, omitting the index column.
dim_accredited.to_csv(path_or_buf='dim_accredited.csv', index=False)

## Fact table

### fact_jobs

In [101]:
# Build the fact table on a copy so df keeps its original columns. Other cells locate the
# skill columns as "everything from 'Excel' to the last column", which stops being true
# once helper columns are appended to df.
jobs = df.copy()

# Date key in YYYYMMDD form. Unparseable dates are coerced to NaT, so the key uses the
# nullable Int64 dtype; a plain int cast would raise on the resulting missing values.
jobs['created_at'] = pd.to_datetime(jobs['created_at'], errors='coerce')
jobs['datetime_id'] = pd.to_numeric(jobs['created_at'].dt.strftime('%Y%m%d')).astype('Int64')

# Look skill_id up in dim_skills instead of renumbering the combinations here. The skill
# columns are taken from dim_skills, so the key is built from exactly the columns the
# dimension uses and never picks up helper columns such as datetime_id.
skill_cols = [col for col in dim_skills.columns if col != 'skill_id']


def _skill_combo(frame):
    """Serialise each row's skill columns into one '-'-joined string key."""
    return frame[skill_cols].astype(str).agg('-'.join, axis=1)


combo2id = dict(zip(_skill_combo(dim_skills), dim_skills['skill_id']))
jobs['skill_id'] = _skill_combo(jobs).map(combo2id)

# Attach each dimension's surrogate key via its natural key column(s).
# validate='many_to_one' makes pandas raise if a dimension holds duplicate keys, which
# would otherwise silently duplicate fact rows.
dimension_keys = [
    (dim_titles, ['title']),
    (dim_companies, ['company']),
    (dim_locations, ['sub_region', 'region']),
    (dim_job_type, ['type']),
    (dim_industries, ['industry']),
    (dim_salary, ['salary_bucket']),
    (dim_accredited, ['accredited_label']),
]
fact_jobs = jobs
for dimension, key_cols in dimension_keys:
    fact_jobs = fact_jobs.merge(dimension, on=key_cols, how='left', validate='many_to_one')

# Keep only the id columns and the measure column the fact table needs.
fact_jobs = fact_jobs[[
    'id', 'title_id', 'company_id', 'location_id', 'job_type_id', 'industry_id',
    'salary_id', 'accredited_id', 'datetime_id', 'skill_id', 'is_active'
]]

print(fact_jobs.head(20))

       id  title_id  company_id  location_id  job_type_id  industry_id  salary_id  accredited_id  datetime_id  skill_id  is_active
0   21070         1           1            1            1            1          1              1     20240824         1          0
1   21840         2           2            1            1            2          1              1     20240824         2          0
2   22213         3           2            1            1            2          1              1     20240824         3          0
3   26397         4           3            1            1            3          2              1     20240824         2          0
4   27671         5           4            2            1            4          1              1     20240824         2          0
5   27946         6           5            3            1            4          1              2     20240824         2          0
6   29815         7           6            1            1            5          1  

In [102]:
# Write the fact table to CSV in the working directory, omitting the index column.
fact_jobs.to_csv(path_or_buf='fact_jobs.csv', index=False)