# Tables for constructing a star schema


In [1]:
import pandas as pd
import numpy as np

# Display configuration: print wide frames without truncation.
pd.options.display.max_columns = None   # no limit on the number of columns shown
pd.options.display.width = 1000         # allow up to 1000 characters per output line
pd.options.display.max_colwidth = None  # never truncate the contents of a cell

In [2]:
# Read the cleaned dataset from disk; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="../cleaned data/cleaned_df.csv")
# Preview the first ten rows as plain text.
print(df.head(n=10))

                                  title                       company    location       type                                industry  is_active salary_bucket  year  month  day  week  hour  minute  second accredited_label  Excel  Power BI  Tableau  Looker  Qlik  SPSS  Stata  Python  R  Java  JavaScript  TypeScript  C++  C  Ruby  PHP  Scala  SQL  NoSQL  MySQL  PostgreSQL  SQLite  MongoDB  Oracle  T-SQL  PL/SQL  AWS  Azure  Google Cloud  GCP  Firebase  Snowflake  Databricks  BigQuery  Git  GitHub  Docker  Kubernetes  Jenkins  VS Code  IntelliJ  Eclipse  Confluence  Jira  Selenium  Postman  Cypress  JUnit  TestNG  Pytest  Mocha  Chai  scikit-learn  TensorFlow  Keras  PyTorch  OpenCV  Pandas  NumPy  Airflow  Apache Spark  Kafka  Talend  Informatica  HTML  CSS  React  Angular  Vue.js  Bootstrap  jQuery  SASS  REST API  GraphQL  Salesforce  Microsoft Office  Outlook  SAP  Xero  QuickBooks  Dynamics 365
0                 Deafblind Coordinator           Blind Low Vision NZ    Auckland  Full tim

## Dimension Tables

### 1. Job_table (skills dimension: `dim_skills`)

In [3]:
# Build job_table from the skill columns: every column from 'Excel' through
# the last column of df.
start_col = 'Excel'
if start_col in df.columns:
    start_idx = df.columns.get_loc(start_col)
    # Take an explicit copy: a later cell adds columns to job_table, and
    # writing into a bare iloc slice of df raises SettingWithCopyWarning on
    # pandas versions without Copy-on-Write.
    job_table = df.iloc[:, start_idx:].copy()
    print(job_table.head(10))
else:
    # Report the problem instead of silently leaving job_table undefined.
    print(f"Column {start_col!r} not found in df; job_table was not created")


   Excel  Power BI  Tableau  Looker  Qlik  SPSS  Stata  Python  R  Java  JavaScript  TypeScript  C++  C  Ruby  PHP  Scala  SQL  NoSQL  MySQL  PostgreSQL  SQLite  MongoDB  Oracle  T-SQL  PL/SQL  AWS  Azure  Google Cloud  GCP  Firebase  Snowflake  Databricks  BigQuery  Git  GitHub  Docker  Kubernetes  Jenkins  VS Code  IntelliJ  Eclipse  Confluence  Jira  Selenium  Postman  Cypress  JUnit  TestNG  Pytest  Mocha  Chai  scikit-learn  TensorFlow  Keras  PyTorch  OpenCV  Pandas  NumPy  Airflow  Apache Spark  Kafka  Talend  Informatica  HTML  CSS  React  Angular  Vue.js  Bootstrap  jQuery  SASS  REST API  GraphQL  Salesforce  Microsoft Office  Outlook  SAP  Xero  QuickBooks  Dynamics 365
0      0         0        0       0     0     0      0       0  0     0           0           0    0  0     0    0      0    0      0      0           0       0        0       0      0       0    0      0             0    0         0          0           0         0    0       0       0           0        0  

In [4]:
# Assign a skill_id to every distinct skill combination and keep exactly one
# row per id, producing the skills dimension table (dim_skills).
if 'Excel' in df.columns:
    # All skill columns: from 'Excel' through the last column of df.
    skill_cols = df.columns[df.columns.get_loc('Excel'):]
    # Serialise each row's skill values into a single '-'-joined string so a
    # whole combination can be compared as one key.
    # assign() returns a new frame instead of writing into job_table, which
    # was sliced out of df; in-place column assignment on that slice raises
    # SettingWithCopyWarning on pandas versions without Copy-on-Write.
    job_table = job_table.assign(
        skill_combo=job_table[skill_cols].astype(str).agg('-'.join, axis=1)
    )
    # Number the distinct combinations 1, 2, 3, ... in order of first appearance.
    combo2id = {
        combo: idx
        for idx, combo in enumerate(job_table['skill_combo'].unique(), start=1)
    }
    job_table = job_table.assign(skill_id=job_table['skill_combo'].map(combo2id))
    # Keep the first row of each skill_id: one row per combination.
    dim_skills = job_table.drop_duplicates(subset=['skill_id'])[['skill_id'] + list(skill_cols)]
    print(dim_skills.head(10))
else:
    print('未找到Excel列，无法生成技能组合id')

    skill_id  Excel  Power BI  Tableau  Looker  Qlik  SPSS  Stata  Python  R  Java  JavaScript  TypeScript  C++  C  Ruby  PHP  Scala  SQL  NoSQL  MySQL  PostgreSQL  SQLite  MongoDB  Oracle  T-SQL  PL/SQL  AWS  Azure  Google Cloud  GCP  Firebase  Snowflake  Databricks  BigQuery  Git  GitHub  Docker  Kubernetes  Jenkins  VS Code  IntelliJ  Eclipse  Confluence  Jira  Selenium  Postman  Cypress  JUnit  TestNG  Pytest  Mocha  Chai  scikit-learn  TensorFlow  Keras  PyTorch  OpenCV  Pandas  NumPy  Airflow  Apache Spark  Kafka  Talend  Informatica  HTML  CSS  React  Angular  Vue.js  Bootstrap  jQuery  SASS  REST API  GraphQL  Salesforce  Microsoft Office  Outlook  SAP  Xero  QuickBooks  Dynamics 365
0          1      0         0        0       0     0     0      0       0  0     0           0           0    0  0     0    0      0    0      0      0           0       0        0       0      0       0    0      0             0    0         0          0           0         0    0       0       0 

In [5]:
# Write the skills dimension table to CSV in the working directory, without
# the DataFrame index column.
dim_skills.to_csv(path_or_buf='dim_skills.csv', index=False)

### 2. 