# Data Cleaning and Preprocessing

In [1]:
# Import packages
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the CSV file
df_seek = pd.read_csv("../step1_connect db and ETL strategy/job_seek_new.csv")

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

print(df_seek.head())

      id                                 title                       company    location       type  \
0  21070                 Deafblind Coordinator           Blind Low Vision NZ    Auckland  Full time   
1  21840  Site supervisor for structural steel  Grayson Engineering 2015 Ltd    Auckland  Full time   
2  22213      Project manager structural steel  Grayson Engineering 2015 Ltd    Auckland  Full time   
3  26397          Medical Device Kit Assembler                       Stryker    Auckland  Full time   
4  27671                Holiday Park Assistant       Lakes Edge Holiday Park  Canterbury  Full time   

                                                                                                                                                                                                                                                                                                                                                                                             

In [3]:
# Check DataFrame info
df_seek.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95863 entries, 0 to 95862
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   95863 non-null  int64  
 1   title                95863 non-null  object 
 2   company              95863 non-null  object 
 3   location             95863 non-null  object 
 4   type                 95863 non-null  object 
 5   description          95859 non-null  object 
 6   url                  95863 non-null  object 
 7   industry             95863 non-null  object 
 8   salary               95858 non-null  object 
 9   hour_rate            35574 non-null  float64
 10  weekly_hours         0 non-null      float64
 11  source               95863 non-null  object 
 12  created_at           95863 non-null  object 
 13  classified_location  95863 non-null  object 
 14  classified_industry  95863 non-null  object 
 15  classified_type      95863 non-null 

In [4]:
duplicates = df_seek.duplicated()
print(f"duplicated rows: {duplicates.any()}")
print(f"duplicated row numbers: {duplicates.sum()}")

if duplicates.any():
    print(df_seek[duplicates])

duplicated rows: False
duplicated row numbers: 0


In [5]:
# Drop unnecessary columns
df = df_seek.drop(columns=[
    'short_description', 'source', 'date', 'url', 'classified_type', 'classified_industry', 'classified_location'
    ]
)
print(df.head(10))

      id                                 title                       company    location       type  \
0  21070                 Deafblind Coordinator           Blind Low Vision NZ    Auckland  Full time   
1  21840  Site supervisor for structural steel  Grayson Engineering 2015 Ltd    Auckland  Full time   
2  22213      Project manager structural steel  Grayson Engineering 2015 Ltd    Auckland  Full time   
3  26397          Medical Device Kit Assembler                       Stryker    Auckland  Full time   
4  27671                Holiday Park Assistant       Lakes Edge Holiday Park  Canterbury  Full time   
5  27946             Venue Manager - Bakehouse      Ayrburn Precinct Limited       Otago  Full time   
6  29815                 Senior Analyst Tester     FNZ Services (NZ) Limited    Auckland  Full time   
7  29830                        Analyst Tester     FNZ Services (NZ) Limited  Wellington  Full time   
8  30043                   Construction Lawyer         Forte Recruitment 

In [6]:
print(df['location'].head())

0      Auckland
1      Auckland
2      Auckland
3      Auckland
4    Canterbury
Name: location, dtype: object


In [7]:
print(df['location'].tail())

95858                 Henderson, Auckland
95859        Whangarei Central, Northland
95860           Hamilton Central, Waikato
95861    Christchurch Central, Canterbury
95862                   Hamilton, Waikato
Name: location, dtype: object


In [8]:
def split_location(loc):
    if ',' in loc:
        parts = loc.split(',')
        sub_region = parts[0].strip()
        region = parts[1].strip()
    else:
        sub_region = 'Unknown'
        region = loc.strip()
    return pd.Series([sub_region, region])


df[['sub_region', 'region']] = df['location'].apply(split_location)

# Outliers in 'region' column
country_names = [
    "New Zealand", "Australia", "China", "India", "Philippines", 
    "Singapore", "Fiji", "Malaysia", "South Africa", "United Kingdom"
]

df = df[~df["region"].isin(country_names)]

In [9]:
df = df.drop(columns= 'location')

In [10]:
# Extract skill keywords from the description field
import re
skills = [
    # Analytics tools
    "Excel", "Power BI", "Tableau", "Looker", "Qlik", "SPSS", "Stata",
    # Programming languages
    "Python", "R", "Java", "JavaScript", "TypeScript", "C#", "C++", "C", "Ruby", "PHP", "Scala",
    # Databases
    "SQL", "NoSQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Oracle", "T-SQL", "PL/SQL",
    # Cloud platforms
    "AWS", "Azure", "Google Cloud", "GCP", "Firebase", "Snowflake", "Databricks", "BigQuery",
    # Development tools
    "Git", "GitHub", "Docker", "Kubernetes", "Jenkins", "VS Code", "IntelliJ", "Eclipse", "Confluence", "Jira",
    # Testing
    "Selenium", "Postman", "Cypress", "JUnit", "TestNG", "Pytest", "Mocha", "Chai",
    # Machine learning
    "scikit-learn", "TensorFlow", "Keras", "PyTorch", "XGBoost", "LightGBM", "OpenCV", "Hugging Face",
    # ETL and data engineering
    "Pandas", "NumPy", "BeautifulSoup", "Scrapy", "Airflow", "Apache Spark", "Kafka", "Talend", "Informatica",
    # Web development
    "HTML", "CSS", "React", "Angular", "Vue.js", "Bootstrap", "jQuery", "SASS", "REST API", "GraphQL",
    # Business software
    "Salesforce", "Microsoft Office", "Outlook", "SAP", "Xero", "QuickBooks", "Dynamics 365"
 ]

# Build regex pattern
skills_sorted = sorted(skills, key=lambda x: -len(x))
pattern = r"(?i)\b(" + "|".join(re.escape(skill) for skill in skills_sorted) + r")\b"


def extract_skills(text):
    if not isinstance(text, str):
        return []
    matches = re.findall(pattern, text)
    return list(set(match.strip().title() for match in matches))
# Apply to all descriptions
df["skills_extracted"] = df["description"].apply(extract_skills)


In [11]:
df = df.drop(columns=['description'])
print(df.head(10))

      id                                 title                       company       type                                industry                       salary  hour_rate  weekly_hours           created_at  is_active    job_id  accredited sub_region      region                skills_extracted
0  21070                 Deafblind Coordinator           Blind Low Vision NZ  Full time        Community Services & Development                           面议        NaN           NaN  2024-08-24 20:55:36          0  78022405         0.0    Unknown    Auckland  [Microsoft Office, Salesforce]
1  21840  Site supervisor for structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction                           面议        NaN           NaN  2024-08-24 20:56:50          0  78263986         0.0    Unknown    Auckland                              []
2  22213      Project manager structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction

In [12]:
df.drop(columns = ['job_id'], inplace=True)

In [13]:
# Recalculate missing value ratio for hour rate and weekly hours in a new cell
for col in ['hour_rate', 'weekly_hours']:
    if col in df.columns:
        missing_ratio = df[col].isna().mean()
        print(f"Missing ratio for {col}: {missing_ratio:.2%}")
        if missing_ratio > 0.5:
            print(f"Column '{col}' has more than 50% missing values, dropping it.")
            df.drop(columns=[col], inplace=True)
    else:
        print(f"Column '{col}' not found in DataFrame.")

Missing ratio for hour_rate: 62.87%
Column 'hour_rate' has more than 50% missing values, dropping it.
Missing ratio for weekly_hours: 100.00%
Column 'weekly_hours' has more than 50% missing values, dropping it.


In [14]:
print(df['salary'].head())

0                             面议
1                             面议
2                             面议
3    Base salary + Bonus + Super
4                             面议
Name: salary, dtype: object


In [15]:
df_salary = df.copy()

In [16]:
def preprocess_salary_text(s):
    if isinstance(s, str):
        if s.strip() == "面议":
            return "Negotiable"
        elif s.strip().lower().startswith("competitive"):
            return "Competitive"
        else:
            return s
    return s

def parse_salary(s):
    if not isinstance(s, str):
        return np.nan
    s = s.replace(',', '').replace('$', '')
    match = re.search(r'(\d+)[kK]?[\s–-]+(\d+)[kK]?', s)
    if match:
        low = int(match.group(1))
        high = int(match.group(2))
        if 'hour' in s:
            return (low + high) / 2 * 40 * 52
        else:
            return (low + high) / 2
    match = re.search(r'(\d+)[kK]?', s)
    if match:
        val = int(match.group(1))
        if 'hour' in s:
            return val * 2000 * 40 * 52
        else:
            return val
    return np.nan

# Step 1: Clean salary text
df_salary['salary_clean'] = df_salary['salary'].apply(preprocess_salary_text)

# Step 2: Extract numeric salary (only for those not Negotiable/Competitive)
df_salary['salary_num'] = np.where(df_salary['salary_clean'].isin(['Negotiable', 'Competitive']), np.nan, df_salary['salary_clean'].apply(parse_salary))

# Step 3: Bin salary
bins = [0, 50000, 100000, np.inf]
labels = ['0-50k', '50k-100k', '100k+']
df_salary['salary_bucket'] = pd.cut(df_salary['salary_num'], bins=bins, labels=labels, right=False)

# Convert to object dtype to allow new string categories
df_salary['salary_bucket'] = df_salary['salary_bucket'].astype(object)

# Step 4: Assign Negotiable, Competitive, Other
df_salary.loc[df_salary['salary_clean'] == 'Negotiable', 'salary_bucket'] = 'Negotiable'
df_salary.loc[df_salary['salary_clean'] == 'Competitive', 'salary_bucket'] = 'Competitive'
df_salary.loc[df_salary['salary_bucket'].isna() & (~df_salary['salary_clean'].isin(['Negotiable', 'Competitive'])), 'salary_bucket'] = 'Other'

# Step 5: View result
print(df_salary[['salary', 'salary_clean', 'salary_num', 'salary_bucket']].head(20))

                            salary                    salary_clean  salary_num salary_bucket
0                               面议                      Negotiable         NaN    Negotiable
1                               面议                      Negotiable         NaN    Negotiable
2                               面议                      Negotiable         NaN    Negotiable
3      Base salary + Bonus + Super     Base salary + Bonus + Super         NaN         Other
4                               面议                      Negotiable         NaN    Negotiable
5                               面议                      Negotiable         NaN    Negotiable
6                               面议                      Negotiable         NaN    Negotiable
7                               面议                      Negotiable         NaN    Negotiable
8      $80,000 – $100,000 per year     $80,000 – $100,000 per year     90000.0      50k-100k
9                               面议                      Negotiable    

In [17]:
print(df_salary['salary_bucket'].value_counts(dropna=False).to_string())

salary_bucket
Negotiable     56890
50k-100k       13890
0-50k           7376
Other           7305
100k+           6936
Competitive     3407


In [18]:
df_salary = df_salary.drop(columns=['salary_clean', 'salary_num', 'salary'])
print(df_salary.head(10))

      id                                 title                       company       type                                industry           created_at  is_active  accredited sub_region      region                skills_extracted salary_bucket
0  21070                 Deafblind Coordinator           Blind Low Vision NZ  Full time        Community Services & Development  2024-08-24 20:55:36          0         0.0    Unknown    Auckland  [Microsoft Office, Salesforce]    Negotiable
1  21840  Site supervisor for structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction  2024-08-24 20:56:50          0         0.0    Unknown    Auckland                              []    Negotiable
2  22213      Project manager structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction  2024-08-24 20:57:25          0         0.0    Unknown    Auckland                       [Outlook]    Negotiable
3  26397          Medical Device Kit

In [19]:
# Convert created_at to datetime and extract time components
df_time = df_salary.copy()

In [20]:
print(df_time['type'].value_counts(dropna=False).to_string())
print('-' * 20)
print(df_time['accredited'].value_counts(dropna=False).to_string())

type
Full time          74886
Part time          10298
Contract/Temp       8276
Casual/Vacation     2344
--------------------
accredited
0.0    41342
NaN    35521
1.0    18941


In [21]:
# Map accredited field to three categories
df_acc = df_time.copy()
df_acc['accredited_label'] = df_acc['accredited'].map({1.0: 'Accredited', 0.0: 'Not Accredited'})
df_acc['accredited_label'] = df_acc['accredited_label'].fillna('Unknown')
df_acc = df_acc.drop(columns=['accredited'])
print(df_acc.head(10))

      id                                 title                       company       type                                industry           created_at  is_active sub_region      region                skills_extracted salary_bucket accredited_label
0  21070                 Deafblind Coordinator           Blind Low Vision NZ  Full time        Community Services & Development  2024-08-24 20:55:36          0    Unknown    Auckland  [Microsoft Office, Salesforce]    Negotiable   Not Accredited
1  21840  Site supervisor for structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction  2024-08-24 20:56:50          0    Unknown    Auckland                              []    Negotiable   Not Accredited
2  22213      Project manager structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction  2024-08-24 20:57:25          0    Unknown    Auckland                       [Outlook]    Negotiable   Not Accredited
3  26397        

In [22]:
cleaned_df = df_acc.copy()
print(cleaned_df.head(10))

      id                                 title                       company       type                                industry           created_at  is_active sub_region      region                skills_extracted salary_bucket accredited_label
0  21070                 Deafblind Coordinator           Blind Low Vision NZ  Full time        Community Services & Development  2024-08-24 20:55:36          0    Unknown    Auckland  [Microsoft Office, Salesforce]    Negotiable   Not Accredited
1  21840  Site supervisor for structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction  2024-08-24 20:56:50          0    Unknown    Auckland                              []    Negotiable   Not Accredited
2  22213      Project manager structural steel  Grayson Engineering 2015 Ltd  Full time                            Construction  2024-08-24 20:57:25          0    Unknown    Auckland                       [Outlook]    Negotiable   Not Accredited
3  26397        

In [None]:
# Save the cleaned DataFrame to a new CSV file
cleaned_df.to_csv("cleaned_df.csv", index=False)