In [1]:
import pandas as pd

# Toy records with a gap in every column
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', None],
    Age=[24, 30, None, 22, 35],
    Salary=[48000, None, 57000, None, 60000],
)
df = pd.DataFrame(data)

# Impute the numeric gaps (mean for Age, median for Salary) in one pass,
# then discard any row that still has no Name.
numeric_fill = {
    'Age': df['Age'].mean(),
    'Salary': df['Salary'].median(),
}
df = df.fillna(numeric_fill)
df = df.dropna(subset=['Name'])

print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


In [2]:
# Toy product table whose category labels differ only in letter case
data = dict(
    Product=['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    Category=['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets'],
)
df = pd.DataFrame(data)

# Collapse the case variants: first character upper-cased, the rest lower-cased
tidy_labels = df['Category'].str.capitalize()
df['Category'] = tidy_labels

print('Standardized Data:\n', df)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [5]:
import pandas as pd

# Load the job postings from the CSV file in the working directory
df = pd.read_csv('job_market.csv')

# Report, column by column, how many cells are missing
print("Missing values per kolom:", df.isnull().sum(), sep="\n")

Missing values per kolom:
job_title               0
company                 0
location                0
job_type               29
category               20
salary_min              0
salary_max              0
experience_required    43
publication_date        0
skills                 50
dtype: int64


In [8]:
# experience_required still contains gaps - impute them with the column
# median first so the scaled column has no missing values.
experience_median = df['experience_required'].median()
df['experience_required'] = df['experience_required'].fillna(experience_median)

# Min-Max normalisation: (x - min) / (max - min) for each numeric column.
# The salary columns are scaled first so the new *_norm columns are appended
# in the order salary_min, salary_max, experience_required.
for col in ('salary_min', 'salary_max', 'experience_required'):
    col_min = df[col].min()
    col_max = df[col].max()
    df[f'{col}_norm'] = (df[col] - col_min) / (col_max - col_min)

In [9]:
# Z-score standardisation: subtract the column mean and divide by the
# column standard deviation (as returned by Series.std()).
for col in ('salary_min', 'salary_max', 'experience_required'):
    centered = df[col] - df[col].mean()
    df[f'{col}_z'] = centered / df[col].std()

In [11]:
# Standardise every text column to lowercase
text_columns = ['job_title', 'company', 'location', 'job_type', 'category', 'skills']
for name in text_columns:
    df[name] = df[name].str.lower()

# Fill missing categorical values with the most frequent value (mode)
for name in ('job_type', 'category', 'skills'):
    most_common = df[name].mode()[0]
    df[name] = df[name].fillna(most_common)

# Drop fully duplicated rows, then preview the result
df = df.drop_duplicates()
df.head()

Unnamed: 0,job_title,company,location,job_type,category,salary_min,salary_max,experience_required,publication_date,skills,salary_min_norm,salary_max_norm,experience_required_norm,salary_min_z,salary_max_z,experience_required_z
0,engineering manager,datainc,"san francisco, ca",remote,technology,151082,291345,4.0,2025-11-27,"aws, agile, machine learning, kubernetes, mong...",0.948083,1.0,0.333333,2.674913,3.166496,-0.083902
1,engineering manager,enterprisehub,"new york, ny",remote,technology,156891,280075,3.0,2025-11-27,"java, agile, git, sql, ruby, go",1.0,0.949084,0.25,2.925478,2.903438,-0.530191
2,engineering manager,startupxyz,"seattle, wa",part-time,technology,152134,280310,4.0,2025-11-27,"aws, python, kubernetes, git",0.957485,0.950146,0.333333,2.72029,2.908923,-0.083902
3,lead engineer,webdynamics,"seattle, wa",full-time,technology,151918,253988,7.0,2025-11-27,"agile, git, docker, rest apis, typescript",0.955555,0.831227,0.583333,2.710973,2.294528,1.254965
4,senior software engineer,digitalworks,"san francisco, ca",full-time,technology,148141,252584,9.0,2025-11-27,"agile, ruby, docker, git, javascript",0.921799,0.824884,0.75,2.548056,2.261757,2.147543


In [12]:
# Handle missing values
# Numeric columns -> column median
num_cols = ['experience_required']
numeric_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(numeric_medians)

# Categorical columns -> most frequent value (mode)
cat_cols = ['job_type', 'category', 'skills']
for col in cat_cols:
    most_common = df[col].mode()[0]
    df[col] = df[col].fillna(most_common)

# Remove outliers (IQR method)
def remove_outliers(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, both
    inclusive, where Q1 and Q3 are the 0.25 and 0.75 quantiles of ``df[col]``
    and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column the fences are computed from.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences. The default
        reproduces the previously hard-coded 1.5.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, keeping their original
        index labels. Rows whose ``col`` value is missing are not retained,
        because a missing value is never inside the fences.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    inside_fences = df[col].between(q1 - factor * iqr, q3 + factor * iqr)
    # Return an explicit copy: callers add columns to the result, and
    # assigning into a bare boolean-filtered slice triggers pandas'
    # SettingWithCopyWarning when Copy-on-Write is not enabled.
    return df[inside_fences].copy()

# Drop salary outliers: filter on salary_min first, then on salary_max
# (the second pass computes its fences from the already-filtered rows).
for salary_col in ('salary_min', 'salary_max'):
    df = remove_outliers(df, salary_col)

# Re-apply Min-Max normalisation to the numeric columns of the filtered frame.
# NOTE(review): only the *_norm columns are recomputed here; the *_z columns
# created earlier are left as they were, so they still reflect the rows that
# existed before outlier removal - confirm this is intended.
num_cols = ['salary_min', 'salary_max', 'experience_required']

for col in num_cols:
    lowest = df[col].min()
    highest = df[col].max()
    df[f'{col}_norm'] = (df[col] - lowest) / (highest - lowest)

df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
Index: 247 entries, 3 to 249
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   job_title                 247 non-null    object 
 1   company                   247 non-null    object 
 2   location                  247 non-null    object 
 3   job_type                  247 non-null    object 
 4   category                  247 non-null    object 
 5   salary_min                247 non-null    int64  
 6   salary_max                247 non-null    int64  
 7   experience_required       247 non-null    float64
 8   publication_date          247 non-null    object 
 9   skills                    247 non-null    object 
 10  salary_min_norm           247 non-null    float64
 11  salary_max_norm           247 non-null    float64
 12  experience_required_norm  247 non-null    float64
 13  salary_min_z              247 non-null    float64
 14  salary_max_z   

Unnamed: 0,job_title,company,location,job_type,category,salary_min,salary_max,experience_required,publication_date,skills,salary_min_norm,salary_max_norm,experience_required_norm,salary_min_z,salary_max_z,experience_required_z
3,lead engineer,webdynamics,"seattle, wa",full-time,technology,151918,253988,7.0,2025-11-27,"agile, git, docker, rest apis, typescript",1.0,0.973827,0.583333,2.710973,2.294528,1.254965
4,senior software engineer,digitalworks,"san francisco, ca",full-time,technology,148141,252584,9.0,2025-11-27,"agile, ruby, docker, git, javascript",0.964674,0.966395,0.75,2.548056,2.261757,2.147543
5,senior software engineer,webdynamics,"seattle, wa",part-time,technology,147870,248059,8.0,2025-11-27,"ci/cd, python, kubernetes, javascript, sql, do...",0.962139,0.942445,0.666667,2.536367,2.156137,1.701254
6,senior data scientist,webdynamics,"seattle, wa",part-time,technology,149467,244158,11.0,2025-11-27,"machine learning, typescript, node.js, docker,...",0.977076,0.921798,0.916667,2.605252,2.065082,3.040121
7,lead engineer,ai solutions,"san francisco, ca",contract,technology,133652,258933,11.0,2025-11-27,"java, python, machine learning",0.829159,1.0,0.916667,1.923088,2.409952,3.040121
