# Import libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load cleaned data

In [2]:
# Location of the cleaned dataset, relative to this notebook
clean_csv_path = '../data/ai_job_dataset_clean.csv'
df = pd.read_csv(clean_csv_path)

# Select features for modeling

In [3]:
# Columns that are one-hot encoded further down
categorical_features = [
    'job_title',
    'experience_level',
    'employment_type',
    'company_location',
    'company_size',
    'employee_residence',
    'education_required',
    'industry',
]

# Columns that are standardized further down
numerical_features = [
    'years_experience',
    'remote_ratio',
    'job_description_length',
    'benefits_score',
]

# One-hot encode categorical features


In [5]:
# Dense (non-sparse) one-hot encoding; drop='first' removes the first
# category of every feature from the encoded columns.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoder.fit(df[categorical_features])
encoded_cats = encoder.transform(df[categorical_features])

# Wrap the encoded array in a frame that shares df's index and carries the
# generated column names.
encoded_cat_df = pd.DataFrame(
    data=encoded_cats,
    index=df.index,
    columns=encoder.get_feature_names_out(categorical_features),
)

# Standardize numerical features


In [6]:
# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out rows never influence the standardization (avoids train/test leakage).
# Without stratification, train_test_split chooses rows from the number of
# samples and random_state alone, so reusing the test_size/random_state of the
# split cell further down selects exactly the rows that end up in X_train.
# Keep these two arguments in sync with that cell.
train_nums, _ = train_test_split(
    df[numerical_features], test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(train_nums)

# Apply the training-set statistics to every row; the later split then simply
# partitions the already-transformed frame.
scaled_nums = scaler.transform(df[numerical_features])
scaled_num_df = pd.DataFrame(
    scaled_nums,
    columns=numerical_features,
    index=df.index
)

# Combine all features


In [7]:
# Feature matrix: encoded categorical columns followed by the scaled numeric ones
feature_frames = [encoded_cat_df, scaled_num_df]
X = pd.concat(feature_frames, axis='columns')

# Target column
target_column = 'salary_usd'
y = df[target_column]

# Train-test split

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Report the (rows, columns) of both feature matrices as a sanity check
shape_report = f"Train shape: {X_train.shape}, Test shape: {X_test.shape}"
print(shape_report)

Train shape: (12000, 86), Test shape: (3000, 86)


# Export results

In [10]:
import os

# Make sure the output folder exists before writing into it
os.makedirs('../python_results', exist_ok=True)

# Destination file -> object to write; written in this order, without the index
export_targets = {
    '../python_results/X_train.csv': X_train,
    '../python_results/X_test.csv': X_test,
    '../python_results/y_train.csv': y_train,
    '../python_results/y_test.csv': y_test,
}
for csv_path, split_data in export_targets.items():
    split_data.to_csv(csv_path, index=False)