In [3]:
# 1. IMPORTS
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report


In [5]:

# 2. DATA LOADING AND CLEANING

    # Load the dataset
data = pd.read_csv('jobss.csv')

    # --- Data Cleaning ---
data = data.drop(columns=['Unnamed: 1'])
target = 'Role Category'
data = data.dropna(subset=[target])

    # --- Handle Rare Classes ---
class_counts = data[target].value_counts()
rare_classes = class_counts[class_counts < 2].index
print(f"Original shape: {data.shape}")
data = data[~data[target].isin(rare_classes)]
print(f"Shape after removing rare classes: {data.shape}\n")



Original shape: (463, 11)
Shape after removing rare classes: (444, 11)



In [6]:
 # 3. FEATURE ENGINEERING

    # --- Preprocess 'Job Experience Required' column ---
def preprocess_experience(experience_str):
        if isinstance(experience_str, str):
            experience_str = experience_str.lower().replace('yrs', '').replace('yr', '').strip()
            if '-' in experience_str:
                try:
                    low, high = map(int, experience_str.split('-'))
                    return (low + high) / 2
                except:
                    return np.nan
            else:
                try:
                    return float(experience_str)
                except:
                    return np.nan
        return experience_str

data['Job Experience Required'] = data['Job Experience Required'].apply(preprocess_experience)
data['Job Experience Required'] = data['Job Experience Required'].fillna(data['Job Experience Required'].median())


    # --- Define Features and Target ---
X = data.drop(columns=[target])
y = data[target]


    # --- Identify Feature Types for Preprocessing ---
text_feature = 'Key Skills'
categorical_features = X.select_dtypes(include=['object']).columns.drop(text_feature)
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

    # Impute NaNs in categorical columns before splitting
for col in categorical_features:
    X[col] = X[col].fillna('Missing')
X[text_feature] = X[text_feature].fillna('Missing')


In [7]:
   # 4. MODELING PIPELINE

    # --- Create Preprocessing Pipelines for Each Feature Type ---
numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

text_transformer = TfidfVectorizer(max_features=1000, stop_words='english')


    # --- Combine All Preprocessing Steps ---
preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
            ('text', text_transformer, text_feature)
        ],
        remainder='passthrough'
    )


    # --- Create the Full Model Pipeline with RandomForest ---
model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        # Here we use RandomForestClassifier instead of XGBoost
        ('classifier', RandomForestClassifier(
            n_estimators=200, # Number of trees in the forest
            max_depth=20,     # Max depth of the trees
            random_state=42,
            n_jobs=-1         # Use all available CPU cores
        ))
    ])



In [8]:
    # 5. TRAINING AND EVALUATION

    # --- Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # --- Train the Model ---
print("Training the Random Forest model...")
model_pipeline.fit(X_train, y_train)
print("Training complete.\n")

    # --- Make Predictions ---
y_pred = model_pipeline.predict(X_test)

    # --- Evaluate the Model ---
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Random Forest Model Accuracy: {accuracy:.4f}")
print("\nThis model provides strong performance without needing the xgboost library.")

    # Display a detailed classification report
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

Training the Random Forest model...
Training complete.

✅ Random Forest Model Accuracy: 0.5843

This model provides strong performance without needing the xgboost library.

--- Classification Report ---
                                            precision    recall  f1-score   support

                                  Accounts       0.75      0.75      0.75         4
Admin/Maintenance/Security/Datawarehousing       0.00      0.00      0.00         3
                            Analytics & BI       0.00      0.00      0.00         1
                    Architectural Services       1.00      1.00      1.00         1
    Back Office/Web/Transaction Processing       0.00      0.00      0.00         1
                       Content Development       1.00      1.00      1.00         1
                           Corporate Sales       0.00      0.00      0.00         3
                                  Creative       0.00      0.00      0.00         1
     Drug Regulatory Affairs/Documentati

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Define columns
text_feature = 'Key Skills'
categorical_features = X.select_dtypes(include=['object']).columns.drop(text_feature)
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

text_transformer = TfidfVectorizer(max_features=1000, stop_words='english')

# Combine all preprocessing
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
    ('text', text_transformer, text_feature)
])

# Define base pipeline
base_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_jobs=-1))
])


In [13]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_split': [2, 5]
}

# GridSearchCV setup
grid_search = GridSearchCV(
    base_pipeline,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Train model
grid_search.fit(X_train, y_train)

# Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"🔍 Best Parameters: {grid_search.best_params_}")
print(f"✅ Improved Accuracy: {accuracy:.4f}")
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))



Fitting 3 folds for each of 12 candidates, totalling 36 fits




🔍 Best Parameters: {'classifier__max_depth': 30, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
✅ Improved Accuracy: 0.6292

📊 Classification Report:
                                             precision    recall  f1-score   support

                                  Accounts       0.75      0.75      0.75         4
Admin/Maintenance/Security/Datawarehousing       0.00      0.00      0.00         3
                            Analytics & BI       0.00      0.00      0.00         1
                    Architectural Services       1.00      1.00      1.00         1
    Back Office/Web/Transaction Processing       0.00      0.00      0.00         1
                       Content Development       1.00      1.00      1.00         1
                           Corporate Sales       0.50      0.33      0.40         3
                                  Creative       0.00      0.00      0.00         1
     Drug Regulatory Affairs/Documentation       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:


    # 6. SAVING THE MODEL 💾
import joblib
    # --- Define the filename for the saved model ---
model_filename = 'random_forest_model.joblib'
print(f"\nSaving model to '{model_filename}'...")
    
    # --- Use joblib to dump the pipeline to a file ---
joblib.dump(model_pipeline, model_filename)
print("Model saved successfully!")


    # 7. LOADING THE MODEL 📂

    # --- Load the model from the file ---
print(f"\nLoading model from '{model_filename}'...")
loaded_model = joblib.load(model_filename)
print("Model loaded successfully!")

    # --- Use the loaded model to make predictions ---
y_pred_loaded = loaded_model.predict(X_test)
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)

print(f"\n✅ Accuracy of loaded model: {accuracy_loaded:.4f}")
    
    # --- Verify that the loaded model gives the same results ---
if np.array_equal(y_pred, y_pred_loaded):
    print("\nVerification successful: The loaded model's predictions match the original model's.")
else:
    print("\nVerification failed: The predictions do not match.")



Saving model to 'random_forest_model.joblib'...
Model saved successfully!

Loading model from 'random_forest_model.joblib'...
Model loaded successfully!

✅ Accuracy of loaded model: 0.5843

Verification successful: The loaded model's predictions match the original model's.
