In [1]:
# =================================================================================================
# # Notebook: Training a High-Accuracy SVM for Job Category Classification (v2)
# =================================================================================================
#
# ## 🎯 Goal:
# To train a Support Vector Machine (SVM) model to predict a job's 'Role Category'
# with an accuracy greater than 85%.
#
# ## ♟️ Strategy:
# 1.  **Load and Prepare Data**: Load the cleaned dataset.
# 2.  **Address Class Imbalance**: Focus on the **top 6 most frequent 'Role Category' classes** to
#     create a more balanced and manageable classification problem.
# 3.  **Feature Combination**: Create a new feature by combining all text-based columns (`Job Title`,
#     `Key Skills`, `Location`, `Industry`) into a single text block. This simplifies the
#     preprocessing pipeline.
# 4.  **Build a Simplified Pipeline**: Use a single `TfidfVectorizer` to process the combined text
#     feature and a `LinearSVC` (SVM) model. This avoids the `ColumnTransformer` versioning issue.
# 5.  **Hyperparameter Tuning**: Use `GridSearchCV` to automatically find the best SVM `C`
#     parameter and TF-IDF `max_features` setting to maximize performance.
# 6.  **Train and Evaluate**: Train the final model and evaluate it on the held-out test set to
#     check whether we have reached our accuracy target.
#
# ---

# ### 1. Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# ### 2. Load and Prepare the Data
try:
    # Read the CSV written by the earlier cleaning/augmentation step.
    df = pd.read_csv('cleaned_augmented_jobs.csv')
except FileNotFoundError:
    # Best-effort fallback: report the problem and continue with an empty frame so the
    # `if not df.empty:` guard below skips training instead of raising here.
    print(
        "Error: 'cleaned_augmented_jobs.csv' not found.",
        "Please ensure the data cleaning and augmentation notebook was run successfully first.",
        sep="\n",
    )
    df = pd.DataFrame()

if not df.empty:
    # Everything below runs only when the CSV was loaded. With an empty frame nothing is
    # trained, so names such as `best_model` are never defined.

    # --- Data Filtering to Address Class Imbalance ---
    print("Original number of unique 'Role Category' classes:", df['Role Category'].nunique())

    # Identify and filter for the most frequent classes. The count is defined once so the
    # filter and the log message below cannot drift apart.
    # `value_counts()` ignores missing labels, so rows without a 'Role Category' are
    # dropped by the `isin` filter as well.
    n_top_classes = 6
    top_6_classes = df['Role Category'].value_counts().nlargest(n_top_classes).index
    df_filtered = df[df['Role Category'].isin(top_6_classes)].copy()

    print(f"Focusing on the top {n_top_classes} most frequent classes: {list(top_6_classes)}")
    print("Number of rows in the filtered dataset:", len(df_filtered))


    # ### 3. Feature Engineering: Combine Text Columns

    # Define the text features to be combined
    text_features = ['Job Title', 'Key Skills', 'Location', 'Industry']

    # Create a new column 'combined_text' by joining the content of the text features.
    # `.fillna('')` turns missing values into empty strings; `.astype(str)` guards the
    # row-wise `' '.join`, which raises TypeError on any non-string cell (e.g. a column
    # that pandas parsed as numeric).
    df_filtered['combined_text'] = (
        df_filtered[text_features]
        .fillna('')
        .astype(str)
        .agg(' '.join, axis=1)
    )


    # ### 4. Define Features (X) and Target (y)

    # The feature 'X' is the combined text column.
    X = df_filtered['combined_text']
    # The target 'y' remains the 'Role Category'.
    y = df_filtered['Role Category']


    # ### 5. Create a Simplified Modeling Pipeline

    # The pipeline has two steps:
    # 1. 'tfidf': Vectorize the combined text data (English stop words removed,
    #    unigrams and bigrams).
    # 2. 'classifier': Train the LinearSVC model; class_weight='balanced' re-weights
    #    classes to compensate for the remaining imbalance between them.
    model_pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
        ('classifier', LinearSVC(random_state=42, dual=False, class_weight='balanced'))
    ])

    # ### 6. Split Data and Perform Hyperparameter Tuning

    # Split data into training (80%) and testing (20%) sets. `stratify=y` keeps the class
    # proportions the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Define the parameter grid to search. 'C' controls the regularization strength.
    # We also tune 'tfidf__max_features' to find the optimal number of features.
    param_grid = {
        'tfidf__max_features': [1000, 2000, 3000],
        'classifier__C': [0.1, 1, 10],
    }

    # Set up GridSearchCV to find the best parameters using 5-fold cross-validation.
    # Only the training split is used, so the test split stays unseen until the end.
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')

    print("\nStarting hyperparameter tuning with GridSearchCV...")
    grid_search.fit(X_train, y_train)
    print("Tuning complete.")
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")

    # ### 7. Evaluate the Best Model on the Test Set

    # The best pipeline found by the grid search (refit on the full training split)
    best_model = grid_search.best_estimator_

    # Make predictions on the unseen test data
    y_pred = best_model.predict(X_test)

    # Calculate the final accuracy
    final_accuracy = accuracy_score(y_test, y_pred)

    print("\n--- Final Model Evaluation ---")
    print(f"Accuracy on the test set: {final_accuracy:.2%}")

    # Display the detailed classification report.
    # zero_division=0 keeps the reported score at 0.0 for a class that receives no
    # predictions, while silencing the UndefinedMetricWarning sklearn otherwise emits.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

Original number of unique 'Role Category' classes: 59
Focusing on the top 6 most frequent classes: ['programming & design', 'voice', 'retail sales', 'senior management', 'accounts', 'admin/maintenance/security/datawarehousing']
Number of rows in the filtered dataset: 300

Starting hyperparameter tuning with GridSearchCV...
Tuning complete.
Best parameters found: {'classifier__C': 0.1, 'tfidf__max_features': 1000}
Best cross-validation accuracy: 0.78

--- Final Model Evaluation ---
Accuracy on the test set: 76.67%

Classification Report:
                                            precision    recall  f1-score   support

                                  accounts       1.00      1.00      1.00         4
admin/maintenance/security/datawarehousing       0.00      0.00      0.00         3
                      programming & design       0.83      0.88      0.85        33
                              retail sales       0.55      0.86      0.67         7
                         senior mana

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [2]:
import joblib

# Persist the tuned pipeline so it can be reloaded later without retraining.
# Requires `best_model` from the training cell; the call returns the list of written files.
joblib.dump(value=best_model, filename='CarrierPredictor.pkl')

['CarrierPredictor.pkl']