In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("/data/cleaned/processed_raw_CareerMapping_with_scores.csv")

In [3]:
X = df.drop(['Role', 'Role_encoded'], axis=1)

In [4]:
y = df['Role_encoded']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
print(f"Số lượng mẫu ban đầu của tập Train: {Counter(y_train)}")

Số lượng mẫu ban đầu của tập Train: Counter({0: 851, 11: 445, 12: 430, 13: 424, 2: 417, 5: 417, 3: 406, 7: 392, 8: 389, 14: 387, 15: 386, 9: 381, 6: 379, 1: 379, 4: 377, 10: 362})


In [7]:
smote = SMOTE(random_state=42)

In [8]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [9]:
print(f"Số lượng mẫu sau khi SMOTE: {Counter(y_train_resampled)}")

Số lượng mẫu sau khi SMOTE: Counter({3: 851, 8: 851, 13: 851, 7: 851, 15: 851, 14: 851, 4: 851, 2: 851, 6: 851, 11: 851, 5: 851, 1: 851, 9: 851, 12: 851, 0: 851, 10: 851})


In [10]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42))
])

In [11]:
param_grid = {
    'svm__C': [0.1, 1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.01, 0.001, 'scale'],
    'svm__kernel': ['rbf']
}

In [12]:
grid = GridSearchCV(pipeline, param_grid, refit=True, verbose=2, cv=3)

In [13]:
print("Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...")
grid.fit(X_train_resampled, y_train_resampled)

Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   4.7s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   4.7s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   5.4s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   3.5s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   3.9s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   2.8s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   3.6s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   2.9s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   3.1s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=   4.6s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'svm__C': [0.1, 1, ...], 'svm__gamma': [1, 0.1, ...], 'svm__kernel': ['rbf']}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1000
,kernel,'rbf'
,degree,3
,gamma,0.001
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [14]:
print(f"\nTham số tốt nhất tìm được: {grid.best_params_}")
print(f"Accuracy tốt nhất trên tập train (Cross-val): {grid.best_score_:.4f}")


Tham số tốt nhất tìm được: {'svm__C': 1000, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
Accuracy tốt nhất trên tập train (Cross-val): 0.9999


In [15]:
grid_predictions = grid.predict(X_test)
print("\n--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---")
print("Accuracy:", accuracy_score(y_test, grid_predictions))


--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---
Accuracy: 1.0


In [16]:
print("\nClassification Report:\n")
print(classification_report(y_test, grid_predictions))


Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       205
           1       1.00      1.00      1.00        77
           2       1.00      1.00      1.00       123
           3       1.00      1.00      1.00       122
           4       1.00      1.00      1.00        80
           5       1.00      1.00      1.00        99
           6       1.00      1.00      1.00       113
           7       1.00      1.00      1.00       100
           8       1.00      1.00      1.00        91
           9       1.00      1.00      1.00        82
          10       1.00      1.00      1.00        94
          11       1.00      1.00      1.00        95
          12       1.00      1.00      1.00       110
          13       1.00      1.00      1.00       116
          14       1.00      1.00      1.00       105
          15       1.00      1.00      1.00        94

    accuracy                           1.00      1706
 

---------------------------------------------------------

In [17]:
df = pd.read_csv("/data/cleaned/processed_raw_CareerMapping.csv")

In [18]:
X = df.drop(['Role', 'Role_encoded'], axis=1)

In [19]:
y = df['Role_encoded']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
print(f"Số lượng mẫu ban đầu của tập Train: {Counter(y_train)}")

Số lượng mẫu ban đầu của tập Train: Counter({0: 851, 11: 445, 12: 430, 13: 424, 2: 417, 5: 417, 3: 406, 7: 392, 8: 389, 14: 387, 15: 386, 9: 381, 6: 379, 1: 379, 4: 377, 10: 362})


In [22]:
smote = SMOTE(random_state=42)

In [23]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [24]:
print(f"Số lượng mẫu sau khi SMOTE: {Counter(y_train_resampled)}")

Số lượng mẫu sau khi SMOTE: Counter({3: 851, 8: 851, 13: 851, 7: 851, 15: 851, 14: 851, 4: 851, 2: 851, 6: 851, 11: 851, 5: 851, 1: 851, 9: 851, 12: 851, 0: 851, 10: 851})


In [25]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42))
])

In [26]:
param_grid = {
    'svm__C': [0.1, 1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.01, 0.001, 'scale'],
    'svm__kernel': ['rbf']
}

In [27]:
grid = GridSearchCV(pipeline, param_grid, refit=True, verbose=2, cv=3)

In [28]:
print("Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...")
grid.fit(X_train_resampled, y_train_resampled)

Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   4.8s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   4.7s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   4.8s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   2.3s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   2.6s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   2.9s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   3.1s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   2.9s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   3.1s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=   4.6s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'svm__C': [0.1, 1, ...], 'svm__gamma': [1, 0.1, ...], 'svm__kernel': ['rbf']}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1000
,kernel,'rbf'
,degree,3
,gamma,0.001
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [32]:
print(f"\nTham số tốt nhất tìm được: {grid.best_params_}")
print(f"Accuracy tốt nhất trên tập train (Cross-val): {grid.best_score_:.4f}")


Tham số tốt nhất tìm được: {'svm__C': 1000, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
Accuracy tốt nhất trên tập train (Cross-val): 0.9997


In [33]:
grid_predictions = grid.predict(X_test)
print("\n--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---")
print("Accuracy:", accuracy_score(y_test, grid_predictions))


--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---
Accuracy: 1.0


In [34]:
print("\nClassification Report:\n")
print(classification_report(y_test, grid_predictions))


Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       205
           1       1.00      1.00      1.00        77
           2       1.00      1.00      1.00       123
           3       1.00      1.00      1.00       122
           4       1.00      1.00      1.00        80
           5       1.00      1.00      1.00        99
           6       1.00      1.00      1.00       113
           7       1.00      1.00      1.00       100
           8       1.00      1.00      1.00        91
           9       1.00      1.00      1.00        82
          10       1.00      1.00      1.00        94
          11       1.00      1.00      1.00        95
          12       1.00      1.00      1.00       110
          13       1.00      1.00      1.00       116
          14       1.00      1.00      1.00       105
          15       1.00      1.00      1.00        94

    accuracy                           1.00      1706
 