In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("/Users/rolandtran/Documents/Repos/Student-Career-Prediction-System/data/cleaned/processed_raw_CareerMapping1_with_scores.csv")

In [3]:
X = df.drop(['Role', 'Role_encoded'], axis=1)

In [4]:
y = df['Role_encoded']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
print(f"Số lượng mẫu ban đầu của tập Train: {Counter(y_train)}")

Số lượng mẫu ban đầu của tập Train: Counter({13: 328, 0: 312, 2: 272, 11: 249, 7: 219, 12: 210, 5: 199, 3: 198, 10: 187, 9: 182, 6: 181, 15: 170, 14: 169, 4: 142, 1: 141, 8: 133})


In [9]:
smote = SMOTE(random_state=42)

In [10]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [11]:
print(f"Số lượng mẫu sau khi SMOTE: {Counter(y_train_resampled)}")

Số lượng mẫu sau khi SMOTE: Counter({2: 328, 15: 328, 3: 328, 11: 328, 9: 328, 13: 328, 0: 328, 6: 328, 10: 328, 8: 328, 7: 328, 14: 328, 12: 328, 5: 328, 1: 328, 4: 328})


In [12]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42))
])

In [13]:
param_grid = {
    'svm__C': [0.1, 1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.01, 0.001, 'scale'],
    'svm__kernel': ['rbf']
}

In [14]:
grid = GridSearchCV(pipeline, param_grid, refit=True, verbose=2, cv=3)

In [15]:
print("Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...")
grid.fit(X_train_resampled, y_train_resampled)

Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.6s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.6s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.6s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   0.5s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   0.5s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   0.5s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   0.6s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   0.6s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   0.6s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=   0.6s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'svm__C': [0.1, 1, ...], 'svm__gamma': [1, 0.1, ...], 'svm__kernel': ['rbf']}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1000
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [16]:
print(f"\nTham số tốt nhất tìm được: {grid.best_params_}")
print(f"Accuracy tốt nhất trên tập train (Cross-val): {grid.best_score_:.4f}")


Tham số tốt nhất tìm được: {'svm__C': 1000, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Accuracy tốt nhất trên tập train (Cross-val): 0.8544


In [18]:
grid_predictions = grid.predict(X_test)
print("\n--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---")
print("Accuracy:", accuracy_score(y_test, grid_predictions))


--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---
Accuracy: 0.7621359223300971


In [19]:
print("\nClassification Report:\n")
print(classification_report(y_test, grid_predictions))


Classification Report:

              precision    recall  f1-score   support

           0       0.76      0.85      0.81        68
           1       0.47      0.59      0.52        39
           2       0.79      0.73      0.76        81
           3       0.79      0.77      0.78        65
           4       0.62      0.64      0.63        25
           5       0.97      0.75      0.85        40
           6       0.73      0.75      0.74        57
           7       0.76      0.76      0.76        45
           8       0.82      0.66      0.73        47
           9       0.73      0.78      0.75        41
          10       0.68      0.76      0.72        34
          11       0.79      0.83      0.81        58
          12       0.70      0.86      0.77        50
          13       0.92      0.78      0.84        87
          14       0.80      0.75      0.77        48
          15       0.79      0.79      0.79        39

    accuracy                           0.76       824
 

---------------------------------------------------------

In [20]:
df = pd.read_csv("/Users/rolandtran/Documents/Repos/Student-Career-Prediction-System/data/cleaned/processed_raw_CareerMapping1.csv")

In [21]:
X = df.drop(['Role', 'Role_encoded'], axis=1)

In [22]:
y = df['Role_encoded']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
print(f"Số lượng mẫu ban đầu của tập Train: {Counter(y_train)}")

Số lượng mẫu ban đầu của tập Train: Counter({13: 328, 0: 312, 2: 272, 11: 249, 7: 219, 12: 210, 5: 199, 3: 198, 10: 187, 9: 182, 6: 181, 15: 170, 14: 169, 4: 142, 1: 141, 8: 133})


In [25]:
smote = SMOTE(random_state=42)

In [26]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [27]:
print(f"Số lượng mẫu sau khi SMOTE: {Counter(y_train_resampled)}")

Số lượng mẫu sau khi SMOTE: Counter({2: 328, 15: 328, 3: 328, 11: 328, 9: 328, 13: 328, 0: 328, 6: 328, 10: 328, 8: 328, 7: 328, 14: 328, 12: 328, 5: 328, 1: 328, 4: 328})


In [28]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42))
])

In [29]:
param_grid = {
    'svm__C': [0.1, 1, 10, 100, 1000],
    'svm__gamma': [1, 0.1, 0.01, 0.001, 'scale'],
    'svm__kernel': ['rbf']
}

In [30]:
grid = GridSearchCV(pipeline, param_grid, refit=True, verbose=2, cv=3)

In [31]:
print("Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...")
grid.fit(X_train_resampled, y_train_resampled)

Đang chuẩn hóa dữ liệu và tìm tham số tối ưu cho SVM...
Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.6s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.6s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.6s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   0.5s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   0.5s
[CV] END ........svm__C=0.1, svm__gamma=0.1, svm__kernel=rbf; total time=   0.5s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   0.6s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   0.6s
[CV] END .......svm__C=0.1, svm__gamma=0.01, svm__kernel=rbf; total time=   0.6s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=   0.6s
[CV] END ......svm__C=0.1, svm__gamma=0.001, svm__kernel=rbf; total time=

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'svm__C': [0.1, 1, ...], 'svm__gamma': [1, 0.1, ...], 'svm__kernel': ['rbf']}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1000
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [32]:
print(f"\nTham số tốt nhất tìm được: {grid.best_params_}")
print(f"Accuracy tốt nhất trên tập train (Cross-val): {grid.best_score_:.4f}")


Tham số tốt nhất tìm được: {'svm__C': 1000, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Accuracy tốt nhất trên tập train (Cross-val): 0.8559


In [33]:
grid_predictions = grid.predict(X_test)
print("\n--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---")
print("Accuracy:", accuracy_score(y_test, grid_predictions))


--- Kết quả đánh giá trên tập Test sau khi Tuning (có Scaler) ---
Accuracy: 0.7657766990291263


In [34]:
print("\nClassification Report:\n")
print(classification_report(y_test, grid_predictions))


Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.88      0.85        68
           1       0.44      0.62      0.52        39
           2       0.85      0.75      0.80        81
           3       0.81      0.77      0.79        65
           4       0.55      0.64      0.59        25
           5       0.91      0.75      0.82        40
           6       0.75      0.74      0.74        57
           7       0.76      0.76      0.76        45
           8       0.78      0.66      0.71        47
           9       0.67      0.76      0.71        41
          10       0.65      0.76      0.70        34
          11       0.77      0.79      0.78        58
          12       0.75      0.86      0.80        50
          13       0.95      0.79      0.86        87
          14       0.80      0.75      0.77        48
          15       0.84      0.82      0.83        39

    accuracy                           0.77       824
 