In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from imblearn.over_sampling import RandomOverSampler, SMOTEN
def filter_location(location):
    result = re.findall("\,\s[A-Z]{2}$", location)
    if len(result) != 0:
        return result[0][2:]
    else:
        return location


data = pd.read_excel("final_project.ods", engine="odf", dtype=str)
data = data.dropna(axis=0)
data["location"] = data["location"].apply(filter_location)

target = "career_level"
x = data.drop(target, axis=1)
y = data[target]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100, stratify=y)
ros = SMOTEN(k_neighbors=2, random_state=0,sampling_strategy={'director_business_unit_leader': 500, 'specialist': 500, 'managing_director_small_medium_company':1000, 'bereichsleiter': 1000 })
#Nhân bản sao như này và đặt nó trước khi split thì nó sẽ lấy cả test ở bộ train và -> bộ train,test sẽ bị overlap 
#SMOTE: just numerical feature --> use SMOTEN (1 dạng của Oversampling)
print(y_train.value_counts())
x_train,y_train = ros.fit_resample(x_train,y_train)
print("_________MTL___________________________")
print(y_train.value_counts())

career_level
senior_specialist_or_project_manager      3469
manager_team_leader                       2138
bereichsleiter                             768
director_business_unit_leader               56
specialist                                  24
managing_director_small_medium_company       3
Name: count, dtype: int64
_________MTL___________________________
career_level
senior_specialist_or_project_manager      3469
manager_team_leader                       2138
bereichsleiter                            1000
managing_director_small_medium_company    1000
specialist                                 500
director_business_unit_leader              500
Name: count, dtype: int64


In [7]:
# from sklearn.impute import SimpleImputer
# tf_transform = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('TfidfVectorizer', TfidfVectorizer(stop_words= 'english', ngram_range = (1,1), encoding= 'utf-8'))
# ])
# tf_transform2 = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('TfidfVectorizer', TfidfVectorizer(stop_words= 'english', ngram_range = (1,2), encoding= 'utf-8'))
# ])
# nom_transform = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('nom_transform', OneHotEncoder())
# ])

In [8]:
preprocessor = ColumnTransformer(transformers=[
    ("title_ft", TfidfVectorizer(stop_words="english", ngram_range=(1, 1)), "title"),
    ("location_ft", OneHotEncoder(handle_unknown='ignore'), ["location"]),
    ("des_ft", TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.01, max_df= 0.95), "description"),
    ("function_ft", OneHotEncoder(handle_unknown='ignore'), ["function"]),
    ("industry_ft", TfidfVectorizer(stop_words="english", ngram_range=(1, 1)), "industry")
])
#min_df = cận dưới, max_df = cận trên
# The default list of English stop words includes: 
# a, an, and, are, as, at, be, but, by, 
# for, if, in, into, is, it, no, not, of, on, 
# or, such, that, the, their, then, there, t
# hese, they, this, to, was, will and with.



In [9]:
cls = Pipeline(steps=[
    ("preprocessor", preprocessor), #(6458, 850370)
    ("feature_selector", SelectPercentile(chi2, percentile=5)),
    ("model", RandomForestClassifier())
])
# result = cls.fit_transform(x_train)
cls.fit(x_train, y_train)
# print(result.shape)


In [10]:
y_predict = cls.predict(x_test)
print(classification_report(y_test, y_predict))

#Decision tree
                        #       accuracy                           0.64      1615
                        #      macro avg       0.38      0.39      0.38      1615
                        #   weighted avg       0.65      0.64      0.65      1615

                        #       accuracy                           0.64      1615
                        #      macro avg       0.32      0.30      0.31      1615
                        #   weighted avg       0.64      0.64      0.64      1615
                    #--> Performance giam 1 chut nhung model chay nhanh hon rat nhieu

#random forest, with select k best = 800

                    # #     accuracy                           0.75      1615
                    #          macro avg       0.50      0.31      0.33      1615
                    #       weighted avg       0.73      0.75      0.72      1615

                    #    accuracy                           0.77      1615
                    #          macro avg       0.53      0.32      0.32      1615
                    #       weighted avg       0.76      0.77      0.73      1615

                                        precision    recall  f1-score   support

                        bereichsleiter       0.60      0.28      0.38       192
         director_business_unit_leader       0.62      0.36      0.45        14
                   manager_team_leader       0.61      0.66      0.64       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.80      0.87      0.84       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.72      1615
                             macro avg       0.44      0.36      0.38      1615
                          weighted avg       0.71      0.72      0.71      1615



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
params = {
    # "model__n_estimator ": [100,200,300],
    "model__criterion": ["gini", "entropy", "log_loss"],
    "feature_selector__percentile": [1,5,10]
    
}
from sklearn.model_selection import GridSearchCV

In [13]:
grid_search = GridSearchCV(estimator=cls, param_grid= params, cv = 4 , scoring= "recall_weighted", verbose = 2)
grid_search.fit(x_train, y_train)
y_predicted = grid_search.predict(x_test)
print(classification_report(y_test, y_predicted))

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] END feature_selector__percentile=1, model__criterion=gini; total time=   4.2s
[CV] END feature_selector__percentile=1, model__criterion=gini; total time=   4.0s
[CV] END feature_selector__percentile=1, model__criterion=gini; total time=   4.0s
[CV] END feature_selector__percentile=1, model__criterion=gini; total time=   4.2s
[CV] END feature_selector__percentile=1, model__criterion=entropy; total time=   4.1s
[CV] END feature_selector__percentile=1, model__criterion=entropy; total time=   4.1s
[CV] END feature_selector__percentile=1, model__criterion=entropy; total time=   4.2s
[CV] END feature_selector__percentile=1, model__criterion=entropy; total time=   4.3s
[CV] END feature_selector__percentile=1, model__criterion=log_loss; total time=   4.0s
[CV] END feature_selector__percentile=1, model__criterion=log_loss; total time=   4.1s
[CV] END feature_selector__percentile=1, model__criterion=log_loss; total time=   4.0s
[CV

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
