In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

ROOT = r"C:\\Users\\devra\\Downloads\\Codes_PhD"
DATA_DIR = rf"{ROOT}\\dataset"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = os.path.join(ROOT, "models", "word2vec_smote")

data = pd.read_csv(MBTI_RAW_CSV_PATH)

label_encoder = LabelEncoder()

for target_name in ["E-I", "N-S", "F-T", "J-P"]:
    encoded_target = label_encoder.fit_transform(data[target_name])
    data[f"type_{target_name[0]}"] = encoded_target

training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()

def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    for column in columns:
        temp_dummy = pd.get_dummies(data[column], prefix="type")
        data = data.join(temp_dummy)
    return data

training_data = make_dummies(training_data)

X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

tokenized_posts = X["cleaned_post"].apply(lambda x: x.split())
word2vec_model = Word2Vec(sentences=tokenized_posts, vector_size=100, window=5, min_count=1, workers=4)

y_columns = ["E-I", "N-S", "F-T", "J-P"]
evaluation_df = pd.DataFrame(columns=["Target", "Accuracy", "Precision", "Recall", "F1-Score", "Roc-AUC"])

for target_name in y_columns:
    y_target = y[f"type_{target_name[0]}"]
    X_transformed = word2vec_model.wv[X["cleaned_post"].apply(lambda x: x.split()).sum()]
    X_target = X_transformed[y_target.index]
    # Using Word2Vec for vectorization
    X_df = pd.DataFrame(X_target, index=X.index)

    # Using SMOTE for oversampling
    smote = SMOTE(random_state=42)
    X_over, y_over = smote.fit_resample(X_df.loc[y_target.index], y_target)

    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

    print('shape of X_train:',X_train.shape)
    print('shape of y_train:',y_train.shape)

    base_models = [(name, pickle.load(open(f'{MODEL}\{name}_{target_name}.sav', 'rb'))) for name in ["RandomForest", "Xgboost"]]
    voting_clf = VotingClassifier(estimators=base_models, voting='soft')

    # Fit the VotingClassifier
    voting_clf.fit(X_train, y_train)

    # Get the predictions for the test data
    voting_pred_proba_test = voting_clf.predict_proba(X_test)

    # Use the Voting predictions as features for AdaBoost
    ada_boost_clf = AdaBoostClassifier(n_estimators=50, random_state=42)
    ada_boost_clf.fit(voting_pred_proba_test, y_test)

    # Get feature importance from the base estimators in the ensemble
    base_estimator_feature_importance = np.mean([estimator.feature_importances_ for estimator in ada_boost_clf.estimators_], axis=0)
    print(f"Feature Importance for {target_name}:", base_estimator_feature_importance)

    # Evaluate the ensemble
    boosting_pred = ada_boost_clf.predict(voting_pred_proba_test)
    accuracy = metrics.accuracy_score(y_test, boosting_pred)
    precision = metrics.precision_score(y_test, boosting_pred)
    recall = metrics.recall_score(y_test, boosting_pred)
    f1_score = metrics.f1_score(y_test, boosting_pred)
    roc_auc_score = metrics.roc_auc_score(y_test, boosting_pred)

    # Update the evaluation_df DataFrame
    evaluation_df = pd.concat([evaluation_df, pd.DataFrame({
        "Target": [target_name],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1_score],
        "Roc-AUC": [roc_auc_score]
    })], ignore_index=True)

    # Save the Boosting Classifier
    boosting_filename = f'{MODEL}\BoostingPara_SMOTE_{target_name}.sav'
    print(boosting_filename)
    pickle.dump(ada_boost_clf, open(boosting_filename, 'wb'))

# Save the evaluation_df to a CSV file
evaluation_df.to_csv(os.path.join(MODEL, 'evaluation_boostingPara_SMOTE1.csv'), index=False)
print(evaluation_df)

shape of X_train: (10680, 100)
shape of y_train: (10680,)


  evaluation_df = pd.concat([evaluation_df, pd.DataFrame({


Feature Importance for E-I: [0.54 0.46]
C:\\Users\\devra\\Downloads\\Codes_PhD\models\word2vec_smote\BoostingPara_SMOTE_E-I.sav
shape of X_train: (11963, 100)
shape of y_train: (11963,)




Feature Importance for N-S: [0.78 0.22]
C:\\Users\\devra\\Downloads\\Codes_PhD\models\word2vec_smote\BoostingPara_SMOTE_N-S.sav
shape of X_train: (7508, 100)
shape of y_train: (7508,)




Feature Importance for F-T: [0.56 0.44]
C:\\Users\\devra\\Downloads\\Codes_PhD\models\word2vec_smote\BoostingPara_SMOTE_F-T.sav
shape of X_train: (8384, 100)
shape of y_train: (8384,)
Feature Importance for J-P: [0.56 0.44]
C:\\Users\\devra\\Downloads\\Codes_PhD\models\word2vec_smote\BoostingPara_SMOTE_J-P.sav
  Target  Accuracy  Precision    Recall  F1-Score   Roc-AUC
0    E-I  0.696255   0.716406  0.650449  0.681836  0.696289
1    N-S  0.773320   0.722137  0.911212  0.805731  0.768797
2    F-T  0.570820   0.560806  0.822630  0.666942  0.559030
3    J-P  0.583015   0.633588  0.395615  0.487089  0.583194


