In [3]:
###Boosting-Voting Model with SMOTE and Word2Vec
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn import metrics
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

ROOT = r"C:\\Users\\devra\\Downloads\\Codes_PhD"
DATA_DIR = rf"{ROOT}\\dataset\\bvclassifier"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = rf"{ROOT}\\models\\trained_ml_0603\\"
OUTPUT = r"D:\\devra\\Downloads\\voting_results\\"

data = pd.read_csv(MBTI_RAW_CSV_PATH)

training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    for column in columns:
        temp_dummy = pd.get_dummies(data[column], prefix="type")
        data = data.join(temp_dummy)
    return data
training_data = make_dummies(training_data)

X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

smote = SMOTE()

vectorizer = TfidfVectorizer(max_features=10000)
vectorizer.fit(X["cleaned_post"])
X_transformed = vectorizer.transform(X["cleaned_post"])

y_columns = ["E-I", "N-S", "F-T", "J-P"]
evaluation_df = pd.DataFrame(columns=["Target", "Accuracy", "Precision", "Recall", "F1-Score", "Roc-AUC"])

for target_name in y_columns:
    y_target = y[f"type_{target_name[0]}"]
    
    X_over, y_over = smote.fit_resample(X_transformed, y_target)

    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

    base_models = [(name, pickle.load(open(f'{MODEL}{name}_{target_name}.sav', 'rb'))) for name in ["NaiveBayes","DecisionTree",
                                                                                                    "RandomForest", "Xgboost",
                                                                                                    "AdaBoost","LogisticRegression"]]
    voting_clf = VotingClassifier(estimators=base_models, voting='soft')
    
    # Fit the VotingClassifier
    voting_clf.fit(X_train, y_train)

    # Get the predictions for the test data
    voting_pred = voting_clf.predict(X_test)

    # Evaluate the ensemble
    accuracy = metrics.accuracy_score(y_test, voting_pred)
    precision = metrics.precision_score(y_test, voting_pred)
    recall = metrics.recall_score(y_test, voting_pred)
    f1_score = metrics.f1_score(y_test, voting_pred)
    roc_auc_score = metrics.roc_auc_score(y_test, voting_pred)

    # Update the evaluation_df DataFrame
    evaluation_df = pd.concat([evaluation_df, pd.DataFrame({
        "Target": [target_name],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1_score],
        "Roc-AUC": [roc_auc_score]
    })], ignore_index=True)

    # Save the Voting Classifier
    voting_filename = f'{OUTPUT}Voting_SMOTE_{target_name}.sav'
    print(voting_filename)
    pickle.dump(voting_clf, open(voting_filename, 'wb'))

# Save the evaluation_df to a CSV file
evaluation_df.to_csv(os.path.join(OUTPUT, 'evaluation_voting_SMOTE.csv'), index=False)
print(evaluation_df)

  evaluation_df = pd.concat([evaluation_df, pd.DataFrame({


D:\\devra\\Downloads\\voting_results\\Voting_SMOTE_E-I.sav




D:\\devra\\Downloads\\voting_results\\Voting_SMOTE_N-S.sav




D:\\devra\\Downloads\\voting_results\\Voting_SMOTE_F-T.sav




D:\\devra\\Downloads\\voting_results\\Voting_SMOTE_J-P.sav
  Target  Accuracy  Precision    Recall  F1-Score   Roc-AUC
0    E-I  0.878652   0.933219  0.815868  0.870607  0.878699
1    N-S  0.939485   0.911232  0.977965  0.943420  0.938223
2    F-T  0.798722   0.829508  0.773700  0.800633  0.799894
3    J-P  0.785782   0.829670  0.719733  0.770801  0.785846


In [1]:
###Boosting-Voting Model with SMOTE and Word2Vec
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn import metrics
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

ROOT = r"C:\\Users\\devra\\Downloads\\Codes_PhD"
DATA_DIR = rf"{ROOT}\\dataset\\bvclassifier"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = rf"{ROOT}\\models\\trained_ml_0603\\"
OUTPUT = r"D:\\devra\\Downloads\\voting_results\\"

data = pd.read_csv(MBTI_RAW_CSV_PATH)

training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    for column in columns:
        temp_dummy = pd.get_dummies(data[column], prefix="type")
        data = data.join(temp_dummy)
    return data
training_data = make_dummies(training_data)

X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

oversample = RandomOverSampler()

vectorizer = TfidfVectorizer(max_features=10000)
vectorizer.fit(X["cleaned_post"])
X_transformed = vectorizer.transform(X["cleaned_post"])

y_columns = ["E-I", "N-S", "F-T", "J-P"]
evaluation_df = pd.DataFrame(columns=["Target", "Accuracy", "Precision", "Recall", "F1-Score", "Roc-AUC"])

for target_name in y_columns:
    y_target = y[f"type_{target_name[0]}"]
    
    X_over, y_over = oversample.fit_resample(X_transformed, y_target)

    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

    base_models = [(name, pickle.load(open(f'{MODEL}{name}_{target_name}.sav', 'rb'))) for name in ["NaiveBayes","DecisionTree",
                                                                                                    "RandomForest", "Xgboost",
                                                                                                    "AdaBoost","LogisticRegression"]]
    voting_clf = VotingClassifier(estimators=base_models, voting='soft')
    
    # Fit the VotingClassifier
    voting_clf.fit(X_train, y_train)

    # Get the predictions for the test data
    voting_pred = voting_clf.predict(X_test)

    # Evaluate the ensemble
    accuracy = metrics.accuracy_score(y_test, voting_pred)
    precision = metrics.precision_score(y_test, voting_pred)
    recall = metrics.recall_score(y_test, voting_pred)
    f1_score = metrics.f1_score(y_test, voting_pred)
    roc_auc_score = metrics.roc_auc_score(y_test, voting_pred)

    # Update the evaluation_df DataFrame
    evaluation_df = pd.concat([evaluation_df, pd.DataFrame({
        "Target": [target_name],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1_score],
        "Roc-AUC": [roc_auc_score]
    })], ignore_index=True)

    # Save the Voting Classifier
    voting_filename = f'{OUTPUT}Voting_Random_{target_name}.sav'
    print(voting_filename)
    pickle.dump(voting_clf, open(voting_filename, 'wb'))

# Save the evaluation_df to a CSV file
evaluation_df.to_csv(os.path.join(OUTPUT, 'evaluation_voting_Randomoversample.csv'), index=False)
print(evaluation_df)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
