In [1]:
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
import pickle
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
import os

ROOT = r"C:\\Users\\USER\\Downloads\\Devraj"
DATA_DIR = rf"{ROOT}\dataset_rnn"  
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_augmented_dataset.csv")
MODEL = rf"{ROOT}\\models\\trained_ml2\\"

# Load data
data = pd.read_csv(MBTI_RAW_CSV_PATH)
training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()

# Function to create dummy variables
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    return pd.get_dummies(data, columns=columns, prefix="type")

training_data = make_dummies(training_data)
X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

vectorsize = 300
# Word2Vec model
word2vec_model = Word2Vec(sentences=X["cleaned_post"], vector_size=vectorsize, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

# Function to get sentence vector
def get_sentence_vector(sentence, word_vectors, vector_size):
    words = sentence.split()
    vector = np.zeros(vector_size)
    for word in words:
        if word in word_vectors:
            vector += word_vectors[word]
        else:
            # If the word is not in the vocabulary, generate a random vector
            vector += np.random.uniform(low=-0.25, high=0.25, size=vector_size)
    vector /= len(words)

    # Ensure non-negative values
    vector = np.maximum(vector, 0)

    return vector

# Vectorize sentences using Word2Vec
X["vectorized"] = X["cleaned_post"].apply(lambda x: get_sentence_vector(x, word_vectors,vector_size=vectorsize))

# Apply SMOTE
oversample = SMOTE(random_state=42)
datasets = {}

for trait in ["E-I", "N-S", "F-T", "J-P"]:
    X_over, y_over = oversample.fit_resample(X["vectorized"].tolist(), y[f"type_{trait[0]}"])
    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)
    datasets[trait] = (X_train, X_test, y_train, y_test)

# Model creation
def create_models():
    nb_clf  = MultinomialNB(alpha=0.01)
    dt_clf  = DecisionTreeClassifier(max_depth=7)
    rf_clf  = RandomForestClassifier(n_estimators=750)
    xgb_clf = xgboost.XGBClassifier(eval_metric=None)
    lr_clf  = LogisticRegression(max_iter=1000, random_state=42)
    return {"NaiveBayes": nb_clf, "DecisionTree": dt_clf, "RandomForest": rf_clf, "Xgboost": xgb_clf, "LogisticRegression": lr_clf}

_metrics = ["Accuracy", "Accuracy", "Accuracy", "Accuracy", "Precision", "Precision", "Precision", "Precision", 
            "Recall", "Recall", "Recall", "Recall", "F1-Score", "F1-Score", "F1-Score", "F1-Score", 
            "Roc-Auc Score", "Roc-Auc Score", "Roc-Auc Score", "Roc-Auc Score"]
_types   = ["E-I", "N-S", "F-T", "J-P", "E-I", "N-S", "F-T", "J-P", "E-I", "N-S", "F-T", "J-P", 
            "E-I", "N-S", "F-T", "J-P", "E-I", "N-S", "F-T", "J-P"]
_columns = ["NaiveBayes", "DecisionTree", "RandomForest", "Xgboost", "LogisticRegression"]
evaluation_df = pd.DataFrame(columns=_columns, index=[_metrics, _types])
models = create_models()

# Model training and evaluation
for model_name, model in models.items():
    for trait in ["E-I", "N-S", "F-T", "J-P"]:
        X_train, X_test, y_train, y_test = datasets[trait]
        print(f"{model_name} is training for {trait}...")
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
        evaluation_df.loc["Precision", trait][model_name] = round(metrics.precision_score(y_test, pred), 3)
        evaluation_df.loc["Recall", trait][model_name] = round(metrics.recall_score(y_test, pred), 3)
        evaluation_df.loc["F1-Score", trait][model_name] = round(metrics.f1_score(y_test, pred), 3)
        evaluation_df.loc["Roc-Auc Score",trait][model_name] = round(metrics.roc_auc_score(y_test, pred), 3)

        filename = f'{MODEL}\{model_name}_{trait}.sav'
        print(filename)
        pickle.dump(model, open(filename, 'wb'))

# Save evaluation results to a CSV file
evaluation_df.to_csv(os.path.join(DATA_DIR, 'evaluation_word2vec_SMOTE_augmented.csv'), index=False)
print(evaluation_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["vectorized"] = X["cleaned_post"].apply(lambda x: get_sentence_vector(x, word_vectors,vector_size=vectorsize))


NaiveBayes is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\NaiveBayes_E-I.sav
NaiveBayes is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\NaiveBayes_N-S.sav
NaiveBayes is training for F-T...
C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\NaiveBayes_F-T.sav
NaiveBayes is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\NaiveBayes_J-P.sav
DecisionTree is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\DecisionTree_E-I.sav
DecisionTree is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\DecisionTree_N-S.sav
DecisionTree is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\DecisionTree_F-T.sav
DecisionTree is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\DecisionTree_J-P.sav
RandomForest is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\RandomForest_E-I.sav
RandomForest is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\RandomForest_N-S.sav
RandomForest is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\RandomForest_F-T.sav
RandomForest is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\RandomForest_J-P.sav
Xgboost is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\Xgboost_E-I.sav
Xgboost is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\Xgboost_N-S.sav
Xgboost is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\Xgboost_F-T.sav
Xgboost is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\Xgboost_J-P.sav
LogisticRegression is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\LogisticRegression_E-I.sav
LogisticRegression is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\LogisticRegression_N-S.sav
LogisticRegression is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\LogisticRegression_F-T.sav
LogisticRegression is training for J-P...
C:\\Users\\USER\\Downloads\\Devraj\\models\\trained_ml2\\\LogisticRegression_J-P.sav
                  NaiveBayes DecisionTree RandomForest Xgboost  \
Accuracy      E-I      0.553        0.645        0.778   0.744   
              N-S      0.619        0.787        0.919   0.917   
              F-T       0.49        0.573        0.621   0.594   
              J-P      0.483        0.571        0.645   0.588   
Precision     E-I      0.614        0.684        0.797   0.776   
              N-S      0.616         0.79        0.862   0.866   
              F-T      0.554        0.558        0.614   0.606   
              J-P      0.482        0.573        0.653   0.572   
Recall        E-I      0.296        0.543        0.749    0.69   
              N-S      0.633        0.783          1.0   0.986   
              F-T       0.03        0.808        0.702   0.596

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

In [13]:
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost
import pickle
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
import os

ROOT = r"C:\\Users\\USER\\Downloads\\Devraj"
DATA_DIR = rf"{ROOT}\dataset_rnn"  
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri_para0403.csv")
MODEL = rf"{ROOT}\\models\\trained_ml2\\"

# Load data
data = pd.read_csv(MBTI_RAW_CSV_PATH)
training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()

# Function to create dummy variables
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    return pd.get_dummies(data, columns=columns, prefix="type")

training_data = make_dummies(training_data)
X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

vectorsize = 300
# Word2Vec model
word2vec_model = Word2Vec(sentences=X["cleaned_post"], vector_size=vectorsize, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

# Function to get sentence vector
def get_sentence_vector(sentence, word_vectors, vector_size):
    words = sentence.split()
    vector = np.zeros(vector_size)
    for word in words:
        if word in word_vectors:
            vector += word_vectors[word]
        else:
            # If the word is not in the vocabulary, generate a random vector
            vector += np.random.uniform(low=-0.25, high=0.25, size=vector_size)
    vector /= len(words)

    # Ensure non-negative values
    vector = np.maximum(vector, 0)

    return vector

# Vectorize sentences using Word2Vec
X["vectorized"] = X["cleaned_post"].apply(lambda x: get_sentence_vector(x, word_vectors,vector_size=vectorsize))

# Apply SMOTE
oversample = SMOTE(random_state=42)
datasets = {}

for trait in ["E-I", "N-S", "F-T", "J-P"]:
    X_over, y_over = oversample.fit_resample(X["vectorized"].tolist(), y[f"type_{trait[0]}"])
    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)
    datasets[trait] = (X_train, X_test, y_train, y_test)

# Model creation
def create_models():
    nb_clf  = MultinomialNB(alpha=0.01)
    dt_clf  = DecisionTreeClassifier(max_depth=7)
    rf_clf  = RandomForestClassifier(n_estimators=750)
    xgb_clf = xgboost.XGBClassifier(eval_metric=None)
    lr_clf  = LogisticRegression(max_iter=1000, random_state=42)
    return {"NaiveBayes": nb_clf, "DecisionTree": dt_clf, "RandomForest": rf_clf, "Xgboost": xgb_clf, "LogisticRegression": lr_clf}

_metrics = ["Accuracy", "Accuracy", "Accuracy", "Accuracy", "Precision", "Precision", "Precision", "Precision", 
            "Recall", "Recall", "Recall", "Recall", "F1-Score", "F1-Score", "F1-Score", "F1-Score", 
            "Roc-Auc Score", "Roc-Auc Score", "Roc-Auc Score", "Roc-Auc Score"]
_types   = ["E-I", "N-S", "F-T", "J-P", "E-I", "N-S", "F-T", "J-P", "E-I", "N-S", "F-T", "J-P", 
            "E-I", "N-S", "F-T", "J-P", "E-I", "N-S", "F-T", "J-P"]
_columns = ["NaiveBayes", "DecisionTree", "RandomForest", "Xgboost", "LogisticRegression"]
evaluation_df = pd.DataFrame(columns=_columns, index=[_metrics, _types])
models = create_models()

# Model training and evaluation
for model_name, model in models.items():
    for trait in ["E-I", "N-S", "F-T", "J-P"]:
        X_train, X_test, y_train, y_test = datasets[trait]
        print(f"{model_name} is training for {trait}...")
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
        evaluation_df.loc["Precision", trait][model_name] = round(metrics.precision_score(y_test, pred), 3)
        evaluation_df.loc["Recall", trait][model_name] = round(metrics.recall_score(y_test, pred), 3)
        evaluation_df.loc["F1-Score", trait][model_name] = round(metrics.f1_score(y_test, pred), 3)
        evaluation_df.loc["Roc-Auc Score",trait][model_name] = round(metrics.roc_auc_score(y_test, pred), 3)

        filename = f'{MODEL}\{model_name}_{trait}.sav'
        print(filename)
        pickle.dump(model, open(filename, 'wb'))

# Save evaluation results to a CSV file
evaluation_df.to_csv(os.path.join(DATA_DIR, 'evaluation_word2vec_SMOTE_augmented.csv'), index=False)
print(evaluation_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["vectorized"] = X["cleaned_post"].apply(lambda x: get_sentence_vector(x, word_vectors,vector_size=vectorsize))


NaiveBayes is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteNaiveBayes_E-I.sav
NaiveBayes is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteNaiveBayes_N-S.sav
NaiveBayes is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteNaiveBayes_F-T.sav
NaiveBayes is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteNaiveBayes_J-P.sav
DecisionTree is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteDecisionTree_E-I.sav
DecisionTree is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteDecisionTree_N-S.sav
DecisionTree is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteDecisionTree_F-T.sav
DecisionTree is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteDecisionTree_J-P.sav
RandomForest is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteRandomForest_E-I.sav
RandomForest is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteRandomForest_N-S.sav
RandomForest is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteRandomForest_F-T.sav
RandomForest is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteRandomForest_J-P.sav
Xgboost is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteXgboost_E-I.sav
Xgboost is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteXgboost_N-S.sav
Xgboost is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteXgboost_F-T.sav
Xgboost is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteXgboost_J-P.sav
LogisticRegression is training for E-I...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteLogisticRegression_E-I.sav
LogisticRegression is training for N-S...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteLogisticRegression_N-S.sav
LogisticRegression is training for F-T...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
You are setting values through chained assignment. Currently this works in certain cases, but when u

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteLogisticRegression_F-T.sav
LogisticRegression is training for J-P...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  evaluation_df.loc["Accuracy", trait][model_name] = round(metrics.accuracy_score(y_test, pred), 3)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0

C:\\Users\\USER\Downloads\Devraj\models\word2vec_smoteLogisticRegression_J-P.sav
                  NaiveBayes DecisionTree RandomForest Xgboost  \
Accuracy      E-I      0.593        0.702        0.859    0.84   
              N-S      0.492        0.772        0.926   0.921   
              F-T      0.477        0.547        0.601   0.544   
              J-P      0.541        0.576        0.707   0.638   
Precision     E-I      0.629        0.697        0.988   0.924   
              N-S      0.833        0.782        0.875   0.873   
              F-T        0.0        0.581        0.605   0.564   
              J-P      0.546        0.584          0.9   0.665   
Recall        E-I      0.455        0.717        0.727   0.743   
              N-S      0.019        0.775          1.0    0.99   
              F-T        0.0        0.481        0.676   0.554   
              J-P      0.482        0.531        0.465   0.558   
F1-Score      E-I      0.528        0.707        0.837   0.82

In [21]:
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
import os

ROOT = r"C:\\Users\\USER\Downloads\Devraj"
DATA_DIR = os.path.join(ROOT, "dataset_rnn")
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = os.path.join(ROOT, "models", "word2vec_smote")

# Load data
data = pd.read_csv(MBTI_RAW_CSV_PATH)
training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()

# Function to create dummy variables
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    return pd.get_dummies(data, columns=columns, prefix="type")

training_data = make_dummies(training_data)
X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

vectorsize = 300
# Word2Vec model
word2vec_model = Word2Vec(sentences=X["cleaned_post"].apply(lambda x: x.split()), vector_size=vectorsize, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

# Function to get word vector
def get_word_vector(word, word_vectors, vector_size):
    if word in word_vectors:
        return word_vectors[word]
    else:
        # If the word is not in the vocabulary, generate a random vector
        return np.random.uniform(low=-0.25, high=0.25, size=vector_size)

# Function to get sentence vector
def get_sentence_vector(sentence, word_vectors, vector_size):
    words = sentence.split()
    vectors = [get_word_vector(word, word_vectors, vector_size) for word in words]
    if not vectors:
        # If no vectors are found, return a vector of zeros
        return np.zeros(vector_size)
    else:
        # Aggregate word vectors
        return np.mean(vectors, axis=0)

# Vectorize sentences using Word2Vec
X["vectorized"] = X["cleaned_post"].apply(lambda x: get_sentence_vector(x, word_vectors, vector_size=vectorsize))

# Apply SMOTE
oversample = SMOTE(random_state=42)
datasets = {}

for trait in ["E-I", "N-S", "F-T", "J-P"]:
    X_over, y_over = oversample.fit_resample(np.vstack(X["vectorized"].to_numpy()), y[f"type_{trait[0]}"])
    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)
    datasets[trait] = (X_train, X_test, y_train, y_test)

# Logistic Regression model
lr_clf = LogisticRegression(max_iter=1000, random_state=42)

# Model training and evaluation for Logistic Regression
for trait in ["E-I", "N-S", "F-T", "J-P"]:
    X_train, X_test, y_train, y_test = datasets[trait]
    print(f"Logistic Regression is training for {trait}...")
    lr_clf.fit(X_train, y_train)
    pred = lr_clf.predict(X_test)

    # Evaluate and store results
    accuracy = metrics.accuracy_score(y_test, pred)
    precision = metrics.precision_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    f1_score = metrics.f1_score(y_test, pred)
    roc_auc = metrics.roc_auc_score(y_test, pred)

    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1_score}, ROC-AUC: {roc_auc}")

    filename = f'{MODEL}\LogisticRegression_{trait}.sav'
    print(filename)
    pickle.dump(lr_clf, open(filename, 'wb'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["vectorized"] = X["cleaned_post"].apply(lambda x: get_sentence_vector(x, word_vectors, vector_size=vectorsize))


Logistic Regression is training for E-I...
Accuracy: 0.7108614232209738, Precision: 0.7110778443113772, Recall: 0.7110778443113772, F1-Score: 0.7110778443113772, ROC-AUC: 0.7108612609862733
C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\LogisticRegression_E-I.sav
Logistic Regression is training for N-S...
Accuracy: 0.7271815446339017, Precision: 0.7494852436513384, Recall: 0.7077122488658457, F1-Score: 0.728, ROC-AUC: 0.7278202128307129
C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\LogisticRegression_N-S.sav
Logistic Regression is training for F-T...
Accuracy: 0.7784877529286475, Precision: 0.8040904198062433, Recall: 0.7614678899082569, F1-Score: 0.7821989528795812, ROC-AUC: 0.7792846695918096
C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\LogisticRegression_F-T.sav
Logistic Regression is training for J-P...
Accuracy: 0.6383587786259542, Precision: 0.6392344497607656, Recall: 0.6367969494756911, F1-Score: 0.6380133715377269, ROC-AUC: 0.6383602703443403
C:\\Use

In [26]:
from gensim.models import Word2Vec
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import pickle
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
import os

ROOT = r"C:\\Users\\USER\Downloads\Devraj"
DATA_DIR = os.path.join(ROOT, "dataset_rnn")
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = os.path.join(ROOT, "models", "word2vec_smote")

# Load data
data = pd.read_csv(MBTI_RAW_CSV_PATH)
training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()

# Function to create dummy variables
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    return pd.get_dummies(data, columns=columns, prefix="type")

training_data = make_dummies(training_data)
X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

vectorsize = 300
# Word2Vec model
word2vec_model = Word2Vec(sentences=X["cleaned_post"].apply(lambda x: x.split()), vector_size=vectorsize, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

# Function to get word vector
def get_word_vector(word, word_vectors, vector_size):
    if word in word_vectors:
        return word_vectors[word]
    else:
        # If the word is not in the vocabulary, generate a random vector
        return np.random.uniform(low=-0.25, high=0.75, size=vector_size)

# Function to convert text to bag-of-words representation
def text_to_bow(text, word_vectors, vector_size):
    words = text.split()
    bow = np.zeros(vector_size)
    for word in words:
        word_vector = get_word_vector(word, word_vectors, vector_size)
        bow += word_vector
    return bow

# Vectorize sentences using bag-of-words representation
X["bow"] = X["cleaned_post"].apply(lambda x: text_to_bow(x, word_vectors, vector_size=vectorsize))

# Apply SMOTE
oversample = SMOTE(random_state=42)
datasets = {}

for trait in ["E-I", "N-S", "F-T", "J-P"]:
    X_over, y_over = oversample.fit_resample(np.vstack(X["bow"].to_numpy()), y[f"type_{trait[0]}"])
    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)
    datasets[trait] = (X_train, X_test, y_train, y_test)

# Classifiers
classifiers = {
    "Naive Bayes": MultinomialNB(alpha=0.01),
    "Decision Tree": DecisionTreeClassifier(max_depth=7, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=750, random_state=42),
    "GradB": GradientBoostingClassifier(random_state=42),
    "XGBoost": xgboost.XGBClassifier(eval_metric=None),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
}

# Dataframe to store evaluation metrics
evaluation_results_df = pd.DataFrame(columns=["Classifier", "Trait", "Accuracy", "Precision", "Recall", "F1-Score", "ROC-AUC"])

# Model training and evaluation for each classifier
for classifier_name, classifier in classifiers.items():
    for trait in ["E-I", "N-S", "F-T", "J-P"]:
        X_train, X_test, y_train, y_test = datasets[trait]
        if classifier_name == "Naive Bayes":
            # MultinomialNB expects non-negative input, so we use word counts (bag-of-words) representation
            X_train = np.where(X_train < 0, 0, X_train)
            X_test = np.where(X_test < 0, 0, X_test)
            classifier.fit(X_train, y_train)
        else:
            classifier.fit(X_train, y_train)
        pred = classifier.predict(X_test)

        # Evaluate and store results
        accuracy = metrics.accuracy_score(y_test, pred)
        precision = metrics.precision_score(y_test, pred)
        recall = metrics.recall_score(y_test, pred)
        f1_score = metrics.f1_score(y_test, pred)
        roc_auc = metrics.roc_auc_score(y_test, pred)

        # Create a new DataFrame with the results
        new_results_df = pd.DataFrame({
            "Classifier": [classifier_name],
            "Trait": [trait],
            "Accuracy": [accuracy],
            "Precision": [precision],
            "Recall": [recall],
            "F1-Score": [f1_score],
            "ROC-AUC": [roc_auc]
        })

        # Concatenate the new results to the main DataFrame
        evaluation_results_df = pd.concat([evaluation_results_df, new_results_df], ignore_index=True)

        # Save the model
        filename = f'{MODEL}\\{classifier_name.replace(" ", "")}_{trait}.sav'
        print(f"Saved model at: {filename}")
        pickle.dump(classifier, open(filename, 'wb'))

# Save evaluation dataframe as CSV
evaluation_results_df.to_csv(os.path.join(MODEL, "evaluation_metrics.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bow"] = X["cleaned_post"].apply(lambda x: text_to_bow(x, word_vectors, vector_size=vectorsize))
  evaluation_results_df = pd.concat([evaluation_results_df, new_results_df], ignore_index=True)


Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\NaiveBayes_E-I.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\NaiveBayes_N-S.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\NaiveBayes_F-T.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\NaiveBayes_J-P.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\DecisionTree_E-I.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\DecisionTree_N-S.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\DecisionTree_F-T.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\DecisionTree_J-P.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\RandomForest_E-I.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\RandomForest_N-S.sav
Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\RandomForest_F-T.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\LogisticRegression_E-I.sav


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\LogisticRegression_N-S.sav


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\LogisticRegression_F-T.sav


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Saved model at: C:\\Users\\USER\Downloads\Devraj\models\word2vec_smote\LogisticRegression_J-P.sav


In [20]:
from gensim.models import KeyedVectors
from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn import metrics
import pandas as pd
from sklearn.model_selection import train_test_split
import os

ROOT = r"C:\\Users\\USER\Downloads\Devraj"
DATA_DIR = os.path.join(ROOT, "dataset_rnn")
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = os.path.join(ROOT, "models", "glove_smote")

# Load data
data = pd.read_csv(MBTI_RAW_CSV_PATH)
training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()

# Function to create dummy variables
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    return pd.get_dummies(data, columns=columns, prefix="type")

training_data = make_dummies(training_data)
X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

# Load GloVe word vectors
glove_path = os.path.join(DATA_DIR, "glove.6B.100d.txt")
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False)

# Function to get sentence vector using GloVe
def get_sentence_vector(sentence, word_vectors, vector_size):
    words = sentence.split()
    vector = np.zeros(vector_size)
    count = 0
    for word in words:
        if word in word_vectors:
            vector += word_vectors[word]
            count += 1

    # Avoid division by zero
    if count != 0:
        vector /= count

    # Ensure non-negative values
    vector = np.maximum(vector, 0)

    return vector

# Vectorize sentences using GloVe
X["vectorized"] = X["cleaned_post"].apply(lambda x: get_sentence_vector(x, glove_model, vector_size=100))

# Apply SMOTE
oversample = SMOTE(random_state=42)
datasets = {}

for trait in ["E-I", "N-S", "F-T", "J-P"]:
    # Convert "vectorized" column to 2D array
    X_over, y_over = oversample.fit_resample(np.vstack(X["vectorized"].to_numpy()), y[f"type_{trait[0]}"])
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)
    
    # Ensure that X_train and X_test are 2D arrays
    X_train = np.vstack(X_train)
    X_test = np.vstack(X_test)
    
    datasets[trait] = (X_train, X_test, y_train, y_test)

# Logistic Regression model
lr_clf = LogisticRegression(max_iter=1000, random_state=42)

# Model training and evaluation for Logistic Regression
for trait in ["E-I", "N-S", "F-T", "J-P"]:
    X_train, X_test, y_train, y_test = datasets[trait]
    print(f"Logistic Regression is training for {trait}...")
    lr_clf.fit(X_train, y_train)
    pred = lr_clf.predict(X_test)

    # Evaluate and store results
    accuracy = metrics.accuracy_score(y_test, pred)
    precision = metrics.precision_score(y_test, pred)
    recall = metrics.recall_score(y_test, pred)
    f1_score = metrics.f1_score(y_test, pred)
    roc_auc = metrics.roc_auc_score(y_test, pred)

    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1_score}, ROC-AUC: {roc_auc}")

    filename = f'{MODEL}\\LogisticRegression_{trait}.sav'  # or use: filename = os.path.join(MODEL, f'LogisticRegression_{trait}.sav')
    print(filename)
    pickle.dump(lr_clf, open(filename, 'wb'))

ValueError: invalid literal for int() with base 10: 'the'

In [15]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

ROOT = r"C:\\Users\\devra\\Downloads\\Codes_PhD"
DATA_DIR = rf"{ROOT}\\dataset"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = rf"{ROOT}\\models\\trained_ensemble\\"

data = pd.read_csv(MBTI_RAW_CSV_PATH)

training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()
def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    for column in columns:
        temp_dummy = pd.get_dummies(data[column], prefix="type")
        data = data.join(temp_dummy)
    return data
training_data = make_dummies(training_data)

X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

oversample = RandomOverSampler()

vectorizer = TfidfVectorizer(max_features=10000)
vectorizer.fit(X["cleaned_post"])
X_transformed = vectorizer.transform(X["cleaned_post"])

y_columns = ["E-I", "N-S", "F-T", "J-P"]
evaluation_df = pd.DataFrame(columns=["Target", "Accuracy", "Precision", "Recall", "F1-Score", "Roc-AUC"])

for target_name in y_columns:
    y_target = y[f"type_{target_name[0]}"]
    X_over, y_over = oversample.fit_resample(X_transformed, y_target)

    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

    base_models = [(name, pickle.load(open(f'{MODEL}{name}_{target_name}.sav', 'rb'))) for name in [ "RandomForest", "Xgboost"]]
    voting_clf = VotingClassifier(estimators=base_models, voting='soft')
    
    # Fit the VotingClassifier
    voting_clf.fit(X_train, y_train)

    # Get the predictions for the test data
    voting_pred_proba_test = voting_clf.predict_proba(X_test)

    # Use the Voting predictions as features for AdaBoost
    ada_boost_clf = AdaBoostClassifier(n_estimators=50, random_state=42)
    ada_boost_clf.fit(voting_pred_proba_test, y_test)

    # Get feature importance from the base estimators in the ensemble
    base_estimator_feature_importance = np.mean([estimator.feature_importances_ for estimator in ada_boost_clf.estimators_], axis=0)
    print(f"Feature Importance for {target_name}:", base_estimator_feature_importance)

    # Evaluate the ensemble
    boosting_pred = ada_boost_clf.predict(voting_pred_proba_test)
    accuracy = metrics.accuracy_score(y_test, boosting_pred)
    precision = metrics.precision_score(y_test, boosting_pred)
    recall = metrics.recall_score(y_test, boosting_pred)
    f1_score = metrics.f1_score(y_test, boosting_pred)
    roc_auc_score = metrics.roc_auc_score(y_test, boosting_pred)

    # Update the evaluation_df DataFrame
    evaluation_df = pd.concat([evaluation_df, pd.DataFrame({
        "Target": [target_name],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1_score],
        "Roc-AUC": [roc_auc_score]
    })], ignore_index=True)

    # Save the Boosting Classifier
    boosting_filename = f'{MODEL}BoostingClassifier_RX_{target_name}.sav'
    print(boosting_filename)
    pickle.dump(ada_boost_clf, open(boosting_filename, 'wb'))

# Save the evaluation_df to a CSV file
evaluation_df.to_csv(os.path.join(DATA_DIR, 'evaluation_boostingClassifier_RX.csv'), index=False)
print(evaluation_df)

  evaluation_df = pd.concat([evaluation_df, pd.DataFrame({


Feature Importance for E-I: [0.5 0.5]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_RX_E-I.sav




Feature Importance for N-S: [0.46 0.54]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_RX_N-S.sav




Feature Importance for F-T: [0.4 0.6]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_RX_F-T.sav
Feature Importance for J-P: [0.44 0.56]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_RX_J-P.sav
  Target  Accuracy  Precision    Recall  F1-Score   Roc-AUC
0    E-I  0.968914   0.996828  0.940868  0.968040  0.968935
1    N-S  0.998328   0.997413  0.999352  0.998381  0.998295
2    F-T  0.802449   0.772321  0.881753  0.823417  0.798736
3    J-P  0.824905   0.909856  0.721640  0.804891  0.825003




In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

ROOT = r"C:\\Users\\devra\\Downloads\\Codes_PhD"
DATA_DIR = rf"{ROOT}\\dataset"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean_biTri.csv")
MODEL = rf"{ROOT}\\models\\trained_ensemble\\"

data = pd.read_csv(MBTI_RAW_CSV_PATH)

label_encoder = LabelEncoder()

for target_name in ["E-I", "N-S", "F-T", "J-P"]:
    encoded_target = label_encoder.fit_transform(data[target_name])
    data[f"type_{target_name[0]}"] = encoded_target

training_data = data[["cleaned_post", "E-I", "N-S", "F-T", "J-P"]].copy()

def make_dummies(data, columns=["E-I", "N-S", "F-T", "J-P"]):
    for column in columns:
        temp_dummy = pd.get_dummies(data[column], prefix="type")
        data = data.join(temp_dummy)
    return data

training_data = make_dummies(training_data)

X = training_data[["cleaned_post"]]
y = training_data.drop(columns=["cleaned_post"])

tokenized_posts = X["cleaned_post"].apply(lambda x: x.split())
word2vec_model = Word2Vec(sentences=tokenized_posts, vector_size=100, window=5, min_count=1, workers=4)

y_columns = ["E-I", "N-S", "F-T", "J-P"]
evaluation_df = pd.DataFrame(columns=["Target", "Accuracy", "Precision", "Recall", "F1-Score", "Roc-AUC"])

for target_name in y_columns:
    y_target = y[f"type_{target_name[0]}"]
    X_transformed = word2vec_model.wv[X["cleaned_post"].apply(lambda x: x.split()).sum()]
    X_target = X_transformed[y_target.index]
    # Using Word2Vec for vectorization
    X_df = pd.DataFrame(X_target, index=X.index)

    # Using SMOTE for oversampling
    smote = SMOTE(random_state=42)
    X_over, y_over = smote.fit_resample(X_df.loc[y_target.index], y_target)

    X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

    print('shape of X_train:',X_train.shape)
    print('shape of y_train:',y_train.shape)

    base_models = [(name, pickle.load(open(f'{MODEL}{name}_{target_name}.sav', 'rb'))) for name in ["RandomForest"]]
    voting_clf = VotingClassifier(estimators=base_models, voting='soft')

    # Fit the VotingClassifier
    voting_clf.fit(X_train, y_train)

    # Get the predictions for the test data
    voting_pred_proba_test = voting_clf.predict_proba(X_test)

    # Use the Voting predictions as features for AdaBoost
    ada_boost_clf = AdaBoostClassifier(n_estimators=50, random_state=42)
    ada_boost_clf.fit(voting_pred_proba_test, y_test)

    # Get feature importance from the base estimators in the ensemble
    base_estimator_feature_importance = np.mean([estimator.feature_importances_ for estimator in ada_boost_clf.estimators_], axis=0)
    print(f"Feature Importance for {target_name}:", base_estimator_feature_importance)

    # Evaluate the ensemble
    boosting_pred = ada_boost_clf.predict(voting_pred_proba_test)
    accuracy = metrics.accuracy_score(y_test, boosting_pred)
    precision = metrics.precision_score(y_test, boosting_pred)
    recall = metrics.recall_score(y_test, boosting_pred)
    f1_score = metrics.f1_score(y_test, boosting_pred)
    roc_auc_score = metrics.roc_auc_score(y_test, boosting_pred)

    # Update the evaluation_df DataFrame
    evaluation_df = pd.concat([evaluation_df, pd.DataFrame({
        "Target": [target_name],
        "Accuracy": [accuracy],
        "Precision": [precision],
        "Recall": [recall],
        "F1-Score": [f1_score],
        "Roc-AUC": [roc_auc_score]
    })], ignore_index=True)

    # Save the Boosting Classifier
    boosting_filename = f'{MODEL}BoostingClassifier_SMOTE_{target_name}.sav'
    print(boosting_filename)
    pickle.dump(ada_boost_clf, open(boosting_filename, 'wb'))

# Save the evaluation_df to a CSV file
evaluation_df.to_csv(os.path.join(DATA_DIR, 'evaluation_boostingClassifier_SMOTE.csv'), index=False)
print(evaluation_df)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


shape of X_train: (10680, 100)
shape of y_train: (10680,)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  evaluation_df = pd.concat([evaluation_df, pd.DataFrame({


Feature Importance for E-I: [0.48 0.52]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_SMOTE_E-I.sav
shape of X_train: (11963, 100)
shape of y_train: (11963,)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Feature Importance for N-S: [0.42 0.58]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_SMOTE_N-S.sav
shape of X_train: (7508, 100)
shape of y_train: (7508,)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Feature Importance for F-T: [0.54 0.46]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_SMOTE_F-T.sav
shape of X_train: (8384, 100)
shape of y_train: (8384,)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Feature Importance for J-P: [0.48 0.52]
C:\\Users\\devra\\Downloads\\Codes_PhD\\models\\trained_ensemble\\BoostingClassifier_SMOTE_J-P.sav
  Target  Accuracy  Precision    Recall  F1-Score   Roc-AUC
0    E-I  0.698876   0.748598  0.599551  0.665835  0.698951
1    N-S  0.782347   0.732050  0.911860  0.812121  0.778099
2    F-T  0.569755   0.592712  0.563710  0.577847  0.570038
3    J-P  0.598760   0.631980  0.474738  0.542188  0.598878


