# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 
    
## IMPLEMENTING DIFFERENT MODELS 

In [None]:
# importing dependencies here
import numpy as np
import pandas as pd

# visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# data stratifying and splitting
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.compose import ColumnTransformer

# class imbalance
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.under_sampling import RandomUnderSampler

# algorithms/models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import SMOTE
# from imblearn.over_sampling import ADASYN
# from sklearn.preprocessing import StandardScaler

# model evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (
    classification_report,
    f1_score,
    accuracy_score,
    roc_auc_score,
)

# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.preprocessing import Normalizer
# from sklearn.compose import ColumnTransformer
# from sklearn.feature_extraction.text import TfidfVectorizer

# from sklearn.pipeline import make_pipeline
# from sklearn.feature_selection import f_classif
# from sklearn.feature_selection import SelectKBest

# model performance evaluation and selection


# performance check
import time
import warnings

warnings.filterwarnings("ignore")

# sparse to dense
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()


# saving the model
from joblib import dump

# code formatter
%load_ext nb_black

In [2]:
# reading the final datasets
personality_data = pd.read_csv("data_ekta/clean_data_3.csv")

<IPython.core.display.Javascript object>

In [3]:
# checking counts dataset
personality_data.head(2)

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,colons,emojis,word_count,unique_words,avg_word_ct,post_length_var,upper,link_count,ellipses,img_count
0,INFJ,0,0,0,1,'When asked of the things you wish you did ear...,asked thing wish earlier find answering...,0.9998,0.404284,0.13302,...,16,4,1549,746,30.98,78.414931,73,2,31,0
1,INFJ,0,0,0,1,'I love both and they are equally important to...,love equally important music window soul in...,0.99995,0.601071,0.131455,...,7,0,1429,636,28.58,160.7444,81,1,2,0


<IPython.core.display.Javascript object>

### Modelling

#### Setting predictors and target variable


In [7]:
X = personality_data[
    [
        "clean_posts",
        "compound_sentiment",
        "S_ADJ_med",
        "S_ADJ_std",
        "S_ADP_med",
        "S_ADP_std",
        "S_ADV_med",
        "S_ADV_std",
        "S_CONJ_med",
        "S_CONJ_std",
        "S_DET_med",
        "S_DET_std",
        "S_NOUN_med",
        "S_NOUN_std",
        "S_NUM_med",
        "S_NUM_std",
        "S_PRT_med",
        "S_PRT_std",
        "S_PRON_med",
        "S_PRON_std",
        "S_VERB_med",
        "S_VERB_std",
        "qm",
        "em",
        "colons",
        "emojis",
        "word_count",
        "unique_words",
        "upper",
        "link_count",
        "ellipses",
        "img_count",
    ]
]

y = personality_data.iloc[:, 1:5]


print(X.shape)
print(y.shape)

(8588, 32)
(8588, 4)


<IPython.core.display.Javascript object>

### Setting up preprocessor for vectorization and selecting best counts and scores

In [9]:
# preprocessing steps for selecting best k columns/features from counts & scores and for vectorizing words

counts_n_scores = [
    "compound_sentiment",
    "S_ADJ_med",
    "S_ADJ_std",
    "S_ADP_med",
    "S_ADP_std",
    "S_ADV_med",
    "S_ADV_std",
    "S_CONJ_med",
    "S_CONJ_std",
    "S_DET_med",
    "S_DET_std",
    "S_NOUN_med",
    "S_NOUN_std",
    "S_NUM_med",
    "S_NUM_std",
    "S_PRT_med",
    "S_PRT_std",
    "S_PRON_med",
    "S_PRON_std",
    "S_VERB_med",
    "S_VERB_std",
    "qm",
    "em",
    "colons",
    "emojis",
    "word_count",
    "unique_words",
    "upper",
    "link_count",
    "ellipses",
    "img_count",
]

best_k_features = make_pipeline(MinMaxScaler(), SelectKBest(f_classif, k=5))

# best_k_words = make_pipeline(
#     TfidfVectorizer(min_df=25, max_df=0.8,), SelectKBest(f_classif, k=1000)
# )


# setting up preprocessing for tf-idf vectorizer
preprocesser_tf = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(min_df=25, max_df=0.8,), "clean_posts",),
        #         ("bestwords", best_k_words, "clean_posts"),
        ("selectbest", best_k_features, counts_n_scores),
    ],
    remainder="passthrough",
)

# setting up preprocessing for count vectorizer
preprocesser_ct = ColumnTransformer(
    transformers=[
        ("ct_vect", CountVectorizer(min_df=25, max_df=0.8,), "clean_posts",),
        ("selectbest", best_k_features, counts_n_scores),
    ],
    remainder="passthrough",
)

<IPython.core.display.Javascript object>

In [10]:
def create_model(model, X, target, nsplits):

    mbti_type = {
        "is_Extrovert": "Extrovert vs Introvert",
        "is_Sensing": "Sensing vs Intuition",
        "is_Thinking": "Thinking vs Feeling",
        "is_Judging": "Judging vs Perceiving",
    }

    kfold = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=100)

    # to time the individual model run time
    t = time.time()

    predictions = []

    for col in target.columns:

        print(f"\n{mbti_type[col]}")
        y = target[col]
        auc_list, acc_list, f1_list = [], [], []

        for train_index, test_index in kfold.split(X, y):

            X_train, X_test, y_train, y_test = (
                X.iloc[train_index],
                X.iloc[test_index],
                y[train_index],
                y[test_index],
            )

            model.fit(X_train, y_train)
      
            y_pred = model.predict(X_test)

            preds = model.predict_proba(X_test)[
                :, 1
            ]  # returns the probablity of class 1
            auc = roc_auc_score(y_test, preds)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average="weighted")

            auc_list.append(auc)
            acc_list.append(acc)
            f1_list.append(f1)
            
            dump(model, f"clf_{col}.joblib")

        print(
            f"Avg AUC: {np.mean(auc_list):.2f}, Avg Accuracy: {np.mean(acc_list):.2f}, Avg F1: {np.mean(f1_list):.2f}"
        )
        print(classification_report(y_test, y_pred))

    print(f"\nTime Taken: {time.time()-t:.2f} seconds")   
 


<IPython.core.display.Javascript object>

### Logistic Regression

In [11]:
# using TF-IDF vectorized data
clf_tf = imb_make_pipeline(
    preprocesser_tf,
    #     DenseTransformer(),
    RandomUnderSampler(),
    LogisticRegression(),
)
create_model(clf_tf, X, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.76, Avg Accuracy: 0.70, Avg F1: 0.72
              precision    recall  f1-score   support

           0       0.88      0.72      0.79      1321
           1       0.41      0.66      0.51       396

    accuracy                           0.70      1717
   macro avg       0.64      0.69      0.65      1717
weighted avg       0.77      0.70      0.72      1717


Sensing vs Intuition
Avg AUC: 0.76, Avg Accuracy: 0.69, Avg F1: 0.73
              precision    recall  f1-score   support

           0       0.93      0.68      0.79      1480
           1       0.25      0.68      0.37       237

    accuracy                           0.68      1717
   macro avg       0.59      0.68      0.58      1717
weighted avg       0.84      0.68      0.73      1717


Thinking vs Feeling
Avg AUC: 0.88, Avg Accuracy: 0.79, Avg F1: 0.79
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       929
           1       0.74   

<IPython.core.display.Javascript object>

In [12]:
# lr = clf_tf[-1]
# importance = lr.coef_[0]
# importance = f_classif(X_train, y_train)
# summarize feature importance
# for i, v in enumerate(importance):
#     print("Feature: %0d, Score: %.5f" % (i, v))
# # plot feature importance
# plt.bar([x for x in range(len(importance))], importance)
# plt.show()

<IPython.core.display.Javascript object>

In [14]:
# using Count vectorized data
clf_ct = imb_make_pipeline(
    preprocesser_ct, DenseTransformer(), RandomUnderSampler(), LogisticRegression(),
)
create_model(clf_ct, X, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.70, Avg Accuracy: 0.65, Avg F1: 0.67
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      1321
           1       0.34      0.61      0.44       396

    accuracy                           0.63      1717
   macro avg       0.59      0.63      0.58      1717
weighted avg       0.73      0.63      0.66      1717


Sensing vs Intuition
Avg AUC: 0.71, Avg Accuracy: 0.65, Avg F1: 0.70
              precision    recall  f1-score   support

           0       0.92      0.64      0.75      1480
           1       0.22      0.64      0.33       237

    accuracy                           0.64      1717
   macro avg       0.57      0.64      0.54      1717
weighted avg       0.82      0.64      0.69      1717


Thinking vs Feeling
Avg AUC: 0.83, Avg Accuracy: 0.75, Avg F1: 0.75
              precision    recall  f1-score   support

           0       0.79      0.73      0.76       929
           1       0.71   

<IPython.core.display.Javascript object>

### Naive Bayes

In [16]:
# using TF-IDF vectorized data
nb_tf = imb_make_pipeline(
    preprocesser_tf,
    DenseTransformer(),
    #     MinMaxScaler(),
    RandomUnderSampler(),
    MultinomialNB(),
)
create_model(nb_tf, X, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.74, Avg Accuracy: 0.64, Avg F1: 0.67
              precision    recall  f1-score   support

           0       0.88      0.65      0.75      1321
           1       0.38      0.71      0.50       396

    accuracy                           0.66      1717
   macro avg       0.63      0.68      0.62      1717
weighted avg       0.77      0.66      0.69      1717


Sensing vs Intuition
Avg AUC: 0.77, Avg Accuracy: 0.68, Avg F1: 0.73
              precision    recall  f1-score   support

           0       0.93      0.69      0.80      1480
           1       0.27      0.69      0.38       237

    accuracy                           0.69      1717
   macro avg       0.60      0.69      0.59      1717
weighted avg       0.84      0.69      0.74      1717


Thinking vs Feeling
Avg AUC: 0.85, Avg Accuracy: 0.77, Avg F1: 0.77
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       929
           1       0.77   

<IPython.core.display.Javascript object>

In [17]:
# using Count vectorized data
nb_ct = imb_make_pipeline(
    preprocesser_ct, DenseTransformer(), RandomUnderSampler(), MultinomialNB(),
)
create_model(nb_ct, X, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.74, Avg Accuracy: 0.68, Avg F1: 0.70
              precision    recall  f1-score   support

           0       0.88      0.69      0.77      1321
           1       0.40      0.69      0.50       396

    accuracy                           0.69      1717
   macro avg       0.64      0.69      0.64      1717
weighted avg       0.77      0.69      0.71      1717


Sensing vs Intuition
Avg AUC: 0.76, Avg Accuracy: 0.69, Avg F1: 0.74
              precision    recall  f1-score   support

           0       0.93      0.69      0.79      1480
           1       0.25      0.66      0.37       237

    accuracy                           0.69      1717
   macro avg       0.59      0.67      0.58      1717
weighted avg       0.83      0.69      0.73      1717


Thinking vs Feeling
Avg AUC: 0.85, Avg Accuracy: 0.77, Avg F1: 0.77
              precision    recall  f1-score   support

           0       0.79      0.78      0.79       929
           1       0.75   

<IPython.core.display.Javascript object>

### Random Forest

In [18]:
# using TF-IDF vectorized data
rf_tf = imb_make_pipeline(
    preprocesser_tf,
    DenseTransformer(),
    RandomUnderSampler(),
    RandomForestClassifier(n_estimators=100, max_depth=10),
)
create_model(rf_tf, X, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.69, Avg Accuracy: 0.64, Avg F1: 0.66
              precision    recall  f1-score   support

           0       0.85      0.65      0.73      1321
           1       0.34      0.61      0.44       396

    accuracy                           0.64      1717
   macro avg       0.59      0.63      0.59      1717
weighted avg       0.73      0.64      0.67      1717


Sensing vs Intuition
Avg AUC: 0.67, Avg Accuracy: 0.63, Avg F1: 0.68
              precision    recall  f1-score   support

           0       0.91      0.62      0.74      1480
           1       0.20      0.59      0.30       237

    accuracy                           0.62      1717
   macro avg       0.55      0.61      0.52      1717
weighted avg       0.81      0.62      0.68      1717


Thinking vs Feeling
Avg AUC: 0.81, Avg Accuracy: 0.74, Avg F1: 0.74
              precision    recall  f1-score   support

           0       0.77      0.72      0.74       929
           1       0.69   

<IPython.core.display.Javascript object>

In [19]:
# using TF-IDF vectorized data
rf_ct = imb_make_pipeline(
    preprocesser_ct,
    DenseTransformer(),
    RandomUnderSampler(),
    RandomForestClassifier(n_estimators=100, max_depth=10),
)
create_model(rf_ct, X, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.70, Avg Accuracy: 0.65, Avg F1: 0.68
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      1321
           1       0.35      0.64      0.45       396

    accuracy                           0.64      1717
   macro avg       0.60      0.64      0.59      1717
weighted avg       0.74      0.64      0.67      1717


Sensing vs Intuition
Avg AUC: 0.68, Avg Accuracy: 0.64, Avg F1: 0.70
              precision    recall  f1-score   support

           0       0.91      0.64      0.75      1480
           1       0.22      0.62      0.32       237

    accuracy                           0.64      1717
   macro avg       0.56      0.63      0.54      1717
weighted avg       0.82      0.64      0.69      1717


Thinking vs Feeling
Avg AUC: 0.82, Avg Accuracy: 0.74, Avg F1: 0.74
              precision    recall  f1-score   support

           0       0.77      0.72      0.75       929
           1       0.70   

<IPython.core.display.Javascript object>

### Final Model

In [None]:
# Logistics Regression using TF-IDF Vectorizer

# clf_tf = imb_make_pipeline(MinMaxScaler(), RandomUnderSampler(), LogisticRegression(),)
# create_model(clf_tf, X_tf, y, nsplits=5)

# clf_ct = imb_make_pipeline(MinMaxScaler(), RandomUnderSampler(), LogisticRegression(),)
# create_model(clf_ct, X_ct, y, nsplits=5)

# nb_tf = imb_make_pipeline(MinMaxScaler(), RandomUnderSampler(), MultinomialNB())
# create_model(nb_tf, X_tf, y, nsplits=5)

In [20]:
def create_final_model(model, X, target):
    
    # to time the individual model run time
    t = time.time()

    for col in target.columns:

        model.fit(X, target[col])

        dump(model, f"clf_{col}.joblib")

    print(f"\nTime Taken: {time.time()-t:.2f} seconds")    


<IPython.core.display.Javascript object>

In [21]:
clf_tf = imb_make_pipeline(
    preprocesser_tf, DenseTransformer(), RandomUnderSampler(), LogisticRegression(),
)
create_final_model(clf_tf, X, y)


Time Taken: 32.26 seconds


<IPython.core.display.Javascript object>