# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd

# visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# data stratifying and splitting
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# algorithms/models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# model performance evaluation and selection
from sklearn.metrics import (
    classification_report,
    f1_score,
    accuracy_score,
    roc_auc_score,
)

# performance check
import time
import warnings

warnings.filterwarnings("ignore")

# saving the model
from joblib import dump


# code formatter
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Load the three prepared CSVs in one pass: the cleaned posts with labels and
# engineered features, then the TF-IDF and the Count vectorized term matrices.
personality_data, tfidf_vectorized_data, count_vectorized_data = map(
    pd.read_csv,
    (
        "data_ekta/clean_data_3.csv",
        "data_ekta/tfidf_vectorized_data.csv",
        "data_ekta/count_vectorized_data.csv",
    ),
)

<IPython.core.display.Javascript object>

In [3]:
# previewing the first two rows of the cleaned dataset
# (MBTI type and its four binary labels, raw and cleaned posts,
# sentiment scores and the count-based features)
personality_data.head(2)

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,word_count,unique_words,avg_word_ct,post_length_var,med_char,med_word,upper,link_count,ellipses,img_count
0,INFJ,0,0,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,youtube moment youtube sportscenter top t...,0.997599,0.352861,0.292958,...,578,376,11.56,135.29,1.0,1.0,13,24,7,7
1,ENTP,1,0,1,0,'I'm finding the lack of me in these posts ver...,im finding lack post alarming sex boring posit...,0.99925,0.361035,0.349296,...,1194,596,23.88,187.4756,1.0,1.0,82,10,0,8


<IPython.core.display.Javascript object>

In [4]:
# previewing the first two rows of the TF-IDF vectorized dataset
# (one column per vocabulary term, values are TF-IDF weights)
tfidf_vectorized_data.head(2)

Unnamed: 0,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,across,...,yesterday,yet,youd,youll,young,younger,youre,youtube,youve,yup
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.030203,0.58208,0.051687,0.0
1,0.0,0.038337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.103384,0.031132,0.0,0.0


<IPython.core.display.Javascript object>

In [5]:
# previewing the first two rows of the Count vectorized dataset
# (one column per vocabulary term, values are raw term counts)
count_vectorized_data.head(2)

Unnamed: 0,ability,able,absolute,absolutely,abstract,accept,according,account,accurate,act,...,yes,yesterday,youd,youll,young,younger,youre,youtube,youve,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,16,1,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,1,0,0


<IPython.core.display.Javascript object>

In [6]:
# (rows, columns) of each dataset, printed in load order
for dataset in (personality_data, tfidf_vectorized_data, count_vectorized_data):
    print(dataset.shape)

(8675, 141)
(8675, 1469)
(8675, 1353)


<IPython.core.display.Javascript object>

### Modelling

In [7]:
# Engineered (non-vectorized) predictors, selected by name and kept in this
# exact order because they become the leading columns of X_tf / X_ct.
X = personality_data.loc[
    :,
    [
        # overall sentiment score
        "compound_sentiment",
        # S_<TAG>_med / S_<TAG>_std pairs
        # (presumably per-part-of-speech statistics - confirm against the
        # feature-engineering notebook)
        "S_ADJ_med",
        "S_ADJ_std",
        "S_ADP_med",
        "S_ADP_std",
        "S_ADV_med",
        "S_ADV_std",
        "S_CONJ_med",
        "S_CONJ_std",
        "S_DET_med",
        "S_DET_std",
        "S_NOUN_med",
        "S_NOUN_std",
        "S_NUM_med",
        "S_NUM_std",
        "S_PRT_med",
        "S_PRT_std",
        "S_PRON_med",
        "S_PRON_std",
        "S_VERB_med",
        "S_VERB_std",
        # punctuation / emoji counts
        "qm",
        "em",
        "colons",
        "emojis",
        # post length and vocabulary statistics
        "word_count",
        "unique_words",
        "avg_word_ct",
        "post_length_var",
        "med_char",
        "med_word",
        # casing, links and media
        "upper",
        "link_count",
        "ellipses",
        "img_count",
    ],
]

<IPython.core.display.Javascript object>

#### Setting predictors and target variable


In [8]:
# combining counts, sentiment score and TF-IDF vectorized data
X_tf = pd.concat([X, tfidf_vectorized_data], axis=1)

# combining counts, sentiment score and Count vectorized data
X_ct = pd.concat([X, count_vectorized_data], axis=1)

# The single "type" column is a multiclass label with class imbalance, so the
# problem is modelled as four independent binary targets (one per MBTI axis).
# The columns are selected by name rather than by position so that a reordered
# CSV cannot silently change which columns are used as targets.
y = personality_data[["is_Extrovert", "is_Sensing", "is_Thinking", "is_Judging"]]

print(X_tf.shape)
print(X_ct.shape)
print(y.shape)

(8675, 1504)
(8675, 1388)
(8675, 4)


<IPython.core.display.Javascript object>

In [9]:
def create_model(model, X, target, nsplits):
    """Cross-validate ``model`` once per target column and print the scores.

    For every column of ``target`` the rows of ``X`` are split with a
    stratified, shuffled K-fold (``random_state=1``). ``model`` is re-fitted on
    each training fold and scored on the held-out fold with ROC AUC (from the
    class-1 probability), accuracy and F1 (both from that probability rounded
    to a 0/1 label). The per-fold scores are averaged and printed, followed by
    a classification report for the final fold only. The total wall-clock time
    is printed at the end.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place, so after the call it holds the fit from the last fold
        of the last target column.
    X : pandas.DataFrame
        Feature matrix; rows are selected by position.
    target : pandas.DataFrame
        One binary (0/1) column per MBTI axis.
    nsplits : int
        Number of stratified folds.

    Returns
    -------
    None
        Results are only printed.
    """

    mbti_type = {
        "is_Extrovert": "Extrovert vs Introvert",
        "is_Sensing": "Sensing vs Intuition",
        "is_Thinking": "Thinking vs Feeling",
        "is_Judging": "Judging vs Perceiving",
    }

    kfold = StratifiedKFold(n_splits=nsplits, shuffle=True, random_state=1)

    # to time the whole run (all target columns and all folds)
    t = time.time()

    for col in target.columns:

        # fall back to the raw column name for targets without a display title
        print(f"\n{mbti_type.get(col, col)}")
        y = target[col]
        auc_list, acc_list, f1_list = [], [], []

        for train_index, test_index in kfold.split(X, y):

            # kfold yields positional indices, so both X and y must be indexed
            # with .iloc; plain y[...] is label-based and only happens to work
            # when the index is the default 0..n-1 range
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model.fit(X_train, y_train)

            # hard labels, used by the classification report below
            y_pred = model.predict(X_test)

            # probability of class 1
            preds = model.predict_proba(X_test)[:, 1]

            auc_list.append(roc_auc_score(y_test, preds))
            acc_list.append(accuracy_score(y_test, preds.round()))
            f1_list.append(f1_score(y_test, preds.round()))

        print(
            f"Avg AUC: {np.mean(auc_list):.2f}, Avg Accuracy: {np.mean(acc_list):.2f}, Avg F1: {np.mean(f1_list):.2f}"
        )
        # y_test / y_pred still hold the last fold here, so this report
        # describes that single fold, not the average over folds
        print(classification_report(y_test, y_pred))

    print(f"\nTime Taken: {time.time()-t:.2f} seconds")

<IPython.core.display.Javascript object>

### Logistic Regression

In [10]:
# Logistic regression on the TF-IDF feature set.
# Pipeline: scale every feature to [0, 1], randomly undersample so the classes
# are balanced, then fit the classifier.
# random_state pins the undersampling draw so the scores are reproducible.
clf_tf = imb_make_pipeline(
    MinMaxScaler(), RandomUnderSampler(random_state=1), LogisticRegression()
)
create_model(clf_tf, X_tf, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.68, Avg Accuracy: 0.63, Avg F1: 0.43
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      1335
           1       0.34      0.61      0.43       400

    accuracy                           0.63      1735
   macro avg       0.59      0.63      0.58      1735
weighted avg       0.73      0.63      0.66      1735


Sensing vs Intuition
Avg AUC: 0.67, Avg Accuracy: 0.63, Avg F1: 0.31
              precision    recall  f1-score   support

           0       0.92      0.62      0.74      1495
           1       0.21      0.64      0.32       240

    accuracy                           0.63      1735
   macro avg       0.56      0.63      0.53      1735
weighted avg       0.82      0.63      0.68      1735


Thinking vs Feeling
Avg AUC: 0.84, Avg Accuracy: 0.76, Avg F1: 0.74
              precision    recall  f1-score   support

           0       0.79      0.74      0.77       938
           1       0.72   

<IPython.core.display.Javascript object>

In [11]:
# Saving this model for app testing
# NOTE(review): create_model() fits `clf_tf` in place once per fold for every
# target column, so the object saved here holds only the most recent fit: the
# last CV fold of the last column of `y`, trained on that fold's training rows.
# It therefore predicts a single MBTI axis. Confirm this is what the app
# expects, or refit on the full data (one model per axis) before saving.

dump(clf_tf, "clf.joblib")

['clf.joblib']

<IPython.core.display.Javascript object>

In [12]:
# Logistic regression on the Count vectorized feature set.
# Same pipeline as the TF-IDF run: scale to [0, 1], randomly undersample so the
# classes are balanced, then fit the classifier.
# random_state pins the undersampling draw so the scores are reproducible.
clf_ct = imb_make_pipeline(
    MinMaxScaler(), RandomUnderSampler(random_state=1), LogisticRegression()
)
create_model(clf_ct, X_ct, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.68, Avg Accuracy: 0.63, Avg F1: 0.44
              precision    recall  f1-score   support

           0       0.86      0.64      0.73      1335
           1       0.35      0.65      0.45       400

    accuracy                           0.64      1735
   macro avg       0.60      0.64      0.59      1735
weighted avg       0.74      0.64      0.67      1735


Sensing vs Intuition
Avg AUC: 0.68, Avg Accuracy: 0.62, Avg F1: 0.31
              precision    recall  f1-score   support

           0       0.92      0.60      0.73      1495
           1       0.21      0.68      0.33       240

    accuracy                           0.61      1735
   macro avg       0.57      0.64      0.53      1735
weighted avg       0.82      0.61      0.67      1735


Thinking vs Feeling
Avg AUC: 0.84, Avg Accuracy: 0.76, Avg F1: 0.74
              precision    recall  f1-score   support

           0       0.79      0.74      0.76       938
           1       0.71   

<IPython.core.display.Javascript object>

### Naive Bayes

In [13]:
# Multinomial Naive Bayes on the TF-IDF feature set.
# MinMaxScaler maps the training features to [0, 1]; MultinomialNB needs
# non-negative inputs. random_state pins the undersampling draw so the scores
# are reproducible.
nb_tf = imb_make_pipeline(
    MinMaxScaler(), RandomUnderSampler(random_state=1), MultinomialNB()
)
create_model(nb_tf, X_tf, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.72, Avg Accuracy: 0.65, Avg F1: 0.47
              precision    recall  f1-score   support

           0       0.87      0.67      0.76      1335
           1       0.38      0.67      0.48       400

    accuracy                           0.67      1735
   macro avg       0.62      0.67      0.62      1735
weighted avg       0.76      0.67      0.69      1735


Sensing vs Intuition
Avg AUC: 0.72, Avg Accuracy: 0.65, Avg F1: 0.34
              precision    recall  f1-score   support

           0       0.93      0.65      0.76      1495
           1       0.24      0.69      0.35       240

    accuracy                           0.65      1735
   macro avg       0.58      0.67      0.56      1735
weighted avg       0.83      0.65      0.71      1735


Thinking vs Feeling
Avg AUC: 0.83, Avg Accuracy: 0.76, Avg F1: 0.74
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       938
           1       0.71   

<IPython.core.display.Javascript object>

In [14]:
# Multinomial Naive Bayes on the Count vectorized feature set.
# MinMaxScaler maps the training features to [0, 1]; MultinomialNB needs
# non-negative inputs. random_state pins the undersampling draw so the scores
# are reproducible.
nb_ct = imb_make_pipeline(
    MinMaxScaler(), RandomUnderSampler(random_state=1), MultinomialNB()
)
create_model(nb_ct, X_ct, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.71, Avg Accuracy: 0.64, Avg F1: 0.46
              precision    recall  f1-score   support

           0       0.86      0.65      0.74      1335
           1       0.36      0.66      0.47       400

    accuracy                           0.65      1735
   macro avg       0.61      0.65      0.60      1735
weighted avg       0.75      0.65      0.68      1735


Sensing vs Intuition
Avg AUC: 0.71, Avg Accuracy: 0.63, Avg F1: 0.34
              precision    recall  f1-score   support

           0       0.93      0.64      0.75      1495
           1       0.23      0.68      0.34       240

    accuracy                           0.64      1735
   macro avg       0.58      0.66      0.55      1735
weighted avg       0.83      0.64      0.70      1735


Thinking vs Feeling
Avg AUC: 0.83, Avg Accuracy: 0.75, Avg F1: 0.73
              precision    recall  f1-score   support

           0       0.77      0.75      0.76       938
           1       0.72   

<IPython.core.display.Javascript object>

### Random Forest

In [15]:
# Random forest on the TF-IDF feature set.
# Both the undersampler and the forest draw random numbers, so each gets a
# fixed random_state to make the reported scores reproducible.
rf_tf = imb_make_pipeline(
    MinMaxScaler(),
    RandomUnderSampler(random_state=1),
    RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1),
)
create_model(rf_tf, X_tf, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.68, Avg Accuracy: 0.63, Avg F1: 0.44
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      1335
           1       0.34      0.62      0.44       400

    accuracy                           0.63      1735
   macro avg       0.59      0.63      0.58      1735
weighted avg       0.73      0.63      0.66      1735


Sensing vs Intuition
Avg AUC: 0.67, Avg Accuracy: 0.61, Avg F1: 0.31
              precision    recall  f1-score   support

           0       0.92      0.64      0.75      1495
           1       0.23      0.67      0.34       240

    accuracy                           0.64      1735
   macro avg       0.58      0.65      0.55      1735
weighted avg       0.83      0.64      0.70      1735


Thinking vs Feeling
Avg AUC: 0.80, Avg Accuracy: 0.73, Avg F1: 0.72
              precision    recall  f1-score   support

           0       0.76      0.69      0.72       938
           1       0.67   

<IPython.core.display.Javascript object>

In [16]:
# Random forest on the Count vectorized feature set.
# Both the undersampler and the forest draw random numbers, so each gets a
# fixed random_state to make the reported scores reproducible.
rf_ct = imb_make_pipeline(
    MinMaxScaler(),
    RandomUnderSampler(random_state=1),
    RandomForestClassifier(n_estimators=100, max_depth=8, random_state=1),
)
create_model(rf_ct, X_ct, y, nsplits=5)


Extrovert vs Introvert
Avg AUC: 0.69, Avg Accuracy: 0.64, Avg F1: 0.45
              precision    recall  f1-score   support

           0       0.86      0.67      0.75      1335
           1       0.37      0.64      0.47       400

    accuracy                           0.66      1735
   macro avg       0.61      0.65      0.61      1735
weighted avg       0.75      0.66      0.69      1735


Sensing vs Intuition
Avg AUC: 0.67, Avg Accuracy: 0.62, Avg F1: 0.31
              precision    recall  f1-score   support

           0       0.91      0.61      0.73      1495
           1       0.20      0.61      0.30       240

    accuracy                           0.61      1735
   macro avg       0.55      0.61      0.52      1735
weighted avg       0.81      0.61      0.67      1735


Thinking vs Feeling
Avg AUC: 0.80, Avg Accuracy: 0.72, Avg F1: 0.71
              precision    recall  f1-score   support

           0       0.75      0.70      0.72       938
           1       0.67   

<IPython.core.display.Javascript object>

### Plotting ROC AUC Graphs

In [17]:
# from sklearn.metrics import roc_curve, auc

# fpr = dict()
# tpr = dict()
# roc_auc = dict()
# for i in range(2):
#     fpr[i], tpr[i], _ = roc_curve(y_test, pred)
#     roc_auc[i] = auc(fpr[i], tpr[i])

# print roc_auc_score(test, pred)
# plt.figure()
# plt.plot(fpr[1], tpr[1])
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("Receiver operating characteristic")
# plt.show()

<IPython.core.display.Javascript object>

In [18]:
# model_list = [lr_tf_model, lr_ct_model, mnb_tf_model, mnb_ct_model]
# model_names = ['LR TFIDF', 'LR Word ct','MNB TFIDF', 'MNB Word ct']
# plot_

<IPython.core.display.Javascript object>

### Final Model