# (MBTI) Myers-Briggs Personality Type Prediction

* Extroversion vs. Introversion
    * I - 0
    * E - 1 
    
* Sensing vs. Intuition 
    * N - 0 
    * S - 1
    
* Thinking vs. Feeling
    * F - 0
    * T - 1
    
* Judging vs. Perceiving
    * P - 0
    * J - 1 

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd

# visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# feature engineering
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
nltk.download("vader_lexicon")

# sentiment scoring
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# scikit
# vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# scaling to handle negative values (for Naive Bayes)
from sklearn.preprocessing import MinMaxScaler

# data stratifying and splitting
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# algorithms/models
# from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import Normalizer
from imblearn.pipeline import make_pipeline as imb_make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# model performance evaluation and selection
from sklearn.metrics import (
    classification_report,
    f1_score,
    accuracy_score,
    roc_auc_score,
)

# performance check
import time

from joblib import load

# code formatter
%load_ext nb_black

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\eshom\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


<IPython.core.display.Javascript object>

In [2]:
# reading the test dataset
df = pd.read_csv("../data/test_data.csv")

<IPython.core.display.Javascript object>

In [3]:
# checking top records
df.head(2)

Unnamed: 0,name,type,posts
0,Donald Trump,ESTP,My thoughts and prayers are with the @USMC cre...
1,Barack Obama,ENFJ,"Happy Hanukkah! Over these eight nights, we dr..."


<IPython.core.display.Javascript object>

In [4]:
def categorize_types(personality_data):

    personality_data["is_Extrovert"] = personality_data["type"].apply(
        lambda x: 1 if x[0] == "E" else 0
    )
    personality_data["is_Sensing"] = personality_data["type"].apply(
        lambda x: 1 if x[1] == "S" else 0
    )
    personality_data["is_Thinking"] = personality_data["type"].apply(
        lambda x: 1 if x[2] == "T" else 0
    )
    personality_data["is_Judging"] = personality_data["type"].apply(
        lambda x: 1 if x[3] == "J" else 0
    )

    # rearranging the dataframe columns
    personality_data = personality_data[
        ["type", "is_Extrovert", "is_Sensing", "is_Thinking", "is_Judging", "posts"]
    ]


#######################################################################################################3


def clean_posts(personality_data):

    # converting posts into lower case
    personality_data["clean_posts"] = personality_data["posts"].str.lower()

    # replacing ||| with space
    personality_data["clean_posts"] = personality_data["clean_posts"].str.replace(
        re.compile(r"\|\|\|"), ""
    )

    # replacing urls with domain name
    personality_data["clean_posts"] = personality_data["clean_posts"].str.replace(
        re.compile(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+)([\S])*"),
        lambda match: match.group(2),
    )

    # dropping emails
    personality_data["clean_posts"] = personality_data["clean_posts"].str.replace(
        re.compile(r"\S+@\S+"), ""
    )

    # dropping punctuations
    personality_data["clean_posts"] = personality_data["clean_posts"].str.replace(
        re.compile(r"[^a-z\s]"), ""
    )

    # dropping MBTIs mentioned in the posts. There are quite a few mention of these types in these posts.
    mbti = personality_data["type"].unique()
    for type_word in mbti:
        personality_data["clean_posts"] = personality_data["clean_posts"].str.replace(
            type_word.lower(), ""
        )

    # tag_posts will be a list of 50 lists. need it for word stats (per post for each user)
    # replacing urls with domain name
    personality_data["tag_posts"] = personality_data["posts"].str.replace(
        re.compile(r"https?:\/\/(www)?.?([A-Za-z_0-9-]+)([\S])*"),
        lambda match: match.group(2),
    )
    # replacing ||| with space
    personality_data["tag_posts"] = [
        post for post in personality_data["tag_posts"].str.split("\|\|\|")
    ]


#################################################################################################################


def sentiment_score(personality_data):

    analyzer = SentimentIntensityAnalyzer()

    nlp_sentiment_score = []

    for post in personality_data["clean_posts"]:
        score = analyzer.polarity_scores(post)["compound"]
        nlp_sentiment_score.append(score)

    personality_data["compound_sentiment"] = nlp_sentiment_score


###############################################################################################################


def pos_tagging(personality_data):

    personality_data["tagged_words"] = personality_data["tag_posts"].apply(
        lambda x: [nltk.pos_tag(word_tokenize(line)) for line in x]
    )

    # grouping pos tags based on stanford list
    tags_dict = {
        "ADJ": ["JJ", "JJR", "JJS"],
        "ADP": ["EX", "TO"],
        "ADV": ["RB", "RBR", "RBS", "WRB"],
        "CONJ": ["CC", "IN"],
        "DET": ["DT", "PDT", "WDT"],
        "NOUN": ["NN", "NNS", "NNP", "NNPS"],
        "NUM": ["CD"],
        "PRT": ["RP"],
        "PRON": ["PRP", "PRP$", "WP", "WP$"],
        "VERB": ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
        ".": ["#", "$", "''", "(", ")", ",", ".", ":"],
        "X": ["FW", "LS", "UH"],
    }

    def stanford_tag(x, tag):
        tags_list = [len([y for y in line if y[1] in tags_dict[col]]) for line in x]
        return tags_list

    for col in tags_dict.keys():
        personality_data["S_" + col + "_med"] = personality_data["tagged_words"].apply(
            lambda x: np.median(stanford_tag(x, col))
        )
        personality_data["S_" + col + "_std"] = personality_data["tagged_words"].apply(
            lambda x: np.std(stanford_tag(x, col))
        )


###############################################################################################################


def get_counts(personality_data):
    def unique_words(s):
        unique = set(s.split(" "))
        return len(unique)

    def emojis(post):
        # does not include emojis made purely from symbols, only :word:
        emoji_count = 0
        words = post.split()
        for e in words:
            if "http" not in e:
                if e.count(":") == 2:
                    emoji_count += 1
        return emoji_count

    def colons(post):
        # Includes colons used in emojis
        colon_count = 0
        words = post.split()
        for e in words:
            if "http" not in e:
                colon_count += e.count(":")
        return colon_count

    personality_data["qm"] = personality_data["posts"].apply(lambda s: s.count("?"))
    personality_data["em"] = personality_data["posts"].apply(lambda s: s.count("!"))
    personality_data["colons"] = personality_data["posts"].apply(colons)
    personality_data["emojis"] = personality_data["posts"].apply(emojis)
    personality_data["word_count"] = personality_data["posts"].apply(
        lambda s: s.count(" ") + 1
    )
    personality_data["unique_words"] = personality_data["posts"].apply(unique_words)
    personality_data["avg_word_ct"] = personality_data["word_count"].apply(
        lambda s: s / 50
    )
    personality_data["post_length_var"] = personality_data["posts"].apply(
        lambda x: np.var([len(post.split()) for post in x.split("|||")])
    )
    #     personality_data["med_char"] = personality_data["tagged_words"].apply(
    #         lambda x: np.median([len(i) for i in x]))
    #     personality_data["med_word"] = personality_data["tagged_words"].apply(
    #         lambda x: np.median([len(i.split()) for i in x]))
    personality_data["upper"] = personality_data["posts"].apply(
        lambda x: len([x for x in x.split() if x.isupper()])
    )
    personality_data["link_count"] = personality_data["posts"].apply(
        lambda s: s.count("http")
    )
    ellipses_count = [
        len(re.findall(r"\.\.\.\ ", posts)) for posts in personality_data["posts"]
    ]
    personality_data["ellipses"] = ellipses_count
    personality_data["img_count"] = [
        len(re.findall(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.png)", post))
        for post in personality_data["posts"]
    ]


#################################################################################################################


def vectorize(personality_data):

    tfidf_vectorizer = TfidfVectorizer(
        min_df=0.05, max_df=0.85, analyzer="word", ngram_range=(1, 2), max_features=1500
    )
    tfidf_words = tfidf_vectorizer.fit_transform(personality_data["clean_posts"])

    tfidf_vectorized_data = pd.DataFrame(
        data=tfidf_words.toarray(), columns=tfidf_vectorizer.get_feature_names()
    )
    return tfidf_vectorized_data

<IPython.core.display.Javascript object>

In [5]:
def prep_data(personality_data):

    t = time.time()

    categorize_types(personality_data)

    clean_posts(personality_data)

    sentiment_score(personality_data)

    pos_tagging(personality_data)

    get_counts(personality_data)

    tfidf_vectorized_data = vectorize(personality_data)

    features = personality_data[
        [
            "compound_sentiment",
            "S_ADJ_med",
            "S_ADJ_std",
            "S_ADP_med",
            "S_ADP_std",
            "S_ADV_med",
            "S_ADV_std",
            "S_CONJ_med",
            "S_CONJ_std",
            "S_DET_med",
            "S_DET_std",
            "S_NOUN_med",
            "S_NOUN_std",
            "S_NUM_med",
            "S_NUM_std",
            "S_PRT_med",
            "S_PRT_std",
            "S_PRON_med",
            "S_PRON_std",
            "S_VERB_med",
            "S_VERB_std",
            "qm",
            "em",
            "colons",
            "emojis",
            "word_count",
            "unique_words",
            "avg_word_ct",
            "post_length_var",
            #         "med_char",
            #         "med_word",
            "upper",
            "link_count",
            "ellipses",
            "img_count",
        ]
    ]

    X = pd.concat([features, tfidf_vectorized_data], axis=1)
    y = personality_data.iloc[:, 3:7]

    print(f"Total Preprocessing Time: {time.time()-t} seconds")

    return X, y

<IPython.core.display.Javascript object>

### Modelling

In [6]:
def combine_classes(y_pred1, y_pred2, y_pred3, y_pred4):
    
    combined = []
    for i in range(len(y_pred1)):
        combined.append(
            str(y_pred1[i]) + str(y_pred2[i]) + str(y_pred3[i]) + str(y_pred4[i])
        )
    
    result = trace_back(combined)
    return result
    

def trace_back(combined):
        
    type_list = [
    {"0": "I", "1": "E"},
    {"0": "N", "1": "S"},
    {"0": "F", "1": "T"},
    {"0": "P", "1": "J"},
    ]

    result = []
    for num in combined:
        s = ""
        for i in range(len(num)):
            s += type_list[i][num[i]]
        result.append(s)
        
    return result


<IPython.core.display.Javascript object>

In [7]:
def predict(path_to_csv):

    df = pd.read_csv(path_to_csv)

    X, y = prep_data(df)

    # loading the 4 models
    EorI_model = load("clf_is_Extrovert.joblib")
    SorN_model = load("clf_is_Sensing.joblib")
    TorF_model = load("clf_is_Thinking.joblib")
    JorP_model = load("clf_is_Judging.joblib")

    # predicting
    EorI_pred = EorI_model.predict(X)
    print("y_true", y["is_Extrovert"].values)
    print("preds", EorI_pred)

    SorN_pred = SorN_model.predict(X)
    print("y_true", y["is_Sensing"].values)
    print("preds", SorN_pred)

    TorF_pred = TorF_model.predict(X)
    print("y_true", y["is_Thinking"].values)
    print("preds", TorF_pred)

    JorP_pred = JorP_model.predict(X)
    print("y_true", y["is_Judging"].values)
    print("preds", JorP_pred)

    # combining the predictions from the 4 models
    result = combine_classes(EorI_pred, SorN_pred, TorF_pred, JorP_pred)

    return result

<IPython.core.display.Javascript object>

In [8]:
if __name__ == "__main__":

    predictions = predict("../data/test_data.csv")
    y_truth = pd.read_csv("../data/test_data.csv")["type"].values
    
    print("\n")
    print(y_truth)
    print(predictions)


Total Preprocessing Time: 0.9983274936676025 seconds
y_true [1 1 0 0 1 0 1 0 0 0 0]
preds [0 0 0 0 0 1 0 0 0 0 0]
y_true [1 0 1 0 1 0 0 1 0 0 0]
preds [0 0 1 0 1 0 0 1 0 0 0]
y_true [1 0 0 1 0 0 0 0 0 1 1]
preds [1 1 1 1 1 1 1 1 1 1 1]
y_true [0 1 0 1 0 0 0 1 1 1 1]
preds [1 1 0 1 0 0 0 0 0 0 0]


['ESTP' 'ENFJ' 'ISFP' 'INTJ' 'ESFP' 'INFP' 'ENFP' 'ISFJ' 'INFJ' 'INTJ'
 'INTJ']
['INTJ', 'INTJ', 'ISTP', 'INTJ', 'ISTP', 'ENTP', 'INTP', 'ISTP', 'INTP', 'INTP', 'INTP']


<IPython.core.display.Javascript object>

In [9]:
df["result"] = predictions
df

Unnamed: 0,name,type,posts,result
0,Donald Trump,ESTP,My thoughts and prayers are with the @USMC cre...,INTJ
1,Barack Obama,ENFJ,"Happy Hanukkah! Over these eight nights, we dr...",INTJ
2,Kanye West,ISFP,@jarrodspector @TheCherShow the dynamics of Ch...,ISTP
3,Arnold Schwarzenegger,INTJ,Fantastic to spend some time with you teaming ...,INTJ
4,Justin Bieber,ESFP,All love over here Aaron. You got my support||...,ISTP
5,Kina Grannis,INFP,happiest of birthdays to this sweetest human i...,ENTP
6,Kristen Bell,ENFP,NEW BOOK: “Congo Stories” shares the voices of...,INTP
7,Kim Kardashian,ISFJ,Very calm except for when she wants food lol S...,ISTP
8,Lady Gaga,INFJ,#Enigma #GagaVegas https://t.co/lGl7cxSCAH|||#...,INTP
9,Elon Musk,INTJ,Tracking shot of Falcon water landing https://...,INTP


<IPython.core.display.Javascript object>